Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions hack/test-holmes-investigate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/bin/bash
# Test script for the Holmes investigation admin API endpoint.
#
# Prerequisites:
# 1. VPN connected to the dev environment
# 2. secrets/ folder generated: SECRET_SA_ACCOUNT_NAME=rharosecretsdev make secrets
# 3. AKS kubeconfig generated: make aks.kubeconfig
# 4. A test cluster created via: CLUSTER=<name> go run ./hack/cluster create
# 5. Local RP running with Hive enabled (see below)
#
# Usage:
# ./hack/test-holmes-investigate.sh <cluster-name> [question]
#
# Examples:
# ./hack/test-holmes-investigate.sh haowang-holmes-test
# ./hack/test-holmes-investigate.sh haowang-holmes-test "why is pod X crashing?"
# ./hack/test-holmes-investigate.sh haowang-holmes-test "check node memory usage"
#
# To start the local RP with Hive + Holmes enabled:
#
# source env && source secrets/env
# export HIVE_KUBE_CONFIG_PATH=$(realpath aks.kubeconfig)
# export ARO_INSTALL_VIA_HIVE=true
# export ARO_ADOPT_BY_HIVE=true
# export ARO_PODMAN_SOCKET="unix://$(podman machine inspect --format '{{.ConnectionInfo.PodmanSocket.Path}}')"
# export HOLMES_IMAGE="quay.io/haoran/holmesgpt:latest"
# export HOLMES_AZURE_API_KEY="<your-azure-openai-key>"
# export HOLMES_AZURE_API_BASE="<your-azure-openai-endpoint>"
# export HOLMES_AZURE_API_VERSION="2025-04-01-preview"
# export HOLMES_MODEL="azure/gpt-5.2"
# make runlocal-rp

set -euo pipefail

CLUSTER_NAME="${1:-}"
QUESTION="${2:-what is the cluster health status?}"

if [[ -z "$CLUSTER_NAME" ]]; then
echo "Usage: $0 <cluster-name> [question]"
echo ""
echo "Examples:"
echo " $0 haowang-holmes-test"
echo " $0 haowang-holmes-test 'why is pod X crashing?'"
exit 1
fi

# Source env if not already loaded
if [[ -z "${AZURE_SUBSCRIPTION_ID:-}" ]]; then
if [[ -f env ]] && [[ -f secrets/env ]]; then
source env
source secrets/env
else
echo "Error: AZURE_SUBSCRIPTION_ID not set and env files not found."
echo "Run from the repo root, or source env && source secrets/env first."
exit 1
fi
fi

RESOURCEGROUP="${RESOURCEGROUP:-v4-eastus}"
RP_URL="https://localhost:8443"
API_PATH="/admin/subscriptions/${AZURE_SUBSCRIPTION_ID}/resourcegroups/${RESOURCEGROUP}/providers/Microsoft.RedHatOpenShift/openShiftClusters/${CLUSTER_NAME}/investigate"

echo "============================================"
echo " Holmes Investigation Test"
echo "============================================"
echo " Cluster: ${CLUSTER_NAME}"
echo " RG: ${RESOURCEGROUP}"
echo " Question: ${QUESTION}"
echo " Endpoint: POST ${RP_URL}${API_PATH}"
echo "============================================"
echo ""

# Check RP is running
if ! curl -sk -o /dev/null -w '' "${RP_URL}/healthz" 2>/dev/null; then
echo "Error: Local RP is not running at ${RP_URL}"
echo "Start it with: make runlocal-rp (see header comments for full env setup)"
exit 1
fi

echo "Sending investigation request..."
echo "Streaming results (this may take 1-5 minutes):"
echo "--------------------------------------------"

curl -sk --no-buffer -X POST \
"${RP_URL}${API_PATH}" \
-H "Content-Type: application/json" \
-d "$(jq -n --arg q "${QUESTION}" '{question: $q}')"

echo ""
echo "--------------------------------------------"
echo "Investigation complete."
9 changes: 5 additions & 4 deletions pkg/cluster/kubeconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@ import (
// kubeconfig for the ARO service, based on the admin kubeconfig found in the
// graph.
func (m *manager) generateAROServiceKubeconfig(pg graph.PersistedGraph) ([]byte, error) {
return generateKubeconfig(pg, "system:aro-service", []string{"system:masters"}, installer.TenYears, true)
return GenerateKubeconfig(pg, "system:aro-service", []string{"system:masters"}, installer.TenYears, true)
}

// generateAROSREKubeconfig generates additional admin credentials and a
// kubeconfig for ARO SREs, based on the admin kubeconfig found in the graph.
func (m *manager) generateAROSREKubeconfig(pg graph.PersistedGraph) ([]byte, error) {
return generateKubeconfig(pg, "system:aro-sre", nil, installer.TenYears, true)
return GenerateKubeconfig(pg, "system:aro-sre", nil, installer.TenYears, true)
}

// checkUserAdminKubeconfigUpdated checks if the user kubeconfig is
Expand Down Expand Up @@ -82,7 +82,7 @@ func (m *manager) checkUserAdminKubeconfigUpdated() bool {
// generateUserAdminKubeconfig generates additional admin credentials and a
// kubeconfig for ARO User, based on the admin kubeconfig found in the graph.
func (m *manager) generateUserAdminKubeconfig(pg graph.PersistedGraph) ([]byte, error) {
return generateKubeconfig(pg, "system:admin", nil, installer.OneYear, false)
return GenerateKubeconfig(pg, "system:admin", nil, installer.OneYear, false)
}

func (m *manager) generateKubeconfigs(ctx context.Context) error {
Expand Down Expand Up @@ -127,7 +127,8 @@ func (m *manager) generateKubeconfigs(ctx context.Context) error {
return err
}

func generateKubeconfig(pg graph.PersistedGraph, commonName string, organization []string, validity time.Duration, internal bool) ([]byte, error) {
// GenerateKubeconfig generates a kubeconfig with a client certificate signed by the cluster CA.
func GenerateKubeconfig(pg graph.PersistedGraph, commonName string, organization []string, validity time.Duration, internal bool) ([]byte, error) {
var ca *installer.AdminKubeConfigSignerCertKey
var adminInternalClient *installer.AdminInternalClient
err := pg.GetByName(false, "*tls.AdminKubeConfigSignerCertKey", &ca)
Expand Down
130 changes: 130 additions & 0 deletions pkg/cluster/kubeconfig_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,136 @@ func TestGenerateAROServiceKubeconfig(t *testing.T) {
}
}

func TestGenerateDiagnosticsKubeconfig(t *testing.T) {
validCaKey, validCaCerts, err := utiltls.GenerateKeyAndCertificate("validca", nil, nil, true, false)
if err != nil {
t.Fatal(err)
}
encodedKey, err := utilpem.Encode(validCaKey)
if err != nil {
t.Fatal(err)
}
encodedCert, err := utilpem.Encode(validCaCerts[0])
if err != nil {
t.Fatal(err)
}
ca := &installer.AdminKubeConfigSignerCertKey{
SelfSignedCertKey: installer.SelfSignedCertKey{
CertKey: installer.CertKey{
CertRaw: encodedCert,
KeyRaw: encodedKey,
},
},
}

apiserverURL := "https://api-int.hash.rg.mydomain:6443"
clusterName := "api-hash-rg-mydomain:6443"
diagnosticsName := "system:aro-diagnostics"

adminInternalClient := &installer.AdminInternalClient{}
adminInternalClient.Config = &clientcmdv1.Config{
Clusters: []clientcmdv1.NamedCluster{
{
Name: clusterName,
Cluster: clientcmdv1.Cluster{
Server: apiserverURL,
CertificateAuthorityData: []byte("internal API Cert"),
},
},
},
AuthInfos: []clientcmdv1.NamedAuthInfo{},
Contexts: []clientcmdv1.NamedContext{
{
Name: diagnosticsName,
Context: clientcmdv1.Context{
Cluster: clusterName,
AuthInfo: diagnosticsName,
},
},
},
CurrentContext: diagnosticsName,
}

pg := graph.PersistedGraph{}

caData, err := json.Marshal(ca)
if err != nil {
t.Fatal(err)
}
clientData, err := json.Marshal(adminInternalClient)
if err != nil {
t.Fatal(err)
}
pg["*kubeconfig.AdminInternalClient"] = clientData
pg["*tls.AdminKubeConfigSignerCertKey"] = caData

// Generate a 1-hour kubeconfig for system:aro-diagnostics
aroDiagnosticsClient, err := GenerateKubeconfig(pg, diagnosticsName, nil, time.Hour, true)
if err != nil {
t.Fatal(err)
}

var got *clientcmdv1.Config
err = yaml.Unmarshal(aroDiagnosticsClient, &got)
if err != nil {
t.Fatal(err)
}

innerpem := string(got.AuthInfos[0].AuthInfo.ClientCertificateData) + string(got.AuthInfos[0].AuthInfo.ClientKeyData)
_, innercert, err := utilpem.Parse([]byte(innerpem))
if err != nil {
t.Fatal(err)
}

err = innercert[0].CheckSignatureFrom(validCaCerts[0])
if err != nil {
t.Fatal(err)
}

issuer := innercert[0].Issuer.String()
if issuer != "CN=validca" {
t.Error(issuer)
}

subject := innercert[0].Subject.String()
if subject != "CN=system:aro-diagnostics" {
t.Error(subject)
}

// Verify no organization (no system:masters group)
if len(innercert[0].Subject.Organization) != 0 {
t.Errorf("expected no organization, got %v", innercert[0].Subject.Organization)
}

// Verify ~1 hour validity (not 10 years)
expectedExpiry := time.Now().Add(time.Hour)
if innercert[0].NotAfter.After(expectedExpiry.Add(5 * time.Minute)) {
t.Errorf("certificate expires too far in the future: %v", innercert[0].NotAfter)
}
if innercert[0].NotAfter.Before(expectedExpiry.Add(-5 * time.Minute)) {
t.Errorf("certificate expires too soon: %v", innercert[0].NotAfter)
}

keyUsage := innercert[0].KeyUsage
expectedKeyUsage := x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature
if keyUsage != expectedKeyUsage {
t.Error("Invalid keyUsage.")
}

// Verify internal URL is preserved (not rewritten to external)
if got.Clusters[0].Cluster.Server != apiserverURL {
t.Errorf("expected server %s, got %s", apiserverURL, got.Clusters[0].Cluster.Server)
}

// validate the rest of the struct
got.AuthInfos = []clientcmdv1.NamedAuthInfo{}
want := adminInternalClient.Config

if !reflect.DeepEqual(got, want) {
t.Fatal(cmp.Diff(got, want))
}
}

func TestGenerateUserAdminKubeconfig(t *testing.T) {
validCaKey, validCaCerts, err := utiltls.GenerateKeyAndCertificate("validca", nil, nil, true, false)
if err != nil {
Expand Down
124 changes: 124 additions & 0 deletions pkg/frontend/admin_openshiftcluster_investigate.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package frontend

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"context"
"encoding/json"
"fmt"
"net/http"
"path/filepath"
"strings"
"sync/atomic"

"github.com/go-chi/chi/v5"
"github.com/sirupsen/logrus"

"github.com/Azure/ARO-RP/pkg/api"
"github.com/Azure/ARO-RP/pkg/database/cosmosdb"
"github.com/Azure/ARO-RP/pkg/frontend/middleware"
"github.com/Azure/ARO-RP/pkg/util/holmes"
)

type investigateRequest struct {
Question string `json:"question"`
}

func (f *frontend) postAdminOpenShiftClusterInvestigate(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
log := ctx.Value(middleware.ContextKeyLog).(*logrus.Entry)
r.URL.Path = filepath.Dir(r.URL.Path)

err := f._postAdminOpenShiftClusterInvestigate(ctx, r, log, w)
if err != nil {
// Only set Content-Type and call adminReply on error, since on success
// the response was already streamed as text/plain by InvestigateCluster.
adminReply(log, w, nil, nil, err)
}
}

func (f *frontend) _postAdminOpenShiftClusterInvestigate(ctx context.Context, r *http.Request, log *logrus.Entry, w http.ResponseWriter) error {
resType, resName, resGroupName := chi.URLParam(r, "resourceType"), chi.URLParam(r, "resourceName"), chi.URLParam(r, "resourceGroupName")

// Parse request body from context (middleware buffers the body).
body := r.Context().Value(middleware.ContextKeyBody).([]byte)
var req investigateRequest
err := json.Unmarshal(body, &req)
if err != nil {
return api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidRequestContent, "", fmt.Sprintf("The request body could not be parsed: %v.", err))
}

if req.Question == "" {
return api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidParameter, "question", "The question parameter is required and must be non-empty.")
}

const maxQuestionLength = 1000
if len(req.Question) > maxQuestionLength {
return api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidParameter, "question", fmt.Sprintf("The question must not exceed %d characters.", maxQuestionLength))
}

holmesConfig := holmes.NewHolmesConfigFromEnv()
if err := holmesConfig.Validate(); err != nil {
return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", fmt.Sprintf("Holmes configuration error: %v", err))
}

// Rate limit: reject if too many concurrent investigations are running.
// Use CAS loop so rejected requests don't temporarily inflate the counter.
maxConcurrent := int64(holmesConfig.MaxConcurrentInvestigations)
for {
current := atomic.LoadInt64(&f.activeInvestigations)
if current >= maxConcurrent {
return api.NewCloudError(http.StatusTooManyRequests, api.CloudErrorCodeThrottlingLimitExceeded, "", fmt.Sprintf("Too many concurrent investigations (%d). Please try again later.", holmesConfig.MaxConcurrentInvestigations))
}
if atomic.CompareAndSwapInt64(&f.activeInvestigations, current, current+1) {
break
}
}
defer atomic.AddInt64(&f.activeInvestigations, -1)

resourceID := strings.TrimPrefix(r.URL.Path, "/admin")

dbOpenShiftClusters, err := f.dbGroup.OpenShiftClusters()
if err != nil {
return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", err.Error())
}

doc, err := dbOpenShiftClusters.Get(ctx, resourceID)
switch {
case cosmosdb.IsErrorStatusCode(err, http.StatusNotFound):
return api.NewCloudError(http.StatusNotFound, api.CloudErrorCodeResourceNotFound, "", fmt.Sprintf("The Resource '%s/%s' under resource group '%s' was not found.", resType, resName, resGroupName))
case err != nil:
return err
}

if f.hiveClusterManager == nil {
return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", "hive is not enabled")
}

hiveNamespace := doc.OpenShiftCluster.Properties.HiveProfile.Namespace
if hiveNamespace == "" {
return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", "cluster does not have a Hive namespace configured")
}

// Generate a short-lived (1h) read-only kubeconfig for the diagnostics identity.
// This uses the cluster CA from the persisted graph to sign a fresh client cert,
// then converts to the external API endpoint since Hive cannot resolve api-int.*.
kubeconfig, err := f.generateDiagnosticsKubeconfig(ctx, log, doc)
if err != nil {
return fmt.Errorf("failed to generate diagnostics kubeconfig: %w", err)
}

log.Infof("starting Holmes investigation for cluster %s with question: %s", resourceID, req.Question)

// Set Content-Type before streaming begins. Once bytes are written to w,
// the response is committed and errors cannot be reported via adminReply.
w.Header().Set("Content-Type", "text/plain")

err = f.hiveClusterManager.InvestigateCluster(ctx, hiveNamespace, kubeconfig, holmesConfig, req.Question, w)
if err != nil {
return fmt.Errorf("failed to investigate cluster: %w", err)
}

return nil
}
Loading
Loading