-
Notifications
You must be signed in to change notification settings - Fork 194
feat: add Holmes investigation admin API endpoint (ARO-25791) #4754
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 5 commits
0834253
a2b783a
717b3f0
3184970
f3a7829
13cc9c2
b67adfa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,91 @@ | ||
#!/bin/bash
# Test script for the Holmes investigation admin API endpoint.
#
# Sends a question to the local RP's admin "investigate" endpoint for the given
# cluster and prints the response to stdout as curl receives it.
#
# Prerequisites:
# 1. VPN connected to the dev environment
# 2. secrets/ folder generated: SECRET_SA_ACCOUNT_NAME=rharosecretsdev make secrets
# 3. AKS kubeconfig generated: make aks.kubeconfig
# 4. A test cluster created via: CLUSTER=<name> go run ./hack/cluster create
# 5. Local RP running with Hive enabled (see below)
#
# Usage:
# ./hack/test-holmes-investigate.sh <cluster-name> [question]
#
# Arguments:
# <cluster-name>  Name of the cluster to investigate (required).
# [question]      Question to ask; defaults to "what is the cluster health status?".
#
# Environment:
# AZURE_SUBSCRIPTION_ID  Read from the environment, or loaded from ./env and
#                        ./secrets/env when unset.
# RESOURCEGROUP          Resource group of the cluster; defaults to "v4-eastus".
#
# Examples:
# ./hack/test-holmes-investigate.sh haowang-holmes-test
# ./hack/test-holmes-investigate.sh haowang-holmes-test "why is pod X crashing?"
# ./hack/test-holmes-investigate.sh haowang-holmes-test "check node memory usage"
#
# To start the local RP with Hive + Holmes enabled:
#
# source env && source secrets/env
# export HIVE_KUBE_CONFIG_PATH=$(realpath aks.kubeconfig)
# export ARO_INSTALL_VIA_HIVE=true
# export ARO_ADOPT_BY_HIVE=true
# export ARO_PODMAN_SOCKET="unix://$(podman machine inspect --format '{{.ConnectionInfo.PodmanSocket.Path}}')"
# export HOLMES_IMAGE="quay.io/haoran/holmesgpt:latest"
# export HOLMES_AZURE_API_KEY="<your-azure-openai-key>"
# export HOLMES_AZURE_API_BASE="<your-azure-openai-endpoint>"
# export HOLMES_AZURE_API_VERSION="2025-04-01-preview"
# export HOLMES_MODEL="azure/gpt-5.2"
# make runlocal-rp

set -euo pipefail

CLUSTER_NAME="${1:-}"
QUESTION="${2:-what is the cluster health status?}"

if [[ -z "$CLUSTER_NAME" ]]; then
    echo "Usage: $0 <cluster-name> [question]"
    echo ""
    echo "Examples:"
    echo " $0 haowang-holmes-test"
    echo " $0 haowang-holmes-test 'why is pod X crashing?'"
    exit 1
fi

# json_escape prints its first argument escaped for use inside a JSON string
# literal. Backslash must be handled first so the escapes added afterwards are
# not escaped a second time. Only backslash, double quote, newline, carriage
# return and tab are handled; other control characters are passed through.
json_escape() {
    local s="$1"
    s=${s//\\/\\\\}
    s=${s//\"/\\\"}
    s=${s//$'\n'/\\n}
    s=${s//$'\r'/\\r}
    s=${s//$'\t'/\\t}
    printf '%s' "$s"
}

# Source env if not already loaded
if [[ -z "${AZURE_SUBSCRIPTION_ID:-}" ]]; then
    if [[ -f env ]] && [[ -f secrets/env ]]; then
        source env
        source secrets/env
    else
        echo "Error: AZURE_SUBSCRIPTION_ID not set and env files not found."
        echo "Run from the repo root, or source env && source secrets/env first."
        exit 1
    fi
fi

RESOURCEGROUP="${RESOURCEGROUP:-v4-eastus}"
RP_URL="https://localhost:8443"
API_PATH="/admin/subscriptions/${AZURE_SUBSCRIPTION_ID}/resourcegroups/${RESOURCEGROUP}/providers/Microsoft.RedHatOpenShift/openShiftClusters/${CLUSTER_NAME}/investigate"

echo "============================================"
echo " Holmes Investigation Test"
echo "============================================"
echo " Cluster: ${CLUSTER_NAME}"
echo " RG: ${RESOURCEGROUP}"
echo " Question: ${QUESTION}"
echo " Endpoint: POST ${RP_URL}${API_PATH}"
echo "============================================"
echo ""

# Check RP is running (-k: the local RP is reached over HTTPS without
# certificate verification).
if ! curl -sk -o /dev/null -w '' "${RP_URL}/healthz" 2>/dev/null; then
    echo "Error: Local RP is not running at ${RP_URL}"
    echo "Start it with: make runlocal-rp (see header comments for full env setup)"
    exit 1
fi

echo "Sending investigation request..."
echo "Streaming results (this may take 1-5 minutes):"
echo "--------------------------------------------"

# Build the JSON body with the question escaped, so quotes, backslashes or
# newlines in the question cannot break or alter the request document.
REQUEST_BODY="{\"question\": \"$(json_escape "${QUESTION}")\"}"

curl -sk --no-buffer -X POST \
    "${RP_URL}${API_PATH}" \
    -H "Content-Type: application/json" \
    -d "${REQUEST_BODY}"

echo ""
echo "--------------------------------------------"
echo "Investigation complete."
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,117 @@ | ||||||||||||||||||||||||||||||||||
| package frontend | ||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||
| // Copyright (c) Microsoft Corporation. | ||||||||||||||||||||||||||||||||||
| // Licensed under the Apache License 2.0. | ||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||
| import ( | ||||||||||||||||||||||||||||||||||
| "context" | ||||||||||||||||||||||||||||||||||
| "encoding/json" | ||||||||||||||||||||||||||||||||||
| "fmt" | ||||||||||||||||||||||||||||||||||
| "net/http" | ||||||||||||||||||||||||||||||||||
| "path/filepath" | ||||||||||||||||||||||||||||||||||
| "strings" | ||||||||||||||||||||||||||||||||||
| "sync/atomic" | ||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||
| "github.com/go-chi/chi/v5" | ||||||||||||||||||||||||||||||||||
| "github.com/sirupsen/logrus" | ||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||
| "github.com/Azure/ARO-RP/pkg/api" | ||||||||||||||||||||||||||||||||||
| "github.com/Azure/ARO-RP/pkg/database/cosmosdb" | ||||||||||||||||||||||||||||||||||
| "github.com/Azure/ARO-RP/pkg/frontend/middleware" | ||||||||||||||||||||||||||||||||||
| "github.com/Azure/ARO-RP/pkg/util/holmes" | ||||||||||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||
// investigateRequest is the JSON request body decoded by the admin
// investigate handler.
type investigateRequest struct {
	// Question is read from the "question" key of the request body. The
	// handler rejects the request when it is empty or longer than 1000 bytes.
	Question string `json:"question"`
}
|
|
||||||||||||||||||||||||||||||||||
| func (f *frontend) postAdminOpenShiftClusterInvestigate(w http.ResponseWriter, r *http.Request) { | ||||||||||||||||||||||||||||||||||
| ctx := r.Context() | ||||||||||||||||||||||||||||||||||
| log := ctx.Value(middleware.ContextKeyLog).(*logrus.Entry) | ||||||||||||||||||||||||||||||||||
| r.URL.Path = filepath.Dir(r.URL.Path) | ||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||
| err := f._postAdminOpenShiftClusterInvestigate(ctx, r, log, w) | ||||||||||||||||||||||||||||||||||
| if err != nil { | ||||||||||||||||||||||||||||||||||
| // Only set Content-Type and call adminReply on error, since on success | ||||||||||||||||||||||||||||||||||
| // the response was already streamed as text/plain by InvestigateCluster. | ||||||||||||||||||||||||||||||||||
| adminReply(log, w, nil, nil, err) | ||||||||||||||||||||||||||||||||||
wanghaoran1988 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
wanghaoran1988 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||
| func (f *frontend) _postAdminOpenShiftClusterInvestigate(ctx context.Context, r *http.Request, log *logrus.Entry, w http.ResponseWriter) error { | ||||||||||||||||||||||||||||||||||
| resType, resName, resGroupName := chi.URLParam(r, "resourceType"), chi.URLParam(r, "resourceName"), chi.URLParam(r, "resourceGroupName") | ||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||
| // Parse request body from context (middleware buffers the body). | ||||||||||||||||||||||||||||||||||
| body := r.Context().Value(middleware.ContextKeyBody).([]byte) | ||||||||||||||||||||||||||||||||||
| var req investigateRequest | ||||||||||||||||||||||||||||||||||
| err := json.Unmarshal(body, &req) | ||||||||||||||||||||||||||||||||||
| if err != nil { | ||||||||||||||||||||||||||||||||||
| return api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidRequestContent, "", fmt.Sprintf("The request body could not be parsed: %v.", err)) | ||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||
| if req.Question == "" { | ||||||||||||||||||||||||||||||||||
| return api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidParameter, "question", "The question parameter is required and must be non-empty.") | ||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||
| const maxQuestionLength = 1000 | ||||||||||||||||||||||||||||||||||
| if len(req.Question) > maxQuestionLength { | ||||||||||||||||||||||||||||||||||
| return api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidParameter, "question", fmt.Sprintf("The question must not exceed %d characters.", maxQuestionLength)) | ||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||
wanghaoran1988 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||||||||||||||||||||||
| holmesConfig := holmes.NewHolmesConfigFromEnv() | ||||||||||||||||||||||||||||||||||
| if err := holmesConfig.Validate(); err != nil { | ||||||||||||||||||||||||||||||||||
| return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", fmt.Sprintf("Holmes configuration error: %v", err)) | ||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||
| // Rate limit: reject if too many concurrent investigations are running. | ||||||||||||||||||||||||||||||||||
| current := atomic.AddInt64(&f.activeInvestigations, 1) | ||||||||||||||||||||||||||||||||||
| defer atomic.AddInt64(&f.activeInvestigations, -1) | ||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||
| } | |
| // Rate limit: reject if too many concurrent investigations are running. | |
| current := atomic.AddInt64(&f.activeInvestigations, 1) | |
| defer atomic.AddInt64(&f.activeInvestigations, -1) | |
| maxConcurrentInvestigations := int64(holmesConfig.MaxConcurrentInvestigations) | |
| for { | |
| current := atomic.LoadInt64(&f.activeInvestigations) | |
| if current >= maxConcurrentInvestigations { | |
| return api.NewCloudError(http.StatusTooManyRequests, api.CloudErrorCodeThrottlingLimitExceeded, "", fmt.Sprintf("Too many concurrent investigations (%d). Please try again later.", holmesConfig.MaxConcurrentInvestigations)) | |
| } | |
| if atomic.CompareAndSwapInt64(&f.activeInvestigations, current, current+1) { | |
| break | |
| } | |
| } | |
| defer atomic.AddInt64(&f.activeInvestigations, -1) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed. Now uses a CAS loop (LoadInt64 + CompareAndSwapInt64) so rejected requests don't temporarily inflate the counter.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Already addressed in a previous commit.
Uh oh!
There was an error while loading. Please reload this page.