From 0627575f80207c259e1b68ce372212a7c4a32929 Mon Sep 17 00:00:00 2001 From: Anton Asserzon Date: Thu, 26 Mar 2026 17:41:08 +0100 Subject: [PATCH 01/14] add apiserver pod health check and apiserver healthz endpoint check to control plane resize pre-flight checks --- ...penshiftcluster_vmresize_pre_validation.go | 85 +++++- ...iftcluster_vmresize_pre_validation_test.go | 247 ++++++++++++++++++ pkg/frontend/adminactions/kubeactions.go | 37 ++- pkg/util/mocks/adminactions/azureactions.go | 5 +- pkg/util/mocks/adminactions/kubeactions.go | 18 +- 5 files changed, 382 insertions(+), 10 deletions(-) diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go index ed3257689e4..33b6e2bbb6f 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go @@ -16,6 +16,8 @@ import ( "github.com/go-chi/chi/v5" "github.com/sirupsen/logrus" + corev1 "k8s.io/api/core/v1" + configv1 "github.com/openshift/api/config/v1" operatorv1 "github.com/openshift/api/operator/v1" @@ -109,6 +111,7 @@ func (f *frontend) _getPreResizeControlPlaneVMsValidation( wg.Go(func() { collect(f.validateVMSKU(ctx, doc, subscriptionDoc, desiredVMSize, log)) }) wg.Go(func() { collect(validateAPIServerHealth(ctx, k)) }) + wg.Go(func() { collect(validateAPIServerPods(ctx, k)) }) wg.Go(func() { collect(validateEtcdHealth(ctx, k)) }) wg.Go(func() { collect(validateClusterSP(ctx, k)) }) @@ -219,9 +222,17 @@ func quotaCheckDisabled(_ context.Context, _ env.Interface, _ *api.SubscriptionD return nil } -// validateAPIServerHealth verifies that the kube-apiserver ClusterOperator is -// healthy (Available=True, Progressing=False, Degraded=False). +// validateAPIServerHealth verifies that: +// 1. The API server is reachable from the RP (via /healthz) +// 2. The kube-apiserver ClusterOperator is healthy (Available=True, Progressing=False, Degraded=False) func validateAPIServerHealth(ctx context.Context, k adminactions.KubeActions) error { + if err := k.CheckAPIServerHealthz(ctx); err != nil { + return api.NewCloudError( + http.StatusServiceUnavailable, + api.CloudErrorCodeInternalServerError, "kube-apiserver", + fmt.Sprintf("API server is not reachable: %v", err)) + } + rawCO, err := k.KubeGet(ctx, "ClusterOperator.config.openshift.io", "", "kube-apiserver") if err != nil { return api.NewCloudError( @@ -249,6 +260,76 @@ func validateAPIServerHealth(ctx context.Context, k adminactions.KubeActions) er return nil } +func validateAPIServerPods(ctx context.Context, k adminactions.KubeActions) error { + const ( + kubeAPIServerNamespace = "openshift-kube-apiserver" + kubeAPIServerAppLabel = "openshift-kube-apiserver" + ) + + rawPods, err := k.KubeList(ctx, "Pod", kubeAPIServerNamespace) + if err != nil { + return api.NewCloudError( + http.StatusInternalServerError, + api.CloudErrorCodeInternalServerError, "kube-apiserver-pods", + fmt.Sprintf("Failed to list pods in %s namespace: %v", kubeAPIServerNamespace, err)) + } + + var podList corev1.PodList + if err := json.Unmarshal(rawPods, &podList); err != nil { + return api.NewCloudError( + http.StatusInternalServerError, + api.CloudErrorCodeInternalServerError, "kube-apiserver-pods", + fmt.Sprintf("Failed to parse pod list: %v", err)) + } + + var apiServerPodCount int + var unhealthyPods []string + for _, pod := range podList.Items { + if pod.Labels["app"] != kubeAPIServerAppLabel { + continue + } + + apiServerPodCount++ + + if healthy, reason := isPodHealthy(&pod); !healthy { + unhealthyPods = append(unhealthyPods, fmt.Sprintf("%s (%s)", pod.Name, reason)) + } + } + + if apiServerPodCount != api.ControlPlaneNodeCount { + return api.NewCloudError( + http.StatusConflict, + api.CloudErrorCodeRequestNotAllowed, "kube-apiserver-pods", + fmt.Sprintf("Expected %d kube-apiserver pods, found %d. Resize is not safe without full API server redundancy.", + api.ControlPlaneNodeCount, apiServerPodCount)) + } + + if len(unhealthyPods) > 0 { + return api.NewCloudError( + http.StatusConflict, + api.CloudErrorCodeRequestNotAllowed, "kube-apiserver-pods", + fmt.Sprintf("Unhealthy kube-apiserver pods: %v. Resize is not safe without full API server redundancy.", + unhealthyPods)) + } + + return nil +} + +func isPodHealthy(pod *corev1.Pod) (healthy bool, reason string) { + if pod.Status.Phase != corev1.PodRunning { + return false, fmt.Sprintf("phase: %s", pod.Status.Phase) + } + for _, cond := range pod.Status.Conditions { + if cond.Type == corev1.PodReady { + if cond.Status != corev1.ConditionTrue { + return false, "not ready" + } + return true, "" + } + } + return false, "Ready condition not found" +} + // validateEtcdHealth verifies that the etcd ClusterOperator is healthy. // Resizing takes a master offline, so all etcd members must be healthy. func validateEtcdHealth(ctx context.Context, k adminactions.KubeActions) error { diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go index 0b4f08fad8e..f6ff6f90ea3 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go @@ -14,6 +14,7 @@ import ( "github.com/sirupsen/logrus" "go.uber.org/mock/gomock" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" @@ -77,11 +78,52 @@ func healthyEtcdJSON() []byte { }) } +func fakeAPIServerPodListJSON(pods []corev1.Pod) []byte { + podList := corev1.PodList{Items: pods} + b, _ := json.Marshal(podList) + return b +} + +func healthyAPIServerPod(name, nodeName string) corev1.Pod { + return corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "openshift-kube-apiserver", + Labels: map[string]string{"app": "openshift-kube-apiserver"}, + }, + Spec: corev1.PodSpec{ + NodeName: nodeName, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.PodReady, Status: corev1.ConditionTrue}, + }, + }, + } +} + +func healthyAPIServerPodsJSON() []byte { + return fakeAPIServerPodListJSON([]corev1.Pod{ + healthyAPIServerPod("kube-apiserver-master-0", "master-0"), + healthyAPIServerPod("kube-apiserver-master-1", "master-1"), + healthyAPIServerPod("kube-apiserver-master-2", "master-2"), + }) +} + func allKubeChecksHealthyMock(k *mock_adminactions.MockKubeActions) { + k.EXPECT(). + CheckAPIServerHealthz(gomock.Any()). + Return(nil). + AnyTimes() k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "kube-apiserver"). Return(healthyKubeAPIServerJSON(), nil). AnyTimes() + k.EXPECT(). + KubeList(gomock.Any(), "Pod", "openshift-kube-apiserver"). + Return(healthyAPIServerPodsJSON(), nil). + AnyTimes() k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "etcd"). Return(healthyEtcdJSON(), nil). @@ -751,14 +793,29 @@ func TestValidateAPIServerHealth(t *testing.T) { { name: "healthy kube-apiserver", mocks: func(k *mock_adminactions.MockKubeActions) { + k.EXPECT(). + CheckAPIServerHealthz(gomock.Any()). + Return(nil) k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "kube-apiserver"). Return(healthyKubeAPIServerJSON(), nil) }, }, + { + name: "healthz check fails", + mocks: func(k *mock_adminactions.MockKubeActions) { + k.EXPECT(). + CheckAPIServerHealthz(gomock.Any()). + Return(fmt.Errorf("connection refused")) + }, + wantErr: "503: InternalServerError: kube-apiserver: API server is not reachable: connection refused", + }, { name: "kube-apiserver degraded", mocks: func(k *mock_adminactions.MockKubeActions) { + k.EXPECT(). + CheckAPIServerHealthz(gomock.Any()). + Return(nil) k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "kube-apiserver"). Return(fakeClusterOperatorJSON("kube-apiserver", []configv1.ClusterOperatorStatusCondition{ @@ -772,6 +829,9 @@ func TestValidateAPIServerHealth(t *testing.T) { { name: "kube-apiserver unavailable", mocks: func(k *mock_adminactions.MockKubeActions) { + k.EXPECT(). + CheckAPIServerHealthz(gomock.Any()). + Return(nil) k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "kube-apiserver"). Return(fakeClusterOperatorJSON("kube-apiserver", []configv1.ClusterOperatorStatusCondition{ @@ -785,6 +845,9 @@ func TestValidateAPIServerHealth(t *testing.T) { { name: "KubeGet returns error", mocks: func(k *mock_adminactions.MockKubeActions) { + k.EXPECT(). + CheckAPIServerHealthz(gomock.Any()). + Return(nil) k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "kube-apiserver"). Return(nil, fmt.Errorf("connection refused")) @@ -794,6 +857,9 @@ func TestValidateAPIServerHealth(t *testing.T) { { name: "KubeGet returns invalid JSON", mocks: func(k *mock_adminactions.MockKubeActions) { + k.EXPECT(). + CheckAPIServerHealthz(gomock.Any()). + Return(nil) k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "kube-apiserver"). Return([]byte(`{invalid`), nil) @@ -887,3 +953,184 @@ func TestValidateEtcdHealth(t *testing.T) { }) } } + +func TestValidateAPIServerPods(t *testing.T) { + ctx := context.Background() + + for _, tt := range []struct { + name string + mocks func(*mock_adminactions.MockKubeActions) + wantErr string + }{ + { + name: "all pods healthy", + mocks: func(k *mock_adminactions.MockKubeActions) { + k.EXPECT(). + KubeList(gomock.Any(), "Pod", "openshift-kube-apiserver"). + Return(healthyAPIServerPodsJSON(), nil) + }, + }, + { + name: "KubeList returns error", + mocks: func(k *mock_adminactions.MockKubeActions) { + k.EXPECT(). + KubeList(gomock.Any(), "Pod", "openshift-kube-apiserver"). + Return(nil, fmt.Errorf("connection refused")) + }, + wantErr: "500: InternalServerError: kube-apiserver-pods: Failed to list pods in openshift-kube-apiserver namespace: connection refused", + }, + { + name: "KubeList returns invalid JSON", + mocks: func(k *mock_adminactions.MockKubeActions) { + k.EXPECT(). + KubeList(gomock.Any(), "Pod", "openshift-kube-apiserver"). + Return([]byte(`{invalid`), nil) + }, + wantErr: "500: InternalServerError: kube-apiserver-pods: Failed to parse pod list: invalid character 'i' looking for beginning of object key string", + }, + { + name: "only 2 apiserver pods", + mocks: func(k *mock_adminactions.MockKubeActions) { + k.EXPECT(). + KubeList(gomock.Any(), "Pod", "openshift-kube-apiserver"). + Return(fakeAPIServerPodListJSON([]corev1.Pod{ + healthyAPIServerPod("kube-apiserver-master-0", "master-0"), + healthyAPIServerPod("kube-apiserver-master-1", "master-1"), + }), nil) + }, + wantErr: "409: RequestNotAllowed: kube-apiserver-pods: Expected 3 kube-apiserver pods, found 2. Resize is not safe without full API server redundancy.", + }, + { + name: "4 apiserver pods", + mocks: func(k *mock_adminactions.MockKubeActions) { + k.EXPECT(). + KubeList(gomock.Any(), "Pod", "openshift-kube-apiserver"). + Return(fakeAPIServerPodListJSON([]corev1.Pod{ + healthyAPIServerPod("kube-apiserver-master-0", "master-0"), + healthyAPIServerPod("kube-apiserver-master-1", "master-1"), + healthyAPIServerPod("kube-apiserver-master-2", "master-2"), + healthyAPIServerPod("kube-apiserver-master-3", "master-3"), + }), nil) + }, + wantErr: "409: RequestNotAllowed: kube-apiserver-pods: Expected 3 kube-apiserver pods, found 4. Resize is not safe without full API server redundancy.", + }, + { + name: "one pod not running", + mocks: func(k *mock_adminactions.MockKubeActions) { + pods := []corev1.Pod{ + healthyAPIServerPod("kube-apiserver-master-0", "master-0"), + healthyAPIServerPod("kube-apiserver-master-1", "master-1"), + { + ObjectMeta: metav1.ObjectMeta{ + Name: "kube-apiserver-master-2", + Namespace: "openshift-kube-apiserver", + Labels: map[string]string{"app": "openshift-kube-apiserver"}, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodPending, + }, + }, + } + k.EXPECT(). + KubeList(gomock.Any(), "Pod", "openshift-kube-apiserver"). + Return(fakeAPIServerPodListJSON(pods), nil) + }, + wantErr: "409: RequestNotAllowed: kube-apiserver-pods: Unhealthy kube-apiserver pods: [kube-apiserver-master-2 (phase: Pending)]. Resize is not safe without full API server redundancy.", + }, + { + name: "one pod not ready", + mocks: func(k *mock_adminactions.MockKubeActions) { + pods := []corev1.Pod{ + healthyAPIServerPod("kube-apiserver-master-0", "master-0"), + healthyAPIServerPod("kube-apiserver-master-1", "master-1"), + { + ObjectMeta: metav1.ObjectMeta{ + Name: "kube-apiserver-master-2", + Namespace: "openshift-kube-apiserver", + Labels: map[string]string{"app": "openshift-kube-apiserver"}, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.PodReady, Status: corev1.ConditionFalse}, + }, + }, + }, + } + k.EXPECT(). + KubeList(gomock.Any(), "Pod", "openshift-kube-apiserver"). + Return(fakeAPIServerPodListJSON(pods), nil) + }, + wantErr: "409: RequestNotAllowed: kube-apiserver-pods: Unhealthy kube-apiserver pods: [kube-apiserver-master-2 (not ready)]. Resize is not safe without full API server redundancy.", + }, + { + name: "multiple unhealthy pods", + mocks: func(k *mock_adminactions.MockKubeActions) { + pods := []corev1.Pod{ + healthyAPIServerPod("kube-apiserver-master-0", "master-0"), + { + ObjectMeta: metav1.ObjectMeta{ + Name: "kube-apiserver-master-1", + Namespace: "openshift-kube-apiserver", + Labels: map[string]string{"app": "openshift-kube-apiserver"}, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodFailed, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "kube-apiserver-master-2", + Namespace: "openshift-kube-apiserver", + Labels: map[string]string{"app": "openshift-kube-apiserver"}, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.PodReady, Status: corev1.ConditionFalse}, + }, + }, + }, + } + k.EXPECT(). + KubeList(gomock.Any(), "Pod", "openshift-kube-apiserver"). + Return(fakeAPIServerPodListJSON(pods), nil) + }, + wantErr: "409: RequestNotAllowed: kube-apiserver-pods: Unhealthy kube-apiserver pods: [kube-apiserver-master-1 (phase: Failed) kube-apiserver-master-2 (not ready)]. Resize is not safe without full API server redundancy.", + }, + { + name: "filters non-apiserver pods", + mocks: func(k *mock_adminactions.MockKubeActions) { + pods := []corev1.Pod{ + healthyAPIServerPod("kube-apiserver-master-0", "master-0"), + healthyAPIServerPod("kube-apiserver-master-1", "master-1"), + healthyAPIServerPod("kube-apiserver-master-2", "master-2"), + { + ObjectMeta: metav1.ObjectMeta{ + Name: "some-other-pod", + Namespace: "openshift-kube-apiserver", + Labels: map[string]string{"app": "other-app"}, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + }, + }, + } + k.EXPECT(). + KubeList(gomock.Any(), "Pod", "openshift-kube-apiserver"). + Return(fakeAPIServerPodListJSON(pods), nil) + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + controller := gomock.NewController(t) + defer controller.Finish() + + k := mock_adminactions.NewMockKubeActions(controller) + tt.mocks(k) + + err := validateAPIServerPods(ctx, k) + utilerror.AssertErrorMessage(t, err, tt.wantErr) + }) + } +} diff --git a/pkg/frontend/adminactions/kubeactions.go b/pkg/frontend/adminactions/kubeactions.go index 54271b99cc4..6d749ca8426 100644 --- a/pkg/frontend/adminactions/kubeactions.go +++ b/pkg/frontend/adminactions/kubeactions.go @@ -19,6 +19,7 @@ import ( "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" restclient "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client/apiutil" @@ -46,13 +47,15 @@ type KubeActions interface { // Fetch top pods and nodes metrics TopPods(ctx context.Context, restConfig *restclient.Config, allNamespaces bool) ([]PodMetrics, error) TopNodes(ctx context.Context, restConfig *restclient.Config) ([]NodeMetrics, error) + CheckAPIServerHealthz(ctx context.Context) error } type kubeActions struct { log *logrus.Entry oc *api.OpenShiftCluster - mapper meta.RESTMapper + mapper meta.RESTMapper + restConfig *restclient.Config dyn dynamic.Interface kubecli kubernetes.Interface @@ -84,7 +87,8 @@ func NewKubeActions(log *logrus.Entry, env env.Interface, oc *api.OpenShiftClust log: log, oc: oc, - mapper: mapper, + mapper: mapper, + restConfig: restConfig, dyn: dyn, kubecli: kubecli, @@ -187,3 +191,32 @@ func (k *kubeActions) KubeDelete(ctx context.Context, groupKind, namespace, name return k.dyn.Resource(gvr).Namespace(namespace).Delete(ctx, name, resourceDeleteOptions) } + +func (k *kubeActions) CheckAPIServerHealthz(ctx context.Context) error { + rawClientConfig := *k.restConfig + rawClientConfig.GroupVersion = &schema.GroupVersion{} + rawClientConfig.NegotiatedSerializer = scheme.Codecs.WithoutConversion() + rawClientConfig.UserAgent = restclient.DefaultKubernetesUserAgent() + + rawClient, err := restclient.RESTClientFor(&rawClientConfig) + if err != nil { + return fmt.Errorf("failed to create raw REST client: %w", err) + } + + var statusCode int + err = rawClient. + Get(). + AbsPath("/healthz"). + Do(ctx). + StatusCode(&statusCode). + Error() + if err != nil { + return fmt.Errorf("API server healthz check failed: %w", err) + } + + if statusCode != http.StatusOK { + return fmt.Errorf("API server healthz returned non-OK status: %d", statusCode) + } + + return nil +} diff --git a/pkg/util/mocks/adminactions/azureactions.go b/pkg/util/mocks/adminactions/azureactions.go index 174227938bc..66de3c91988 100644 --- a/pkg/util/mocks/adminactions/azureactions.go +++ b/pkg/util/mocks/adminactions/azureactions.go @@ -14,12 +14,11 @@ import ( io "io" reflect "reflect" - logrus "github.com/sirupsen/logrus" - gomock "go.uber.org/mock/gomock" - armcompute "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" compute "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-06-01/compute" features "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-07-01/features" + logrus "github.com/sirupsen/logrus" + gomock "go.uber.org/mock/gomock" ) // MockAzureActions is a mock of AzureActions interface. diff --git a/pkg/util/mocks/adminactions/kubeactions.go b/pkg/util/mocks/adminactions/kubeactions.go index 06cca7b1e17..af7ea6ab244 100644 --- a/pkg/util/mocks/adminactions/kubeactions.go +++ b/pkg/util/mocks/adminactions/kubeactions.go @@ -13,15 +13,13 @@ import ( context "context" reflect "reflect" + adminactions "github.com/Azure/ARO-RP/pkg/frontend/adminactions" gomock "go.uber.org/mock/gomock" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" unstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" schema "k8s.io/apimachinery/pkg/runtime/schema" watch "k8s.io/apimachinery/pkg/watch" rest "k8s.io/client-go/rest" - - adminactions "github.com/Azure/ARO-RP/pkg/frontend/adminactions" ) // MockKubeActions is a mock of KubeActions interface. @@ -76,6 +74,20 @@ func (mr *MockKubeActionsMockRecorder) ApproveCsr(ctx, csrName any) *gomock.Call return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ApproveCsr", reflect.TypeOf((*MockKubeActions)(nil).ApproveCsr), ctx, csrName) } +// CheckAPIServerHealthz mocks base method. +func (m *MockKubeActions) CheckAPIServerHealthz(ctx context.Context) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CheckAPIServerHealthz", ctx) + ret0, _ := ret[0].(error) + return ret0 +} + +// CheckAPIServerHealthz indicates an expected call of CheckAPIServerHealthz. +func (mr *MockKubeActionsMockRecorder) CheckAPIServerHealthz(ctx any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CheckAPIServerHealthz", reflect.TypeOf((*MockKubeActions)(nil).CheckAPIServerHealthz), ctx) +} + // CordonNode mocks base method. func (m *MockKubeActions) CordonNode(ctx context.Context, nodeName string, unschedulable bool) error { m.ctrl.T.Helper() From 98f5484f50edf460ba36847477cb414d3c44b24f Mon Sep 17 00:00:00 2001 From: Anton Asserzon Date: Thu, 26 Mar 2026 20:49:50 +0100 Subject: [PATCH 02/14] fix linting errors --- pkg/util/mocks/adminactions/azureactions.go | 5 +++-- pkg/util/mocks/adminactions/kubeactions.go | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pkg/util/mocks/adminactions/azureactions.go b/pkg/util/mocks/adminactions/azureactions.go index 66de3c91988..174227938bc 100644 --- a/pkg/util/mocks/adminactions/azureactions.go +++ b/pkg/util/mocks/adminactions/azureactions.go @@ -14,11 +14,12 @@ import ( io "io" reflect "reflect" + logrus "github.com/sirupsen/logrus" + gomock "go.uber.org/mock/gomock" + armcompute "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" compute "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-06-01/compute" features "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-07-01/features" - logrus "github.com/sirupsen/logrus" - gomock "go.uber.org/mock/gomock" ) // MockAzureActions is a mock of AzureActions interface. diff --git a/pkg/util/mocks/adminactions/kubeactions.go b/pkg/util/mocks/adminactions/kubeactions.go index af7ea6ab244..e2db16866bc 100644 --- a/pkg/util/mocks/adminactions/kubeactions.go +++ b/pkg/util/mocks/adminactions/kubeactions.go @@ -13,13 +13,15 @@ import ( context "context" reflect "reflect" - adminactions "github.com/Azure/ARO-RP/pkg/frontend/adminactions" gomock "go.uber.org/mock/gomock" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" unstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" schema "k8s.io/apimachinery/pkg/runtime/schema" watch "k8s.io/apimachinery/pkg/watch" rest "k8s.io/client-go/rest" + + adminactions "github.com/Azure/ARO-RP/pkg/frontend/adminactions" ) // MockKubeActions is a mock of KubeActions interface. From c7c5ca3ff0e186c79cf4cfc3c1820381900b9184 Mon Sep 17 00:00:00 2001 From: Anton Asserzon Date: Fri, 27 Mar 2026 15:07:09 +0100 Subject: [PATCH 03/14] Simplify CheckAPIServerHealthz to reuse existing discovery REST client --- pkg/frontend/adminactions/kubeactions.go | 31 ++++-------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/pkg/frontend/adminactions/kubeactions.go b/pkg/frontend/adminactions/kubeactions.go index 6d749ca8426..935531794bf 100644 --- a/pkg/frontend/adminactions/kubeactions.go +++ b/pkg/frontend/adminactions/kubeactions.go @@ -19,7 +19,6 @@ import ( "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/scheme" restclient "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client/apiutil" @@ -54,8 +53,7 @@ type kubeActions struct { log *logrus.Entry oc *api.OpenShiftCluster - mapper meta.RESTMapper - restConfig *restclient.Config + mapper meta.RESTMapper dyn dynamic.Interface kubecli kubernetes.Interface @@ -87,8 +85,7 @@ func NewKubeActions(log *logrus.Entry, env env.Interface, oc *api.OpenShiftClust log: log, oc: oc, - mapper: mapper, - restConfig: restConfig, + mapper: mapper, dyn: dyn, kubecli: kubecli, @@ -193,30 +190,12 @@ func (k *kubeActions) KubeDelete(ctx context.Context, groupKind, namespace, name } func (k *kubeActions) CheckAPIServerHealthz(ctx context.Context) error { - rawClientConfig := *k.restConfig - rawClientConfig.GroupVersion = &schema.GroupVersion{} - rawClientConfig.NegotiatedSerializer = scheme.Codecs.WithoutConversion() - rawClientConfig.UserAgent = restclient.DefaultKubernetesUserAgent() - - rawClient, err := restclient.RESTClientFor(&rawClientConfig) - if err != nil { - return fmt.Errorf("failed to create raw REST client: %w", err) - } - - var statusCode int - err = rawClient. - Get(). - AbsPath("/healthz"). - Do(ctx). - StatusCode(&statusCode). - Error() + body, err := k.kubecli.Discovery().RESTClient().Get().AbsPath("/healthz").Do(ctx).Raw() if err != nil { return fmt.Errorf("API server healthz check failed: %w", err) } - - if statusCode != http.StatusOK { - return fmt.Errorf("API server healthz returned non-OK status: %d", statusCode) + if string(body) != "ok" { + return fmt.Errorf("API server healthz returned: %s", string(body)) } - return nil } From 30d10ae40e56ba379c855acf98fe67a8fa420f13 Mon Sep 17 00:00:00 2001 From: Anton Asserzon Date: Fri, 27 Mar 2026 15:08:50 +0100 Subject: [PATCH 04/14] Remove redundant body check from CheckAPIServerHealthz --- pkg/frontend/adminactions/kubeactions.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pkg/frontend/adminactions/kubeactions.go b/pkg/frontend/adminactions/kubeactions.go index 935531794bf..8f49cbbe4c2 100644 --- a/pkg/frontend/adminactions/kubeactions.go +++ b/pkg/frontend/adminactions/kubeactions.go @@ -190,12 +190,9 @@ func (k *kubeActions) KubeDelete(ctx context.Context, groupKind, namespace, name } func (k *kubeActions) CheckAPIServerHealthz(ctx context.Context) error { - body, err := k.kubecli.Discovery().RESTClient().Get().AbsPath("/healthz").Do(ctx).Raw() + _, err := k.kubecli.Discovery().RESTClient().Get().AbsPath("/healthz").Do(ctx).Raw() if err != nil { return fmt.Errorf("API server healthz check failed: %w", err) } - if string(body) != "ok" { - return fmt.Errorf("API server healthz returned: %s", string(body)) - } return nil } From 7e500f401338e049f562cd3b2fc67827c9c51204 Mon Sep 17 00:00:00 2001 From: Anton Asserzon Date: Fri, 27 Mar 2026 15:14:44 +0100 Subject: [PATCH 05/14] Use /readyz endpoint instead of /healthz for API server check --- ...n_openshiftcluster_vmresize_pre_validation.go | 4 ++-- ...nshiftcluster_vmresize_pre_validation_test.go | 16 ++++++++-------- pkg/frontend/adminactions/kubeactions.go | 8 ++++---- pkg/util/mocks/adminactions/kubeactions.go | 12 ++++++------ 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go index 33b6e2bbb6f..79d2936d6da 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go @@ -223,10 +223,10 @@ func quotaCheckDisabled(_ context.Context, _ env.Interface, _ *api.SubscriptionD } // validateAPIServerHealth verifies that: -// 1. The API server is reachable from the RP (via /healthz) +// 1. The API server is reachable from the RP (via /readyz) // 2. The kube-apiserver ClusterOperator is healthy (Available=True, Progressing=False, Degraded=False) func validateAPIServerHealth(ctx context.Context, k adminactions.KubeActions) error { - if err := k.CheckAPIServerHealthz(ctx); err != nil { + if err := k.CheckAPIServerReadyz(ctx); err != nil { return api.NewCloudError( http.StatusServiceUnavailable, api.CloudErrorCodeInternalServerError, "kube-apiserver", diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go index f6ff6f90ea3..19fc8eade08 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go @@ -113,7 +113,7 @@ func healthyAPIServerPodsJSON() []byte { func allKubeChecksHealthyMock(k *mock_adminactions.MockKubeActions) { k.EXPECT(). - CheckAPIServerHealthz(gomock.Any()). + CheckAPIServerReadyz(gomock.Any()). Return(nil). AnyTimes() k.EXPECT(). @@ -794,7 +794,7 @@ func TestValidateAPIServerHealth(t *testing.T) { name: "healthy kube-apiserver", mocks: func(k *mock_adminactions.MockKubeActions) { k.EXPECT(). - CheckAPIServerHealthz(gomock.Any()). + CheckAPIServerReadyz(gomock.Any()). Return(nil) k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "kube-apiserver"). @@ -802,10 +802,10 @@ func TestValidateAPIServerHealth(t *testing.T) { }, }, { - name: "healthz check fails", + name: "readyz check fails", mocks: func(k *mock_adminactions.MockKubeActions) { k.EXPECT(). - CheckAPIServerHealthz(gomock.Any()). + CheckAPIServerReadyz(gomock.Any()). Return(fmt.Errorf("connection refused")) }, wantErr: "503: InternalServerError: kube-apiserver: API server is not reachable: connection refused", @@ -814,7 +814,7 @@ func TestValidateAPIServerHealth(t *testing.T) { name: "kube-apiserver degraded", mocks: func(k *mock_adminactions.MockKubeActions) { k.EXPECT(). - CheckAPIServerHealthz(gomock.Any()). + CheckAPIServerReadyz(gomock.Any()). Return(nil) k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "kube-apiserver"). @@ -830,7 +830,7 @@ func TestValidateAPIServerHealth(t *testing.T) { name: "kube-apiserver unavailable", mocks: func(k *mock_adminactions.MockKubeActions) { k.EXPECT(). - CheckAPIServerHealthz(gomock.Any()). + CheckAPIServerReadyz(gomock.Any()). Return(nil) k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "kube-apiserver"). @@ -846,7 +846,7 @@ func TestValidateAPIServerHealth(t *testing.T) { name: "KubeGet returns error", mocks: func(k *mock_adminactions.MockKubeActions) { k.EXPECT(). - CheckAPIServerHealthz(gomock.Any()). + CheckAPIServerReadyz(gomock.Any()). Return(nil) k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "kube-apiserver"). @@ -858,7 +858,7 @@ func TestValidateAPIServerHealth(t *testing.T) { name: "KubeGet returns invalid JSON", mocks: func(k *mock_adminactions.MockKubeActions) { k.EXPECT(). - CheckAPIServerHealthz(gomock.Any()). + CheckAPIServerReadyz(gomock.Any()). Return(nil) k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "kube-apiserver"). diff --git a/pkg/frontend/adminactions/kubeactions.go b/pkg/frontend/adminactions/kubeactions.go index 8f49cbbe4c2..ed0b101a985 100644 --- a/pkg/frontend/adminactions/kubeactions.go +++ b/pkg/frontend/adminactions/kubeactions.go @@ -46,7 +46,7 @@ type KubeActions interface { // Fetch top pods and nodes metrics TopPods(ctx context.Context, restConfig *restclient.Config, allNamespaces bool) ([]PodMetrics, error) TopNodes(ctx context.Context, restConfig *restclient.Config) ([]NodeMetrics, error) - CheckAPIServerHealthz(ctx context.Context) error + CheckAPIServerReadyz(ctx context.Context) error } type kubeActions struct { @@ -189,10 +189,10 @@ func (k *kubeActions) KubeDelete(ctx context.Context, groupKind, namespace, name return k.dyn.Resource(gvr).Namespace(namespace).Delete(ctx, name, resourceDeleteOptions) } -func (k *kubeActions) CheckAPIServerHealthz(ctx context.Context) error { - _, err := k.kubecli.Discovery().RESTClient().Get().AbsPath("/healthz").Do(ctx).Raw() +func (k *kubeActions) CheckAPIServerReadyz(ctx context.Context) error { + _, err := k.kubecli.Discovery().RESTClient().Get().AbsPath("/readyz").Do(ctx).Raw() if err != nil { - return fmt.Errorf("API server healthz check failed: %w", err) + return fmt.Errorf("API server readyz check failed: %w", err) } return nil } diff --git a/pkg/util/mocks/adminactions/kubeactions.go b/pkg/util/mocks/adminactions/kubeactions.go index e2db16866bc..a4df7db16f6 100644 --- a/pkg/util/mocks/adminactions/kubeactions.go +++ b/pkg/util/mocks/adminactions/kubeactions.go @@ -76,18 +76,18 @@ func (mr *MockKubeActionsMockRecorder) ApproveCsr(ctx, csrName any) *gomock.Call return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ApproveCsr", reflect.TypeOf((*MockKubeActions)(nil).ApproveCsr), ctx, csrName) } -// CheckAPIServerHealthz mocks base method. -func (m *MockKubeActions) CheckAPIServerHealthz(ctx context.Context) error { +// CheckAPIServerReadyz mocks base method. +func (m *MockKubeActions) CheckAPIServerReadyz(ctx context.Context) error { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "CheckAPIServerHealthz", ctx) + ret := m.ctrl.Call(m, "CheckAPIServerReadyz", ctx) ret0, _ := ret[0].(error) return ret0 } -// CheckAPIServerHealthz indicates an expected call of CheckAPIServerHealthz. -func (mr *MockKubeActionsMockRecorder) CheckAPIServerHealthz(ctx any) *gomock.Call { +// CheckAPIServerReadyz indicates an expected call of CheckAPIServerReadyz. +func (mr *MockKubeActionsMockRecorder) CheckAPIServerReadyz(ctx any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CheckAPIServerHealthz", reflect.TypeOf((*MockKubeActions)(nil).CheckAPIServerHealthz), ctx) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CheckAPIServerReadyz", reflect.TypeOf((*MockKubeActions)(nil).CheckAPIServerReadyz), ctx) } // CordonNode mocks base method. From c775bba877c54a37ec1d827b7f906cdc5e0347ee Mon Sep 17 00:00:00 2001 From: Anton Asserzon Date: Fri, 27 Mar 2026 15:25:14 +0100 Subject: [PATCH 06/14] Run readyz check as synchronous gate and group API server validations --- ...penshiftcluster_vmresize_pre_validation.go | 29 +++++---- ...iftcluster_vmresize_pre_validation_test.go | 63 ++++++++++++------- 2 files changed, 56 insertions(+), 36 deletions(-) diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go index 79d2936d6da..88394ba7965 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go @@ -107,11 +107,23 @@ func (f *frontend) _getPreResizeControlPlaneVMsValidation( } } + if err := k.CheckAPIServerReadyz(ctx); err != nil { + return nil, api.NewCloudError( + http.StatusServiceUnavailable, + api.CloudErrorCodeInternalServerError, "kube-apiserver", + fmt.Sprintf("API server is reporting a non-ready status: %v", err)) + } + var wg sync.WaitGroup wg.Go(func() { collect(f.validateVMSKU(ctx, doc, subscriptionDoc, desiredVMSize, log)) }) - wg.Go(func() { collect(validateAPIServerHealth(ctx, k)) }) - wg.Go(func() { collect(validateAPIServerPods(ctx, k)) }) + wg.Go(func() { + if err := validateAPIServerHealth(ctx, k); err != nil { + collect(err) + return + } + collect(validateAPIServerPods(ctx, k)) + }) wg.Go(func() { collect(validateEtcdHealth(ctx, k)) }) wg.Go(func() { collect(validateClusterSP(ctx, k)) }) @@ -222,17 +234,10 @@ func quotaCheckDisabled(_ context.Context, _ env.Interface, _ *api.SubscriptionD return nil } -// validateAPIServerHealth verifies that: -// 1. The API server is reachable from the RP (via /readyz) -// 2. The kube-apiserver ClusterOperator is healthy (Available=True, Progressing=False, Degraded=False) +// validateAPIServerHealth verifies that the kube-apiserver ClusterOperator is healthy +// (Available=True, Progressing=False, Degraded=False). +// Note: API server reachability is checked earlier via CheckAPIServerReadyz func validateAPIServerHealth(ctx context.Context, k adminactions.KubeActions) error { - if err := k.CheckAPIServerReadyz(ctx); err != nil { - return api.NewCloudError( - http.StatusServiceUnavailable, - api.CloudErrorCodeInternalServerError, "kube-apiserver", - fmt.Sprintf("API server is not reachable: %v", err)) - } - rawCO, err := k.KubeGet(ctx, "ClusterOperator.config.openshift.io", "", "kube-apiserver") if err != nil { return api.NewCloudError( diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go index 19fc8eade08..31761e1ded7 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go @@ -429,6 +429,45 @@ func TestPreResizeControlPlaneVMsValidation(t *testing.T) { wantStatusCode: http.StatusBadRequest, wantError: `400: InvalidParameter: : Pre-flight validation failed. Details: InvalidParameter: vmSize: The selected SKU 'Standard_D8s_v3' is restricted in region 'eastus' for selected subscription`, }, + { + name: "API server unreachable", + resourceID: testdatabase.GetResourcePath(mockSubID, "resourceName"), + vmSize: "Standard_D8s_v3", + fixture: func(f *testdatabase.Fixture) { + f.AddOpenShiftClusterDocuments(&api.OpenShiftClusterDocument{ + Key: strings.ToLower(testdatabase.GetResourcePath(mockSubID, "resourceName")), + OpenShiftCluster: &api.OpenShiftCluster{ + ID: testdatabase.GetResourcePath(mockSubID, "resourceName"), + Location: "eastus", + Properties: api.OpenShiftClusterProperties{ + MasterProfile: api.MasterProfile{ + VMSize: api.VMSizeStandardD8sV3, + }, + ClusterProfile: api.ClusterProfile{ + ResourceGroupID: fmt.Sprintf("/subscriptions/%s/resourceGroups/test-cluster", mockSubID), + }, + }, + }, + }) + f.AddSubscriptionDocuments(&api.SubscriptionDocument{ + ID: mockSubID, + Subscription: &api.Subscription{ + State: api.SubscriptionStateRegistered, + Properties: &api.SubscriptionProperties{ + TenantID: mockTenantID, + }, + }, + }) + }, + mocks: func(tt *test, a *mock_adminactions.MockAzureActions) {}, + kubeMocks: func(k *mock_adminactions.MockKubeActions) { + k.EXPECT(). + CheckAPIServerReadyz(gomock.Any()). + Return(fmt.Errorf("connection refused")) + }, + wantStatusCode: http.StatusServiceUnavailable, + wantError: `503: InternalServerError: kube-apiserver: API server is reporting a non-ready status: connection refused`, + }, } { t.Run(tt.name, func(t *testing.T) { ti := newTestInfra(t).WithSubscriptions().WithOpenShiftClusters() @@ -793,29 +832,14 @@ func TestValidateAPIServerHealth(t *testing.T) { { name: "healthy kube-apiserver", mocks: func(k *mock_adminactions.MockKubeActions) { - k.EXPECT(). - CheckAPIServerReadyz(gomock.Any()). - Return(nil) k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "kube-apiserver"). Return(healthyKubeAPIServerJSON(), nil) }, }, - { - name: "readyz check fails", - mocks: func(k *mock_adminactions.MockKubeActions) { - k.EXPECT(). - CheckAPIServerReadyz(gomock.Any()). - Return(fmt.Errorf("connection refused")) - }, - wantErr: "503: InternalServerError: kube-apiserver: API server is not reachable: connection refused", - }, { name: "kube-apiserver degraded", mocks: func(k *mock_adminactions.MockKubeActions) { - k.EXPECT(). - CheckAPIServerReadyz(gomock.Any()). - Return(nil) k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "kube-apiserver"). Return(fakeClusterOperatorJSON("kube-apiserver", []configv1.ClusterOperatorStatusCondition{ @@ -829,9 +853,6 @@ func TestValidateAPIServerHealth(t *testing.T) { { name: "kube-apiserver unavailable", mocks: func(k *mock_adminactions.MockKubeActions) { - k.EXPECT(). - CheckAPIServerReadyz(gomock.Any()). - Return(nil) k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "kube-apiserver"). Return(fakeClusterOperatorJSON("kube-apiserver", []configv1.ClusterOperatorStatusCondition{ @@ -845,9 +866,6 @@ func TestValidateAPIServerHealth(t *testing.T) { { name: "KubeGet returns error", mocks: func(k *mock_adminactions.MockKubeActions) { - k.EXPECT(). - CheckAPIServerReadyz(gomock.Any()). - Return(nil) k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "kube-apiserver"). Return(nil, fmt.Errorf("connection refused")) @@ -857,9 +875,6 @@ func TestValidateAPIServerHealth(t *testing.T) { { name: "KubeGet returns invalid JSON", mocks: func(k *mock_adminactions.MockKubeActions) { - k.EXPECT(). - CheckAPIServerReadyz(gomock.Any()). - Return(nil) k.EXPECT(). KubeGet(gomock.Any(), "ClusterOperator.config.openshift.io", "", "kube-apiserver"). Return([]byte(`{invalid`), nil) From d105bae903ccf5547da3b2bd3ef91f96ee262333 Mon Sep 17 00:00:00 2001 From: Anton Asserzon Date: Fri, 27 Mar 2026 15:34:42 +0100 Subject: [PATCH 07/14] Use 500 instead of 503 for API server non-ready status --- .../admin_openshiftcluster_vmresize_pre_validation.go | 2 +- .../admin_openshiftcluster_vmresize_pre_validation_test.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go index 88394ba7965..dfea044f352 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go @@ -109,7 +109,7 @@ func (f *frontend) _getPreResizeControlPlaneVMsValidation( if err := k.CheckAPIServerReadyz(ctx); err != nil { return nil, api.NewCloudError( - http.StatusServiceUnavailable, + http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "kube-apiserver", fmt.Sprintf("API server is reporting a non-ready status: %v", err)) } diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go index 31761e1ded7..34c80230de2 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go @@ -465,8 +465,8 @@ func TestPreResizeControlPlaneVMsValidation(t *testing.T) { CheckAPIServerReadyz(gomock.Any()). Return(fmt.Errorf("connection refused")) }, - wantStatusCode: http.StatusServiceUnavailable, - wantError: `503: InternalServerError: kube-apiserver: API server is reporting a non-ready status: connection refused`, + wantStatusCode: http.StatusInternalServerError, + wantError: `500: InternalServerError: kube-apiserver: API server is reporting a non-ready status: connection refused`, }, } { t.Run(tt.name, func(t *testing.T) { From 6d7175593423c5c442be21f3effaed49701ae708 Mon Sep 17 00:00:00 2001 From: Anton Asserzon Date: Tue, 7 Apr 2026 14:32:20 +0200 Subject: [PATCH 08/14] return structured JSON response from pre-resize validation endpoint --- pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go | 2 +- .../admin_openshiftcluster_vmresize_pre_validation_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go index 907f653eddf..a7cd3546fa6 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go @@ -139,7 +139,7 @@ func (f *frontend) _getPreResizeControlPlaneVMsValidation( } } - return json.Marshal("All pre-flight checks passed") + return json.Marshal(map[string]string{"status": "passed"}) } // defaultValidateResizeQuota creates an FP-authorized compute usage client and diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go index 46bf07a637f..77ebc89a8d4 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go @@ -202,7 +202,7 @@ func TestPreResizeControlPlaneVMsValidation(t *testing.T) { }, kubeMocks: allKubeChecksHealthyMock, wantStatusCode: http.StatusOK, - wantResponse: []byte(`"All pre-flight checks passed"` + "\n"), + wantResponse: []byte(`{"status":"passed"}` + "\n"), }, { name: "missing vmSize parameter", From 0d31ba6248bb54ac2db1df8d087b5fa2b95675db Mon Sep 17 00:00:00 2001 From: Anton Asserzon Date: Tue, 7 Apr 2026 14:58:07 +0200 Subject: [PATCH 09/14] include Degraded condition in ClusterOperator status text, fix validation error message --- ...penshiftcluster_vmresize_pre_validation.go | 2 +- ...iftcluster_vmresize_pre_validation_test.go | 8 ++-- .../steps/cluster/apiserver_is_up_test.go | 6 ++- pkg/util/clusteroperators/isavailable.go | 7 +++- pkg/util/clusteroperators/isavailable_test.go | 38 +++++++++++++------ 5 files changed, 42 insertions(+), 19 deletions(-) diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go index a7cd3546fa6..422fb387cc3 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go @@ -257,7 +257,7 @@ func validateAPIServerHealth(ctx context.Context, k adminactions.KubeActions) er return api.NewCloudError( http.StatusConflict, api.CloudErrorCodeRequestNotAllowed, "kube-apiserver", - fmt.Sprintf("kube-apiserver is not healthy: %s. Resize is not safe while the API server is degraded.", + fmt.Sprintf("kube-apiserver is not healthy: %s. Resize is not safe while the API server is unhealthy.", clusteroperators.OperatorStatusText(&co))) } diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go index 77ebc89a8d4..a46cb4a4d29 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go @@ -835,7 +835,7 @@ func TestValidateAPIServerHealth(t *testing.T) { {Type: configv1.OperatorDegraded, Status: configv1.ConditionTrue}, }), nil) }, - wantErr: "409: RequestNotAllowed: kube-apiserver: kube-apiserver is not healthy: kube-apiserver Available=True, Progressing=False. Resize is not safe while the API server is degraded.", + wantErr: "409: RequestNotAllowed: kube-apiserver: kube-apiserver is not healthy: kube-apiserver Available=True, Progressing=False, Degraded=True. Resize is not safe while the API server is unhealthy.", }, { name: "kube-apiserver unavailable", @@ -848,7 +848,7 @@ func TestValidateAPIServerHealth(t *testing.T) { {Type: configv1.OperatorDegraded, Status: configv1.ConditionFalse}, }), nil) }, - wantErr: "409: RequestNotAllowed: kube-apiserver: kube-apiserver is not healthy: kube-apiserver Available=False, Progressing=True. Resize is not safe while the API server is degraded.", + wantErr: "409: RequestNotAllowed: kube-apiserver: kube-apiserver is not healthy: kube-apiserver Available=False, Progressing=True, Degraded=False. Resize is not safe while the API server is unhealthy.", }, { name: "KubeGet returns error", @@ -909,7 +909,7 @@ func TestValidateEtcdHealth(t *testing.T) { {Type: configv1.OperatorDegraded, Status: configv1.ConditionTrue}, }), nil) }, - wantErr: "409: RequestNotAllowed: etcd: etcd is not healthy: etcd Available=True, Progressing=False. Resize is not safe while etcd quorum is at risk.", + wantErr: "409: RequestNotAllowed: etcd: etcd is not healthy: etcd Available=True, Progressing=False, Degraded=True. Resize is not safe while etcd quorum is at risk.", }, { name: "etcd unavailable", @@ -922,7 +922,7 @@ func TestValidateEtcdHealth(t *testing.T) { {Type: configv1.OperatorDegraded, Status: configv1.ConditionFalse}, }), nil) }, - wantErr: "409: RequestNotAllowed: etcd: etcd is not healthy: etcd Available=False, Progressing=True. Resize is not safe while etcd quorum is at risk.", + wantErr: "409: RequestNotAllowed: etcd: etcd is not healthy: etcd Available=False, Progressing=True, Degraded=False. Resize is not safe while etcd quorum is at risk.", }, { name: "KubeGet returns error", diff --git a/pkg/mimo/steps/cluster/apiserver_is_up_test.go b/pkg/mimo/steps/cluster/apiserver_is_up_test.go index e6ede0e0475..b1eae3b5be3 100644 --- a/pkg/mimo/steps/cluster/apiserver_is_up_test.go +++ b/pkg/mimo/steps/cluster/apiserver_is_up_test.go @@ -56,11 +56,15 @@ func TestAPIServerIsUp(t *testing.T) { Type: configv1.OperatorProgressing, Status: configv1.ConditionTrue, }, + { + Type: configv1.OperatorDegraded, + Status: configv1.ConditionFalse, + }, }, }, }, }, - wantErr: `TransientError: kube-apiserver Available=False, Progressing=True`, + wantErr: `TransientError: kube-apiserver Available=False, Progressing=True, Degraded=False`, }, { name: "ready", diff --git a/pkg/util/clusteroperators/isavailable.go b/pkg/util/clusteroperators/isavailable.go index 6c8d96beeaa..6ebd417012d 100644 --- a/pkg/util/clusteroperators/isavailable.go +++ b/pkg/util/clusteroperators/isavailable.go @@ -29,7 +29,10 @@ func OperatorStatusText(operator *configv1.ClusterOperator) string { for _, cond := range operator.Status.Conditions { m[cond.Type] = cond.Status } - return fmt.Sprintf("%s %s=%s, %s=%s", operator.Name, - configv1.OperatorAvailable, m[configv1.OperatorAvailable], configv1.OperatorProgressing, m[configv1.OperatorProgressing], + + return fmt.Sprintf("%s %s=%s, %s=%s, %s=%s", operator.Name, + configv1.OperatorAvailable, m[configv1.OperatorAvailable], + configv1.OperatorProgressing, m[configv1.OperatorProgressing], + configv1.OperatorDegraded, m[configv1.OperatorDegraded], ) } diff --git a/pkg/util/clusteroperators/isavailable_test.go b/pkg/util/clusteroperators/isavailable_test.go index a1e9f631f16..d6e1d31f873 100644 --- a/pkg/util/clusteroperators/isavailable_test.go +++ b/pkg/util/clusteroperators/isavailable_test.go @@ -69,31 +69,43 @@ func TestOperatorStatusText(t *testing.T) { name string availableCondition configv1.ConditionStatus progressingCondition configv1.ConditionStatus + degradedCondition configv1.ConditionStatus want string }{ { - name: "Available && Progressing; not available", + name: "Available && Progressing && !Degraded", availableCondition: configv1.ConditionTrue, progressingCondition: configv1.ConditionTrue, - want: "server Available=True, Progressing=True", + degradedCondition: configv1.ConditionFalse, + want: "server Available=True, Progressing=True, Degraded=False", }, { - name: "Available && !Progressing; available", + name: "Available && !Progressing && !Degraded; healthy", availableCondition: configv1.ConditionTrue, progressingCondition: configv1.ConditionFalse, - want: "server Available=True, Progressing=False", + degradedCondition: configv1.ConditionFalse, + want: "server Available=True, Progressing=False, Degraded=False", }, { - name: "!Available && Progressing; not available", + name: "!Available && Progressing && !Degraded", availableCondition: configv1.ConditionFalse, progressingCondition: configv1.ConditionTrue, - want: "server Available=False, Progressing=True", + degradedCondition: configv1.ConditionFalse, + want: "server Available=False, Progressing=True, Degraded=False", }, { - name: "!Available && !Progressing; not available", + name: "!Available && !Progressing && !Degraded", availableCondition: configv1.ConditionFalse, progressingCondition: configv1.ConditionFalse, - want: "server Available=False, Progressing=False", + degradedCondition: configv1.ConditionFalse, + want: "server Available=False, Progressing=False, Degraded=False", + }, + { + name: "Available && !Progressing && Degraded", + availableCondition: configv1.ConditionTrue, + progressingCondition: configv1.ConditionFalse, + degradedCondition: configv1.ConditionTrue, + want: "server Available=True, Progressing=False, Degraded=True", }, } { operator := &configv1.ClusterOperator{ @@ -110,12 +122,16 @@ func TestOperatorStatusText(t *testing.T) { Type: configv1.OperatorProgressing, Status: tt.progressingCondition, }, + { + Type: configv1.OperatorDegraded, + Status: tt.degradedCondition, + }, }, }, } - available := OperatorStatusText(operator) - if available != tt.want { - t.Error(available) + got := OperatorStatusText(operator) + if got != tt.want { + t.Errorf("%s: OperatorStatusText() = %q, want %q", tt.name, got, tt.want) } } } From 972416e7c3dc71d76f8d0892b04bc9e492527b53 Mon Sep 17 00:00:00 2001 From: Anton Asserzon Date: Wed, 8 Apr 2026 16:40:21 +0200 Subject: [PATCH 10/14] remove redundant return from _getPreResizeControlPlaneVMsValidation() --- ...penshiftcluster_vmresize_pre_validation.go | 22 +++++++++---------- ...iftcluster_vmresize_pre_validation_test.go | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go index 422fb387cc3..e20d1e63eba 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go @@ -45,9 +45,9 @@ func (f *frontend) getPreResizeControlPlaneVMsValidation(w http.ResponseWriter, resourceID := strings.TrimPrefix(r.URL.Path, "/admin") desiredVMSize := r.URL.Query().Get("vmSize") - b, err := f._getPreResizeControlPlaneVMsValidation(ctx, resType, resName, resGroupName, resourceID, desiredVMSize, log) + err := f._getPreResizeControlPlaneVMsValidation(ctx, resType, resName, resGroupName, resourceID, desiredVMSize, log) - adminReply(log, w, nil, b, err) + adminReply(log, w, nil, nil, err) } // _getPreResizeControlPlaneVMsValidation runs all pre-flight checks before @@ -57,31 +57,31 @@ func (f *frontend) _getPreResizeControlPlaneVMsValidation( ctx context.Context, resType, resName, resGroupName, resourceID, desiredVMSize string, log *logrus.Entry, -) ([]byte, error) { +) error { dbOpenShiftClusters, err := f.dbGroup.OpenShiftClusters() if err != nil { - return nil, api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", err.Error()) + return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", err.Error()) } doc, err := dbOpenShiftClusters.Get(ctx, resourceID) switch { case cosmosdb.IsErrorStatusCode(err, http.StatusNotFound): - return nil, api.NewCloudError(http.StatusNotFound, api.CloudErrorCodeResourceNotFound, "", + return api.NewCloudError(http.StatusNotFound, api.CloudErrorCodeResourceNotFound, "", fmt.Sprintf( "The Resource '%s/%s' under resource group '%s' was not found.", resType, resName, resGroupName)) case err != nil: - return nil, err + return err } subscriptionDoc, err := f.getSubscriptionDocument(ctx, doc.Key) if err != nil { - return nil, err + return err } k, err := f.kubeActionsFactory(log, f.env, doc.OpenShiftCluster) if err != nil { - return nil, err + return err } // Run checks in parallel, collecting all errors so the caller sees every failure at once. @@ -107,7 +107,7 @@ func (f *frontend) _getPreResizeControlPlaneVMsValidation( } if err := k.CheckAPIServerReadyz(ctx); err != nil { - return nil, api.NewCloudError( + return api.NewCloudError( http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "kube-apiserver", fmt.Sprintf("API server is reporting a non-ready status: %v", err)) @@ -129,7 +129,7 @@ func (f *frontend) _getPreResizeControlPlaneVMsValidation( wg.Wait() if len(details) > 0 { - return nil, &api.CloudError{ + return &api.CloudError{ StatusCode: http.StatusBadRequest, CloudErrorBody: &api.CloudErrorBody{ Code: api.CloudErrorCodeInvalidParameter, @@ -139,7 +139,7 @@ func (f *frontend) _getPreResizeControlPlaneVMsValidation( } } - return json.Marshal(map[string]string{"status": "passed"}) + return nil } // defaultValidateResizeQuota creates an FP-authorized compute usage client and diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go index a46cb4a4d29..2b0da8a9fbb 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go @@ -202,7 +202,7 @@ func TestPreResizeControlPlaneVMsValidation(t *testing.T) { }, kubeMocks: allKubeChecksHealthyMock, wantStatusCode: http.StatusOK, - wantResponse: []byte(`{"status":"passed"}` + "\n"), + wantResponse: nil, }, { name: "missing vmSize parameter", From a752b804a605fddab2423c4760dd69cfb49c9344 Mon Sep 17 00:00:00 2001 From: Anton Asserzon Date: Wed, 8 Apr 2026 16:48:42 +0200 Subject: [PATCH 11/14] use InternalServerError for all pre-flight validation failures --- ...penshiftcluster_vmresize_pre_validation.go | 20 ++++++++--------- ...iftcluster_vmresize_pre_validation_test.go | 22 +++++++++---------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go index e20d1e63eba..8157e5400e0 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go @@ -255,8 +255,8 @@ func validateAPIServerHealth(ctx context.Context, k adminactions.KubeActions) er if !clusteroperators.IsOperatorAvailable(&co) { return api.NewCloudError( - http.StatusConflict, - api.CloudErrorCodeRequestNotAllowed, "kube-apiserver", + http.StatusInternalServerError, + api.CloudErrorCodeInternalServerError, "kube-apiserver", fmt.Sprintf("kube-apiserver is not healthy: %s. Resize is not safe while the API server is unhealthy.", clusteroperators.OperatorStatusText(&co))) } @@ -302,16 +302,16 @@ func validateAPIServerPods(ctx context.Context, k adminactions.KubeActions) erro if apiServerPodCount != api.ControlPlaneNodeCount { return api.NewCloudError( - http.StatusConflict, - api.CloudErrorCodeRequestNotAllowed, "kube-apiserver-pods", + http.StatusInternalServerError, + api.CloudErrorCodeInternalServerError, "kube-apiserver-pods", fmt.Sprintf("Expected %d kube-apiserver pods, found %d. Resize is not safe without full API server redundancy.", api.ControlPlaneNodeCount, apiServerPodCount)) } if len(unhealthyPods) > 0 { return api.NewCloudError( - http.StatusConflict, - api.CloudErrorCodeRequestNotAllowed, "kube-apiserver-pods", + http.StatusInternalServerError, + api.CloudErrorCodeInternalServerError, "kube-apiserver-pods", fmt.Sprintf("Unhealthy kube-apiserver pods: %v. Resize is not safe without full API server redundancy.", unhealthyPods)) } @@ -355,8 +355,8 @@ func validateEtcdHealth(ctx context.Context, k adminactions.KubeActions) error { if !clusteroperators.IsOperatorAvailable(&co) { return api.NewCloudError( - http.StatusConflict, - api.CloudErrorCodeRequestNotAllowed, "etcd", + http.StatusInternalServerError, + api.CloudErrorCodeInternalServerError, "etcd", fmt.Sprintf("etcd is not healthy: %s. Resize is not safe while etcd quorum is at risk.", clusteroperators.OperatorStatusText(&co))) } @@ -389,14 +389,14 @@ func validateClusterSP(ctx context.Context, k adminactions.KubeActions) error { return nil } return api.NewCloudError( - http.StatusConflict, + http.StatusInternalServerError, api.CloudErrorCodeInvalidServicePrincipalCredentials, "servicePrincipal", fmt.Sprintf("Cluster Service Principal is invalid: %s", cond.Message)) } } return api.NewCloudError( - http.StatusConflict, + http.StatusInternalServerError, api.CloudErrorCodeInvalidServicePrincipalCredentials, "servicePrincipal", "ServicePrincipalValid condition not found on the ARO Cluster resource. The ARO operator may not have reconciled yet.") } diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go index 2b0da8a9fbb..3f858951630 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation_test.go @@ -765,7 +765,7 @@ func TestValidateVMSP(t *testing.T) { }, }), nil) }, - wantErr: "409: InvalidServicePrincipalCredentials: servicePrincipal: Cluster Service Principal is invalid: secret expired", + wantErr: "500: InvalidServicePrincipalCredentials: servicePrincipal: Cluster Service Principal is invalid: secret expired", }, { name: "condition not found", @@ -774,7 +774,7 @@ func TestValidateVMSP(t *testing.T) { KubeGet(gomock.Any(), "Cluster.aro.openshift.io", "", arov1alpha1.SingletonClusterName). Return(fakeAROClusterJSON([]operatorv1.OperatorCondition{}), nil) }, - wantErr: "409: InvalidServicePrincipalCredentials: servicePrincipal: ServicePrincipalValid condition not found on the ARO Cluster resource. The ARO operator may not have reconciled yet.", + wantErr: "500: InvalidServicePrincipalCredentials: servicePrincipal: ServicePrincipalValid condition not found on the ARO Cluster resource. The ARO operator may not have reconciled yet.", }, { name: "KubeGet returns error", @@ -835,7 +835,7 @@ func TestValidateAPIServerHealth(t *testing.T) { {Type: configv1.OperatorDegraded, Status: configv1.ConditionTrue}, }), nil) }, - wantErr: "409: RequestNotAllowed: kube-apiserver: kube-apiserver is not healthy: kube-apiserver Available=True, Progressing=False, Degraded=True. Resize is not safe while the API server is unhealthy.", + wantErr: "500: InternalServerError: kube-apiserver: kube-apiserver is not healthy: kube-apiserver Available=True, Progressing=False, Degraded=True. Resize is not safe while the API server is unhealthy.", }, { name: "kube-apiserver unavailable", @@ -848,7 +848,7 @@ func TestValidateAPIServerHealth(t *testing.T) { {Type: configv1.OperatorDegraded, Status: configv1.ConditionFalse}, }), nil) }, - wantErr: "409: RequestNotAllowed: kube-apiserver: kube-apiserver is not healthy: kube-apiserver Available=False, Progressing=True, Degraded=False. Resize is not safe while the API server is unhealthy.", + wantErr: "500: InternalServerError: kube-apiserver: kube-apiserver is not healthy: kube-apiserver Available=False, Progressing=True, Degraded=False. Resize is not safe while the API server is unhealthy.", }, { name: "KubeGet returns error", @@ -909,7 +909,7 @@ func TestValidateEtcdHealth(t *testing.T) { {Type: configv1.OperatorDegraded, Status: configv1.ConditionTrue}, }), nil) }, - wantErr: "409: RequestNotAllowed: etcd: etcd is not healthy: etcd Available=True, Progressing=False, Degraded=True. Resize is not safe while etcd quorum is at risk.", + wantErr: "500: InternalServerError: etcd: etcd is not healthy: etcd Available=True, Progressing=False, Degraded=True. Resize is not safe while etcd quorum is at risk.", }, { name: "etcd unavailable", @@ -922,7 +922,7 @@ func TestValidateEtcdHealth(t *testing.T) { {Type: configv1.OperatorDegraded, Status: configv1.ConditionFalse}, }), nil) }, - wantErr: "409: RequestNotAllowed: etcd: etcd is not healthy: etcd Available=False, Progressing=True, Degraded=False. Resize is not safe while etcd quorum is at risk.", + wantErr: "500: InternalServerError: etcd: etcd is not healthy: etcd Available=False, Progressing=True, Degraded=False. Resize is not safe while etcd quorum is at risk.", }, { name: "KubeGet returns error", @@ -1000,7 +1000,7 @@ func TestValidateAPIServerPods(t *testing.T) { healthyAPIServerPod("kube-apiserver-master-1", "master-1"), }), nil) }, - wantErr: "409: RequestNotAllowed: kube-apiserver-pods: Expected 3 kube-apiserver pods, found 2. Resize is not safe without full API server redundancy.", + wantErr: "500: InternalServerError: kube-apiserver-pods: Expected 3 kube-apiserver pods, found 2. Resize is not safe without full API server redundancy.", }, { name: "4 apiserver pods", @@ -1014,7 +1014,7 @@ func TestValidateAPIServerPods(t *testing.T) { healthyAPIServerPod("kube-apiserver-master-3", "master-3"), }), nil) }, - wantErr: "409: RequestNotAllowed: kube-apiserver-pods: Expected 3 kube-apiserver pods, found 4. Resize is not safe without full API server redundancy.", + wantErr: "500: InternalServerError: kube-apiserver-pods: Expected 3 kube-apiserver pods, found 4. Resize is not safe without full API server redundancy.", }, { name: "one pod not running", @@ -1037,7 +1037,7 @@ func TestValidateAPIServerPods(t *testing.T) { KubeList(gomock.Any(), "Pod", "openshift-kube-apiserver"). Return(fakeAPIServerPodListJSON(pods), nil) }, - wantErr: "409: RequestNotAllowed: kube-apiserver-pods: Unhealthy kube-apiserver pods: [kube-apiserver-master-2 (phase: Pending)]. Resize is not safe without full API server redundancy.", + wantErr: "500: InternalServerError: kube-apiserver-pods: Unhealthy kube-apiserver pods: [kube-apiserver-master-2 (phase: Pending)]. Resize is not safe without full API server redundancy.", }, { name: "one pod not ready", @@ -1063,7 +1063,7 @@ func TestValidateAPIServerPods(t *testing.T) { KubeList(gomock.Any(), "Pod", "openshift-kube-apiserver"). Return(fakeAPIServerPodListJSON(pods), nil) }, - wantErr: "409: RequestNotAllowed: kube-apiserver-pods: Unhealthy kube-apiserver pods: [kube-apiserver-master-2 (not ready)]. Resize is not safe without full API server redundancy.", + wantErr: "500: InternalServerError: kube-apiserver-pods: Unhealthy kube-apiserver pods: [kube-apiserver-master-2 (not ready)]. Resize is not safe without full API server redundancy.", }, { name: "multiple unhealthy pods", @@ -1098,7 +1098,7 @@ func TestValidateAPIServerPods(t *testing.T) { KubeList(gomock.Any(), "Pod", "openshift-kube-apiserver"). Return(fakeAPIServerPodListJSON(pods), nil) }, - wantErr: "409: RequestNotAllowed: kube-apiserver-pods: Unhealthy kube-apiserver pods: [kube-apiserver-master-1 (phase: Failed) kube-apiserver-master-2 (not ready)]. Resize is not safe without full API server redundancy.", + wantErr: "500: InternalServerError: kube-apiserver-pods: Unhealthy kube-apiserver pods: [kube-apiserver-master-1 (phase: Failed) kube-apiserver-master-2 (not ready)]. Resize is not safe without full API server redundancy.", }, { name: "filters non-apiserver pods", From c1d1e921dfbe291913f1c709c6d08e0f06d53427 Mon Sep 17 00:00:00 2001 From: Anton Asserzon Date: Wed, 8 Apr 2026 17:03:47 +0200 Subject: [PATCH 12/14] rename isPodHealthy() to validatePodHealth(), amend validatePodhealth() to only return error when needed --- ...min_openshiftcluster_vmresize_pre_validation.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go index 8157e5400e0..2ab9ddd4393 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go @@ -295,8 +295,8 @@ func validateAPIServerPods(ctx context.Context, k adminactions.KubeActions) erro apiServerPodCount++ - if healthy, reason := isPodHealthy(&pod); !healthy { - unhealthyPods = append(unhealthyPods, fmt.Sprintf("%s (%s)", pod.Name, reason)) + if err := validatePodHealth(&pod); err != nil { + unhealthyPods = append(unhealthyPods, fmt.Sprintf("%s (%s)", pod.Name, err.Error())) } } @@ -319,19 +319,19 @@ func validateAPIServerPods(ctx context.Context, k adminactions.KubeActions) erro return nil } -func isPodHealthy(pod *corev1.Pod) (healthy bool, reason string) { +func validatePodHealth(pod *corev1.Pod) error { if pod.Status.Phase != corev1.PodRunning { - return false, fmt.Sprintf("phase: %s", pod.Status.Phase) + return fmt.Errorf("phase: %s", pod.Status.Phase) } for _, cond := range pod.Status.Conditions { if cond.Type == corev1.PodReady { if cond.Status != corev1.ConditionTrue { - return false, "not ready" + return fmt.Errorf("not ready") } - return true, "" + return nil } } - return false, "Ready condition not found" + return fmt.Errorf("Ready condition not found") } // validateEtcdHealth verifies that the etcd ClusterOperator is healthy. From a5d7b2d6d98b5692f91a94d74618c5ba979357c0 Mon Sep 17 00:00:00 2001 From: Anton Asserzon Date: Wed, 8 Apr 2026 17:27:28 +0200 Subject: [PATCH 13/14] add a 1m timeout to _getPreResizeControlPlaneVMsValidation() --- .../admin_openshiftcluster_vmresize_pre_validation.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go index 2ab9ddd4393..ffcf7bc4e53 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go @@ -12,6 +12,7 @@ import ( "path/filepath" "strings" "sync" + "time" "github.com/go-chi/chi/v5" "github.com/sirupsen/logrus" @@ -58,6 +59,9 @@ func (f *frontend) _getPreResizeControlPlaneVMsValidation( resType, resName, resGroupName, resourceID, desiredVMSize string, log *logrus.Entry, ) error { + ctx, cancel := context.WithTimeout(ctx, time.Minute) + defer cancel() + dbOpenShiftClusters, err := f.dbGroup.OpenShiftClusters() if err != nil { return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", err.Error()) From 3e155b9d25f45be9c546977849ce4156f9e525ed Mon Sep 17 00:00:00 2001 From: Anton Asserzon Date: Wed, 8 Apr 2026 17:35:46 +0200 Subject: [PATCH 14/14] fix capital letter in error return to satisfy linter --- pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go index ffcf7bc4e53..d604870a4fd 100644 --- a/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go +++ b/pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go @@ -335,7 +335,7 @@ func validatePodHealth(pod *corev1.Pod) error { return nil } } - return fmt.Errorf("Ready condition not found") + return fmt.Errorf("ready condition not found") } // validateEtcdHealth verifies that the etcd ClusterOperator is healthy.