Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 91 additions & 5 deletions pkg/frontend/admin_openshiftcluster_vmresize_pre_validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ import (
"github.com/go-chi/chi/v5"
"github.com/sirupsen/logrus"

corev1 "k8s.io/api/core/v1"

configv1 "github.com/openshift/api/config/v1"
operatorv1 "github.com/openshift/api/operator/v1"

Expand Down Expand Up @@ -104,10 +106,23 @@ func (f *frontend) _getPreResizeControlPlaneVMsValidation(
}
}

if err := k.CheckAPIServerReadyz(ctx); err != nil {
return nil, api.NewCloudError(
http.StatusInternalServerError,
api.CloudErrorCodeInternalServerError, "kube-apiserver",
fmt.Sprintf("API server is reporting a non-ready status: %v", err))
}

var wg sync.WaitGroup

wg.Go(func() { collect(f.validateVMSKU(ctx, doc, subscriptionDoc, desiredVMSize, log)) })
wg.Go(func() { collect(validateAPIServerHealth(ctx, k)) })
wg.Go(func() {
if err := validateAPIServerHealth(ctx, k); err != nil {
collect(err)
return
}
collect(validateAPIServerPods(ctx, k))
})
wg.Go(func() { collect(validateEtcdHealth(ctx, k)) })
wg.Go(func() { collect(validateClusterSP(ctx, k)) })

Expand All @@ -124,7 +139,7 @@ func (f *frontend) _getPreResizeControlPlaneVMsValidation(
}
}

return json.Marshal("All pre-flight checks passed")
return json.Marshal(map[string]string{"status": "passed"})
}

// defaultValidateResizeQuota creates an FP-authorized compute usage client and
Expand Down Expand Up @@ -218,8 +233,9 @@ func quotaCheckDisabled(_ context.Context, _ env.Interface, _ *api.SubscriptionD
return nil
}

// validateAPIServerHealth verifies that the kube-apiserver ClusterOperator is
// healthy (Available=True, Progressing=False, Degraded=False).
// validateAPIServerHealth verifies that the kube-apiserver ClusterOperator is healthy
// (Available=True, Progressing=False, Degraded=False).
// Note: API server reachability is checked earlier via CheckAPIServerReadyz.
func validateAPIServerHealth(ctx context.Context, k adminactions.KubeActions) error {
rawCO, err := k.KubeGet(ctx, "ClusterOperator.config.openshift.io", "", "kube-apiserver")
if err != nil {
Expand All @@ -241,13 +257,83 @@ func validateAPIServerHealth(ctx context.Context, k adminactions.KubeActions) er
return api.NewCloudError(
http.StatusConflict,
api.CloudErrorCodeRequestNotAllowed, "kube-apiserver",
fmt.Sprintf("kube-apiserver is not healthy: %s. Resize is not safe while the API server is degraded.",
fmt.Sprintf("kube-apiserver is not healthy: %s. Resize is not safe while the API server is unhealthy.",
clusteroperators.OperatorStatusText(&co)))
}

return nil
}

// validateAPIServerPods lists the pods in the openshift-kube-apiserver
// namespace, keeps those whose "app" label equals openshift-kube-apiserver,
// and requires that exactly api.ControlPlaneNodeCount such pods exist and that
// isPodHealthy accepts every one of them.
//
// It returns nil when both conditions hold. A failure to list or decode the
// pods yields a CloudError with http.StatusInternalServerError; a wrong pod
// count or any unhealthy pod yields a CloudError with http.StatusConflict.
// The pod count is evaluated first, so a count mismatch is reported in
// preference to unhealthy pods.
func validateAPIServerPods(ctx context.Context, k adminactions.KubeActions) error {
	const (
		namespace = "openshift-kube-apiserver"
		appLabel  = "openshift-kube-apiserver"
		target    = "kube-apiserver-pods"
	)

	raw, err := k.KubeList(ctx, "Pod", namespace)
	if err != nil {
		return api.NewCloudError(
			http.StatusInternalServerError,
			api.CloudErrorCodeInternalServerError, target,
			fmt.Sprintf("Failed to list pods in %s namespace: %v", namespace, err))
	}

	pods := corev1.PodList{}
	if err = json.Unmarshal(raw, &pods); err != nil {
		return api.NewCloudError(
			http.StatusInternalServerError,
			api.CloudErrorCodeInternalServerError, target,
			fmt.Sprintf("Failed to parse pod list: %v", err))
	}

	matched := 0
	var notHealthy []string
	for i := range pods.Items {
		p := &pods.Items[i]
		// Other pods living in the namespace are not counted or inspected.
		if p.Labels["app"] != appLabel {
			continue
		}

		matched++

		ok, why := isPodHealthy(p)
		if ok {
			continue
		}
		notHealthy = append(notHealthy, fmt.Sprintf("%s (%s)", p.Name, why))
	}

	switch {
	case matched != api.ControlPlaneNodeCount:
		return api.NewCloudError(
			http.StatusConflict,
			api.CloudErrorCodeRequestNotAllowed, target,
			fmt.Sprintf("Expected %d kube-apiserver pods, found %d. Resize is not safe without full API server redundancy.",
				api.ControlPlaneNodeCount, matched))
	case len(notHealthy) > 0:
		return api.NewCloudError(
			http.StatusConflict,
			api.CloudErrorCodeRequestNotAllowed, target,
			fmt.Sprintf("Unhealthy kube-apiserver pods: %v. Resize is not safe without full API server redundancy.",
				notHealthy))
	}

	return nil
}

// isPodHealthy reports whether pod is in the Running phase and carries a
// PodReady condition whose status is True. When the pod is healthy, reason is
// empty; otherwise reason is a short description of the first check that
// failed (wrong phase, Ready condition not True, or Ready condition absent).
func isPodHealthy(pod *corev1.Pod) (healthy bool, reason string) {
	if phase := pod.Status.Phase; phase != corev1.PodRunning {
		return false, fmt.Sprintf("phase: %s", phase)
	}

	for i := range pod.Status.Conditions {
		c := &pod.Status.Conditions[i]
		if c.Type != corev1.PodReady {
			continue
		}
		// The first Ready condition encountered decides the outcome.
		if c.Status == corev1.ConditionTrue {
			return true, ""
		}
		return false, "not ready"
	}

	return false, "Ready condition not found"
}

// validateEtcdHealth verifies that the etcd ClusterOperator is healthy.
// Resizing takes a master offline, so all etcd members must be healthy.
func validateEtcdHealth(ctx context.Context, k adminactions.KubeActions) error {
Expand Down
Loading
Loading