From 68888293a3d6a6e383161796a51a2bde4a06dee2 Mon Sep 17 00:00:00 2001 From: iamjanr Date: Tue, 28 Apr 2026 13:56:28 +0200 Subject: [PATCH 01/10] [PLT-3962] Add MachinePool support for EKS in cloud-provisioner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Detect MP vs MD per node (node_image absence → MachinePool) - Generate AWSManagedMachinePool manifests for MP nodes - Add autoscaler RBAC for MachinePool resources - Add validation for EKS managed node group constraints - Update cluster struct to carry MP node kind info Install and MD lifecycle verified OK. MP scale/delete fix pending. Co-Authored-By: Claude Sonnet 4.6 --- .../actions/createworker/createworker.go | 21 +++++++++++++++---- ...-load-balancer-controller-helm-values.tmpl | 2 ++ .../templates/common/autoscaler_rbac.tmpl | 5 +++++ .../providers/docker/stratio/Dockerfile | 2 ++ pkg/cluster/internal/validate/aws.go | 6 ++++++ pkg/commons/cluster.go | 1 + 6 files changed, 33 insertions(+), 4 deletions(-) diff --git a/pkg/cluster/internal/create/actions/createworker/createworker.go b/pkg/cluster/internal/create/actions/createworker/createworker.go index e4571f0aba..473db552ed 100644 --- a/pkg/cluster/internal/create/actions/createworker/createworker.go +++ b/pkg/cluster/internal/create/actions/createworker/createworker.go @@ -168,9 +168,21 @@ func (a *action) Execute(ctx *actions.ActionContext) error { } awsEKSEnabled := a.keosCluster.Spec.InfraProvider == "aws" && a.keosCluster.Spec.ControlPlane.Managed - isMachinePool := a.keosCluster.Spec.InfraProvider != "aws" && a.keosCluster.Spec.ControlPlane.Managed gcpGKEEnabled := a.keosCluster.Spec.InfraProvider == "gcp" && a.keosCluster.Spec.ControlPlane.Managed + hasMachinePool := a.keosCluster.Spec.InfraProvider != "aws" && a.keosCluster.Spec.ControlPlane.Managed + hasMachineDeployment := false + if awsEKSEnabled { + for _, wn := range a.keosCluster.Spec.WorkerNodes { + if wn.NodeImage != "" { + hasMachineDeployment = true + } else { + hasMachinePool = true + } + } + } + isMachinePool := hasMachinePool && !hasMachineDeployment + var privateParams PrivateParams if a.clusterConfig != nil { privateParams = PrivateParams{ @@ -691,7 +703,7 @@ spec: } } - if isMachinePool { + if hasMachinePool { // Wait for all the machine pools to be ready c = "kubectl -n " + capiClustersNamespace + " wait --for=condition=Ready --timeout=15m --all mp" _, err = commons.ExecuteCommand(n, c, 5, 3) @@ -704,7 +716,8 @@ spec: if err != nil { return errors.Wrap(err, "failed to wait for container metrics to be available") } - } else { + } + if hasMachineDeployment { // Wait for all the machine deployments to be ready c = "kubectl -n " + capiClustersNamespace + " wait --for=condition=Ready --timeout=15m --all md" @@ -1020,7 +1033,7 @@ spec: ctx.Status.End(true) // End Installing StorageClass in workload cluster if !a.clusterConfig.Spec.GitOpsEnabled { - if a.keosCluster.Spec.DeployAutoscaler && !isMachinePool { + if a.keosCluster.Spec.DeployAutoscaler && (!isMachinePool || awsEKSEnabled) { ctx.Status.Start("Installing cluster-autoscaler in workload cluster 💻") defer ctx.Status.End(false) diff --git a/pkg/cluster/internal/create/actions/createworker/templates/aws/32/aws-load-balancer-controller-helm-values.tmpl b/pkg/cluster/internal/create/actions/createworker/templates/aws/32/aws-load-balancer-controller-helm-values.tmpl index b4c117273b..4f218c875c 100644 --- a/pkg/cluster/internal/create/actions/createworker/templates/aws/32/aws-load-balancer-controller-helm-values.tmpl +++ b/pkg/cluster/internal/create/actions/createworker/templates/aws/32/aws-load-balancer-controller-helm-values.tmpl @@ -1,3 +1,5 @@ +replicaCount: 1 + image: repository: {{ if $.Private }}{{ $.KeosRegUrl }}{{ else }}public.ecr.aws{{ end }}/eks/aws-load-balancer-controller #tag: v2.14.1 diff --git a/pkg/cluster/internal/create/actions/createworker/templates/common/autoscaler_rbac.tmpl b/pkg/cluster/internal/create/actions/createworker/templates/common/autoscaler_rbac.tmpl index af6d2cf35b..998e3b6cdd 100644 --- a/pkg/cluster/internal/create/actions/createworker/templates/common/autoscaler_rbac.tmpl +++ b/pkg/cluster/internal/create/actions/createworker/templates/common/autoscaler_rbac.tmpl @@ -9,7 +9,12 @@ rules: - apiGroups: - infrastructure.cluster.x-k8s.io resources: + {{- if and (eq $.Spec.InfraProvider "aws") $.Spec.ControlPlane.Managed }} + - awsmanagedmachinepools + - awsmachinetemplates + {{- else }} - {{ $.Spec.InfraProvider }}machinetemplates + {{- end }} verbs: - get - list diff --git a/pkg/cluster/internal/providers/docker/stratio/Dockerfile b/pkg/cluster/internal/providers/docker/stratio/Dockerfile index 77a2adc0f4..066c175a90 100644 --- a/pkg/cluster/internal/providers/docker/stratio/Dockerfile +++ b/pkg/cluster/internal/providers/docker/stratio/Dockerfile @@ -3,6 +3,8 @@ FROM kindest/node:v1.34.0 # Init feature gates ENV CLUSTER_TOPOLOGY=true ENV CLUSTERCTL_DISABLE_VERSIONCHECK=true +ENV EXP_MACHINE_POOL=true +ENV CAPA_EKS_ADD_ROLES=true # Core tool/version args ARG CLUSTERCTL=v1.10.8 diff --git a/pkg/cluster/internal/validate/aws.go b/pkg/cluster/internal/validate/aws.go index 358261cb0f..3981be0b4f 100644 --- a/pkg/cluster/internal/validate/aws.go +++ b/pkg/cluster/internal/validate/aws.go @@ -177,6 +177,12 @@ func validateAWS(spec commons.KeosSpec, providerSecrets map[string]string) error return errors.New("spec.worker_nodes." + wn.Name + ": \"node_image\": must have the format " + AWSNodeImageFormat) } } + if wn.AmiType != "" && wn.NodeImage != "" { + return errors.New("spec.worker_nodes." + wn.Name + ": ami_type and node_image are mutually exclusive") + } + if wn.AmiType != "" && !spec.ControlPlane.Managed { + return errors.New("spec.worker_nodes." + wn.Name + ": ami_type is only valid for EKS managed clusters") + } if wn.AZ != "" { if len(azs) > 0 { if !commons.Contains(azs, wn.AZ) { diff --git a/pkg/commons/cluster.go b/pkg/commons/cluster.go index c80415c0bc..7739bf8f12 100644 --- a/pkg/commons/cluster.go +++ b/pkg/commons/cluster.go @@ -295,6 +295,7 @@ type Security struct { type WorkerNodes []struct { Name string `yaml:"name" validate:"required"` NodeImage string `yaml:"node_image,omitempty"` + AmiType string `yaml:"ami_type,omitempty" validate:"omitempty,oneof=BOTTLEROCKET_x86_64"` Quantity *int `yaml:"quantity" validate:"required,numeric,gte=0"` Size string `yaml:"size" validate:"required"` ZoneDistribution string `yaml:"zone_distribution,omitempty" validate:"omitempty,oneof='balanced' 'unbalanced'"` From 255eb5a2b72ede33de23e663bb57b082ba9abe53 Mon Sep 17 00:00:00 2001 From: iamjanr Date: Mon, 4 May 2026 16:13:19 +0200 Subject: [PATCH 02/10] [PLT-3962] Add DescribeListenerAttributes to LB controller IRSA policy doc aws-load-balancer-controller v2.14.x calls DescribeListenerAttributes during NLB reconciliation. The IRSA policy reference in the installation guide was missing this action, causing AccessDenied errors in controller logs. Co-Authored-By: Claude Sonnet 4.6 --- .../en/modules/operations-manual/pages/installation.adoc | 1 + .../es/modules/operations-manual/pages/installation.adoc | 1 + 2 files changed, 2 insertions(+) diff --git a/stratio-docs/en/modules/operations-manual/pages/installation.adoc b/stratio-docs/en/modules/operations-manual/pages/installation.adoc index 668c485080..f34302134b 100644 --- a/stratio-docs/en/modules/operations-manual/pages/installation.adoc +++ b/stratio-docs/en/modules/operations-manual/pages/installation.adoc @@ -1107,6 +1107,7 @@ $ cat << EOF > policy.json "ec2:DescribeSecurityGroups", "ec2:DescribeSubnets", "elasticloadbalancing:DescribeListeners", + "elasticloadbalancing:DescribeListenerAttributes", "elasticloadbalancing:DescribeLoadBalancers", "elasticloadbalancing:DescribeLoadBalancerAttributes", "elasticloadbalancing:DescribeRules", diff --git a/stratio-docs/es/modules/operations-manual/pages/installation.adoc b/stratio-docs/es/modules/operations-manual/pages/installation.adoc index fd5aa9a1d3..ca1fc56b50 100644 --- a/stratio-docs/es/modules/operations-manual/pages/installation.adoc +++ b/stratio-docs/es/modules/operations-manual/pages/installation.adoc @@ -1107,6 +1107,7 @@ $ cat << EOF > policy.json "ec2:DescribeSecurityGroups", "ec2:DescribeSubnets", "elasticloadbalancing:DescribeListeners", + "elasticloadbalancing:DescribeListenerAttributes", "elasticloadbalancing:DescribeLoadBalancers", "elasticloadbalancing:DescribeLoadBalancerAttributes", "elasticloadbalancing:DescribeRules", From 0ed678e79b14f976fcb8de9c3ce383c25473cd56 Mon Sep 17 00:00:00 2001 From: iamjanr Date: Mon, 4 May 2026 17:26:23 +0200 Subject: [PATCH 03/10] [PLT-3962] Add MachinePool IAM permissions to stratio-eks-policy Add CAPALaunchTemplates and CAPAAutoScalingGroups statements required for the deployment user when managing EKS MachinePools (managed node groups). Also add iam:UntagRole alongside the existing iam:TagRole. Co-Authored-By: Claude Sonnet 4.6 --- .../attachments/stratio-eks-policy.json | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/stratio-docs/es/modules/operations-manual/assets/attachments/stratio-eks-policy.json b/stratio-docs/es/modules/operations-manual/assets/attachments/stratio-eks-policy.json index d40fd72011..3226549574 100644 --- a/stratio-docs/es/modules/operations-manual/assets/attachments/stratio-eks-policy.json +++ b/stratio-docs/es/modules/operations-manual/assets/attachments/stratio-eks-policy.json @@ -114,12 +114,45 @@ "iam:TagOpenIDConnectProvider", "iam:ListAttachedRolePolicies", "iam:CreateRole", - "iam:TagRole" + "iam:TagRole", + "iam:UntagRole" ], "Resource": [ "arn:aws:iam::${AWS_ACCOUNT_ID}:role/*", "arn:aws:iam::${AWS_ACCOUNT_ID}:oidc-provider/*" ] + }, + { + "Sid": "CAPALaunchTemplates", + "Effect": "Allow", + "Action": [ + "ec2:CreateLaunchTemplate", + "ec2:CreateLaunchTemplateVersion", + "ec2:DescribeLaunchTemplates", + "ec2:DescribeLaunchTemplateVersions", + "ec2:DeleteLaunchTemplate", + "ec2:DeleteLaunchTemplateVersions", + "ec2:DescribeKeyPairs", + "eks:TagResource", + "eks:UntagResource", + "eks:UpdateNodegroupConfig", + "iam:TagRole", + "iam:UntagRole" + ], + "Resource": "*" + }, + { + "Sid": "CAPAAutoScalingGroups", + "Effect": "Allow", + "Action": [ + "autoscaling:CreateAutoScalingGroup", + "autoscaling:UpdateAutoScalingGroup", + "autoscaling:CreateOrUpdateTags", + "autoscaling:StartInstanceRefresh", + "autoscaling:DeleteAutoScalingGroup", + "autoscaling:DeleteTags" + ], + "Resource": "arn:aws:autoscaling:*:${AWS_ACCOUNT_ID}:autoScalingGroup:*:autoScalingGroupName/*" } ] } From 99d67559eccca3dba15a6a1b557177ee59f78be5 Mon Sep 17 00:00:00 2001 From: iamjanr Date: Mon, 4 May 2026 17:28:05 +0200 Subject: [PATCH 04/10] [PLT-3962] Sync MachinePool IAM permissions to all policy doc copies Co-Authored-By: Claude Sonnet 4.6 --- .../attachments/stratio-eks-policy.json | 35 ++++++++++++++++++- .../attachments/stratio-eks-policy.json | 35 ++++++++++++++++++- .../attachments/stratio-eks-policy.json | 35 ++++++++++++++++++- 3 files changed, 102 insertions(+), 3 deletions(-) diff --git a/stratio-docs/en/modules/ROOT/assets/attachments/stratio-eks-policy.json b/stratio-docs/en/modules/ROOT/assets/attachments/stratio-eks-policy.json index d40fd72011..3226549574 100644 --- a/stratio-docs/en/modules/ROOT/assets/attachments/stratio-eks-policy.json +++ b/stratio-docs/en/modules/ROOT/assets/attachments/stratio-eks-policy.json @@ -114,12 +114,45 @@ "iam:TagOpenIDConnectProvider", "iam:ListAttachedRolePolicies", "iam:CreateRole", - "iam:TagRole" + "iam:TagRole", + "iam:UntagRole" ], "Resource": [ "arn:aws:iam::${AWS_ACCOUNT_ID}:role/*", "arn:aws:iam::${AWS_ACCOUNT_ID}:oidc-provider/*" ] + }, + { + "Sid": "CAPALaunchTemplates", + "Effect": "Allow", + "Action": [ + "ec2:CreateLaunchTemplate", + "ec2:CreateLaunchTemplateVersion", + "ec2:DescribeLaunchTemplates", + "ec2:DescribeLaunchTemplateVersions", + "ec2:DeleteLaunchTemplate", + "ec2:DeleteLaunchTemplateVersions", + "ec2:DescribeKeyPairs", + "eks:TagResource", + "eks:UntagResource", + "eks:UpdateNodegroupConfig", + "iam:TagRole", + "iam:UntagRole" + ], + "Resource": "*" + }, + { + "Sid": "CAPAAutoScalingGroups", + "Effect": "Allow", + "Action": [ + "autoscaling:CreateAutoScalingGroup", + "autoscaling:UpdateAutoScalingGroup", + "autoscaling:CreateOrUpdateTags", + "autoscaling:StartInstanceRefresh", + "autoscaling:DeleteAutoScalingGroup", + "autoscaling:DeleteTags" + ], + "Resource": "arn:aws:autoscaling:*:${AWS_ACCOUNT_ID}:autoScalingGroup:*:autoScalingGroupName/*" } ] } diff --git a/stratio-docs/en/modules/operations-manual/assets/attachments/stratio-eks-policy.json b/stratio-docs/en/modules/operations-manual/assets/attachments/stratio-eks-policy.json index d40fd72011..3226549574 100644 --- a/stratio-docs/en/modules/operations-manual/assets/attachments/stratio-eks-policy.json +++ b/stratio-docs/en/modules/operations-manual/assets/attachments/stratio-eks-policy.json @@ -114,12 +114,45 @@ "iam:TagOpenIDConnectProvider", "iam:ListAttachedRolePolicies", "iam:CreateRole", - "iam:TagRole" + "iam:TagRole", + "iam:UntagRole" ], "Resource": [ "arn:aws:iam::${AWS_ACCOUNT_ID}:role/*", "arn:aws:iam::${AWS_ACCOUNT_ID}:oidc-provider/*" ] + }, + { + "Sid": "CAPALaunchTemplates", + "Effect": "Allow", + "Action": [ + "ec2:CreateLaunchTemplate", + "ec2:CreateLaunchTemplateVersion", + "ec2:DescribeLaunchTemplates", + "ec2:DescribeLaunchTemplateVersions", + "ec2:DeleteLaunchTemplate", + "ec2:DeleteLaunchTemplateVersions", + "ec2:DescribeKeyPairs", + "eks:TagResource", + "eks:UntagResource", + "eks:UpdateNodegroupConfig", + "iam:TagRole", + "iam:UntagRole" + ], + "Resource": "*" + }, + { + "Sid": "CAPAAutoScalingGroups", + "Effect": "Allow", + "Action": [ + "autoscaling:CreateAutoScalingGroup", + "autoscaling:UpdateAutoScalingGroup", + "autoscaling:CreateOrUpdateTags", + "autoscaling:StartInstanceRefresh", + "autoscaling:DeleteAutoScalingGroup", + "autoscaling:DeleteTags" + ], + "Resource": "arn:aws:autoscaling:*:${AWS_ACCOUNT_ID}:autoScalingGroup:*:autoScalingGroupName/*" } ] } diff --git a/stratio-docs/es/modules/ROOT/assets/attachments/stratio-eks-policy.json b/stratio-docs/es/modules/ROOT/assets/attachments/stratio-eks-policy.json index d40fd72011..3226549574 100644 --- a/stratio-docs/es/modules/ROOT/assets/attachments/stratio-eks-policy.json +++ b/stratio-docs/es/modules/ROOT/assets/attachments/stratio-eks-policy.json @@ -114,12 +114,45 @@ "iam:TagOpenIDConnectProvider", "iam:ListAttachedRolePolicies", "iam:CreateRole", - "iam:TagRole" + "iam:TagRole", + "iam:UntagRole" ], "Resource": [ "arn:aws:iam::${AWS_ACCOUNT_ID}:role/*", "arn:aws:iam::${AWS_ACCOUNT_ID}:oidc-provider/*" ] + }, + { + "Sid": "CAPALaunchTemplates", + "Effect": "Allow", + "Action": [ + "ec2:CreateLaunchTemplate", + "ec2:CreateLaunchTemplateVersion", + "ec2:DescribeLaunchTemplates", + "ec2:DescribeLaunchTemplateVersions", + "ec2:DeleteLaunchTemplate", + "ec2:DeleteLaunchTemplateVersions", + "ec2:DescribeKeyPairs", + "eks:TagResource", + "eks:UntagResource", + "eks:UpdateNodegroupConfig", + "iam:TagRole", + "iam:UntagRole" + ], + "Resource": "*" + }, + { + "Sid": "CAPAAutoScalingGroups", + "Effect": "Allow", + "Action": [ + "autoscaling:CreateAutoScalingGroup", + "autoscaling:UpdateAutoScalingGroup", + "autoscaling:CreateOrUpdateTags", + "autoscaling:StartInstanceRefresh", + "autoscaling:DeleteAutoScalingGroup", + "autoscaling:DeleteTags" + ], + "Resource": "arn:aws:autoscaling:*:${AWS_ACCOUNT_ID}:autoScalingGroup:*:autoScalingGroupName/*" } ] } From c448b7653c5c73182d0b2abae8350311eecd54e2 Mon Sep 17 00:00:00 2001 From: iamjanr Date: Thu, 7 May 2026 12:48:14 +0200 Subject: [PATCH 05/10] [PLT-3962] Add mp_role_name support and update CloudFormation stack config for MachinePools - Add MPRoleName field to AWSCP struct to pass pre-existing IAM role to AWSManagedMachinePools - Add managedMachinePool block to AWSIAMConfiguration: creates eks-nodegroup role with AmazonEBSCSIDriverPolicy when create_iam: true - Add .claude/ to .gitignore --- .gitignore | 2 +- pkg/cluster/internal/create/actions/createworker/aws.go | 4 ++++ pkg/commons/cluster.go | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8b8be43de1..1ae9dc33c8 100644 --- a/.gitignore +++ b/.gitignore @@ -35,4 +35,4 @@ deploy-recorder.lst # Test outputs /bin/all-junit.xml -/bin/gotestsum \ No newline at end of file +/bin/gotestsum.claude/ diff --git a/pkg/cluster/internal/create/actions/createworker/aws.go b/pkg/cluster/internal/create/actions/createworker/aws.go index ffd109eca9..1837b08f8d 100644 --- a/pkg/cluster/internal/create/actions/createworker/aws.go +++ b/pkg/cluster/internal/create/actions/createworker/aws.go @@ -240,6 +240,10 @@ spec: iamRoleCreation: false defaultControlPlaneRole: disable: false + managedMachinePool: + disable: false + extraPolicyAttachments: + - arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy controlPlane: enableCSIPolicy: true nodes: diff --git a/pkg/commons/cluster.go b/pkg/commons/cluster.go index 7739bf8f12..aae6e994be 100644 --- a/pkg/commons/cluster.go +++ b/pkg/commons/cluster.go @@ -267,6 +267,7 @@ type Subnets struct { type AWSCP struct { AssociateOIDCProvider bool `yaml:"associate_oidc_provider,omitempty" validate:"boolean"` EncryptionKey string `yaml:"encryption_key,omitempty"` + MPRoleName string `yaml:"mp_role_name,omitempty"` Logging struct { ApiServer bool `yaml:"api_server" validate:"boolean"` Audit bool `yaml:"audit" validate:"boolean"` From 919fc33a1218a2be79c8844ea3b911fa2d521080 Mon Sep 17 00:00:00 2001 From: iamjanr Date: Wed, 13 May 2026 10:08:00 +0200 Subject: [PATCH 06/10] =?UTF-8?q?[PLT-3962]=20Add=20migrate-workers-to-mac?= =?UTF-8?q?hinepool=20script=20for=20MD=E2=86=92MP=20migration?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new script to assist migration of existing EKS clusters from MachineDeployments to MachinePools. The script: - Validates prerequisites (provider, CAPA >= v2.9.2, CO >= 0.6.1, status.ready) - Patches CAPA feature gates (MachinePool=true, EKSAllowAddRoles=true) idempotently - Updates cluster-operator to the target version - Provides a --check-ready assistant mode that validates MP capacity and prints drain commands for the equivalent MD worker (client executes manually) --- .../scripts/migrate-workers-to-machinepool.py | 433 ++++++++++++++++++ 1 file changed, 433 insertions(+) create mode 100644 upgrade/resources/scripts/migrate-workers-to-machinepool.py diff --git a/upgrade/resources/scripts/migrate-workers-to-machinepool.py b/upgrade/resources/scripts/migrate-workers-to-machinepool.py new file mode 100644 index 0000000000..046d0915a8 --- /dev/null +++ b/upgrade/resources/scripts/migrate-workers-to-machinepool.py @@ -0,0 +1,433 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +############################################################## +# Author: Stratio Clouds # +# Supported provisioner versions: 0.17.0-0.8.X # +# Supported cloud providers: # +# - EKS (AWS managed) # +############################################################## + +__version__ = "0.1.0" + +import argparse +import json +import subprocess +import sys +import time + +CAPA_NAMESPACE = "capa-system" +CAPA_DEPLOYMENT = "capa-controller-manager" +CAPA_CONTAINER_INDEX = 0 +KEOS_CLUSTER_NAMESPACE_PREFIX = "cluster-" + +# Minimum versions required to run this migration +MIN_CLUSTER_OPERATOR_VERSION = "0.6.1" +MIN_CAPA_VERSION = "v2.9.2" + +# Feature gates required for MachinePool support +REQUIRED_FEATURE_GATES = { + "MachinePool": "true", + "EKSAllowAddRoles": "true", +} + +kubectl = "kubectl" + + +# --------------------------------------------------------------------------- +# Utilities +# --------------------------------------------------------------------------- + +def run_command(command, allow_errors=False): + '''Run a shell command, return (output, returncode).''' + status, output = subprocess.getstatusoutput(command) + if status != 0 and not allow_errors: + print("FAILED") + print(f"[ERROR] {output}") + sys.exit(1) + return output, status + + +def _version_gte(version, minimum): + '''Return True if version >= minimum, comparing semver-style (strips leading "v").''' + def parse(v): + return [int(x) for x in v.lstrip("v").split(".")[:3] if x.isdigit()] + try: + return parse(version) >= parse(minimum) + except Exception: + return False + + +def execute_command(command, dry_run, print_result=True): + '''Execute a command respecting dry-run mode.''' + if dry_run: + if print_result: + print("DRY-RUN") + return "" + output, _ = run_command(command) + if print_result: + print("OK") + return output + + +# --------------------------------------------------------------------------- +# S4 — Prerequisite validation +# --------------------------------------------------------------------------- + +def validate_prerequisites(dry_run): + '''Validate that the cluster is ready for MachinePool migration.''' + + print("[INFO] Validating prerequisites...") + + # 1. Detect provider and managed mode + print("[INFO] Checking cluster provider:", end=" ", flush=True) + cmd = kubectl + " get keoscluster -A -o jsonpath='{.items[0].spec.infra_provider}'" + provider, _ = run_command(cmd, allow_errors=True) + provider = provider.strip().strip("'") + if provider != "aws": + print(f"FAILED\n[ERROR] Provider '{provider}' is not supported. Only 'aws' is supported.") + sys.exit(1) + print(f"OK ({provider})") + + print("[INFO] Checking managed control plane:", end=" ", flush=True) + cmd = kubectl + " get keoscluster -A -o jsonpath='{.items[0].spec.control_plane.managed}'" + managed, _ = run_command(cmd, allow_errors=True) + managed = managed.strip().strip("'") + if managed != "true": + print("FAILED\n[ERROR] Only EKS managed clusters (control_plane.managed=true) are supported.") + sys.exit(1) + print("OK") + + # 2. Check cluster-operator minimum version + print(f"[INFO] Checking cluster-operator version (>= {MIN_CLUSTER_OPERATOR_VERSION}):", end=" ", flush=True) + cmd = kubectl + " get deployment keoscluster-controller-manager -n kube-system -o jsonpath='{.spec.template.spec.containers[0].image}'" + co_image, _ = run_command(cmd, allow_errors=True) + co_image = co_image.strip().strip("'") + co_version = co_image.split(":")[-1] if ":" in co_image else "" + if not co_version: + print(f"FAILED\n[ERROR] Could not determine cluster-operator version from image '{co_image}'.") + sys.exit(1) + if not _version_gte(co_version, MIN_CLUSTER_OPERATOR_VERSION): + print(f"FAILED\n[ERROR] cluster-operator version '{co_version}' is below minimum '{MIN_CLUSTER_OPERATOR_VERSION}'. " + "Run upgrade-provisioner.py first.") + sys.exit(1) + print(f"OK ({co_version})") + + # 3. Check CAPA minimum version + print(f"[INFO] Checking CAPA version (>= {MIN_CAPA_VERSION}):", end=" ", flush=True) + cmd = kubectl + f" get deployment {CAPA_DEPLOYMENT} -n {CAPA_NAMESPACE} -o jsonpath='{{.spec.template.spec.containers[0].image}}'" + capa_image, _ = run_command(cmd, allow_errors=True) + capa_image = capa_image.strip().strip("'") + capa_version = capa_image.split(":")[-1] if ":" in capa_image else "" + # Normalize: strip any suffix after the semver (e.g. v2.9.2-keos.1 → v2.9.2) + capa_semver = capa_version.split("-")[0] if "-" in capa_version else capa_version + if not capa_semver: + print(f"FAILED\n[ERROR] Could not determine CAPA version from image '{capa_image}'.") + sys.exit(1) + if not _version_gte(capa_semver, MIN_CAPA_VERSION): + print(f"FAILED\n[ERROR] CAPA version '{capa_version}' is below minimum '{MIN_CAPA_VERSION}'. " + "Run upgrade-provisioner.py first to upgrade CAPA.") + sys.exit(1) + print(f"OK ({capa_version})") + + # 4. Check KeosCluster status.ready + print("[INFO] Checking KeosCluster status.ready:", end=" ", flush=True) + cmd = kubectl + " get keoscluster -A -o jsonpath='{.items[0].status.ready}'" + ready, _ = run_command(cmd, allow_errors=True) + ready = ready.strip().strip("'") + if ready != "true": + print(f"FAILED\n[ERROR] KeosCluster status.ready={ready}. " + "Resolve any pending reconciliation before migrating.") + sys.exit(1) + print("OK") + + print("[INFO] All prerequisites satisfied.") + + +# --------------------------------------------------------------------------- +# S2 — Patch CAPA feature gates +# --------------------------------------------------------------------------- + +def _get_capa_args(): + '''Return the current args list of the CAPA manager container.''' + cmd = (kubectl + f" get deployment {CAPA_DEPLOYMENT} -n {CAPA_NAMESPACE}" + f" -o jsonpath='{{.spec.template.spec.containers[{CAPA_CONTAINER_INDEX}].args}}'") + output, _ = run_command(cmd) + return json.loads(output) + + +def _find_feature_gates_index(args): + '''Return the index of the --feature-gates arg, or -1 if not found.''' + for i, arg in enumerate(args): + if arg.startswith("--feature-gates="): + return i + return -1 + + +def _parse_feature_gates(arg): + '''Parse "--feature-gates=K=V,K=V,..." into a dict.''' + raw = arg.split("=", 1)[1] + result = {} + for pair in raw.split(","): + k, v = pair.split("=", 1) + result[k.strip()] = v.strip() + return result + + +def _build_feature_gates_arg(gates): + '''Reconstruct "--feature-gates=..." from a dict, preserving original key order.''' + pairs = ",".join(f"{k}={v}" for k, v in gates.items()) + return f"--feature-gates={pairs}" + + +def patch_capa_feature_gates(dry_run): + '''Enable MachinePool and EKSAllowAddRoles feature gates in CAPA, idempotently.''' + + print("[INFO] Checking CAPA feature gates:", end=" ", flush=True) + + args = _get_capa_args() + idx = _find_feature_gates_index(args) + + if idx == -1: + print("FAILED\n[ERROR] --feature-gates argument not found in CAPA deployment.") + sys.exit(1) + + gates = _parse_feature_gates(args[idx]) + + # Check if already correct — idempotent + already_set = all(gates.get(k) == v for k, v in REQUIRED_FEATURE_GATES.items()) + if already_set: + print("OK (already set, SKIP)") + return + + # Apply required gates + gates.update(REQUIRED_FEATURE_GATES) + new_arg = _build_feature_gates_arg(gates) + + patch = json.dumps([{ + "op": "replace", + "path": f"/spec/template/spec/containers/{CAPA_CONTAINER_INDEX}/args/{idx}", + "value": new_arg + }]) + + print("") + print(f"[INFO] Patching CAPA feature gates (MachinePool=true, EKSAllowAddRoles=true):", end=" ", flush=True) + cmd = (kubectl + f" patch deployment {CAPA_DEPLOYMENT} -n {CAPA_NAMESPACE}" + f" --type=json -p='{patch}'") + execute_command(cmd, dry_run) + + if not dry_run: + print("[INFO] Waiting for CAPA rollout:", end=" ", flush=True) + cmd = (kubectl + f" rollout status deployment/{CAPA_DEPLOYMENT}" + f" -n {CAPA_NAMESPACE} --timeout=3m") + execute_command(cmd, dry_run) + + # Verify + args_after = _get_capa_args() + gates_after = _parse_feature_gates(args_after[_find_feature_gates_index(args_after)]) + for k, v in REQUIRED_FEATURE_GATES.items(): + if gates_after.get(k) != v: + print(f"[ERROR] Verification failed: {k}={gates_after.get(k)} (expected {v})") + sys.exit(1) + print("[INFO] CAPA feature gates verified OK.") + + +# --------------------------------------------------------------------------- +# S3 — Update cluster-operator +# --------------------------------------------------------------------------- + +def update_cluster_operator(cluster_operator_version, dry_run): + '''Update cluster-operator HelmRelease and image tag ConfigMap, then wait.''' + + print(f"[INFO] Updating cluster-operator to {cluster_operator_version}...") + + # 1. Get current version from ConfigMap + cmd = (kubectl + " get configmap 00-cluster-operator-helm-chart-default-values" + " -n kube-system -o jsonpath='{.data.values\\.yaml}'") + values_yaml, _ = run_command(cmd, allow_errors=True) + + current_tag = None + for line in values_yaml.splitlines(): + if "tag:" in line: + current_tag = line.strip().split("tag:")[-1].strip() + break + + if current_tag == cluster_operator_version: + print(f"[INFO] cluster-operator already at {cluster_operator_version}: SKIP") + return + + print(f"[INFO] Updating image tag in ConfigMap ({current_tag} → {cluster_operator_version}):", end=" ", flush=True) + cmd = (kubectl + " get configmap 00-cluster-operator-helm-chart-default-values" + " -n kube-system -o json" + f" | python3 -c \"" + "import json,sys; cm=json.load(sys.stdin);" + f"cm['data']['values.yaml']=cm['data']['values.yaml'].replace('tag: {current_tag}','tag: {cluster_operator_version}');" + "print(json.dumps(cm))\"" + " | " + kubectl + " apply -f -") + execute_command(cmd, dry_run) + + # 2. Update HelmRelease chart version + print(f"[INFO] Updating HelmRelease chart version:", end=" ", flush=True) + cmd = (kubectl + " patch helmrelease cluster-operator -n kube-system" + f" --type=merge -p '{{\"spec\":{{\"chart\":{{\"spec\":{{\"version\":\"{cluster_operator_version}\"}}}}}}}}'") + execute_command(cmd, dry_run) + + # 3. Force reconciliation + print("[INFO] Forcing HelmRelease reconciliation:", end=" ", flush=True) + ts = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) + cmd = (kubectl + " annotate helmrelease cluster-operator -n kube-system" + f" reconcile.fluxcd.io/requestedAt={ts} --overwrite") + execute_command(cmd, dry_run) + + if not dry_run: + # 4. Wait for controller deployment + print("[INFO] Waiting for cluster-operator deployment:", end=" ", flush=True) + cmd = (kubectl + " wait deployment -n kube-system keoscluster-controller-manager" + " --for=condition=Available --timeout=5m") + execute_command(cmd, dry_run) + + # 5. Wait for KeosCluster ready + print("[INFO] Waiting for KeosCluster ready:", end=" ", flush=True) + cmd = (kubectl + " wait keoscluster -A" + " --for=jsonpath='{.status.ready}'=true --timeout=5m") + execute_command(cmd, dry_run) + + +# --------------------------------------------------------------------------- +# S5 — Check MP readiness and print drain commands +# --------------------------------------------------------------------------- + +def check_mp_ready(worker_mp_name): + ''' + Verify that a MachinePool worker has enough Ready nodes to absorb MD workloads, + then print the drain commands for the equivalent MD worker. + ''' + + print(f"[INFO] Checking MachinePool worker '{worker_mp_name}'...") + + # Find MachinePool objects for this worker + cmd = (kubectl + f" get machinepool -A -l cluster.x-k8s.io/deployment-name={worker_mp_name}" + " -o jsonpath='{.items[*].status.readyReplicas}'") + output, status = run_command(cmd, allow_errors=True) + if status != 0 or not output.strip(): + # Try by name prefix + cmd = (kubectl + f" get machinepool -A --no-headers" + f" | grep '{worker_mp_name}'") + output, status = run_command(cmd, allow_errors=True) + if not output.strip(): + print(f"[ERROR] No MachinePool found for worker '{worker_mp_name}'. " + "Make sure you have added the MP worker to the KeosCluster descriptor and it has been reconciled.") + sys.exit(1) + + # Get MP nodes + cmd = (kubectl + f" get nodes -l cluster.x-k8s.io/deployment-name={worker_mp_name}" + " --no-headers -o custom-columns=NAME:.metadata.name,STATUS:.status.conditions[-1].type,READY:.status.conditions[-1].status") + nodes_output, _ = run_command(cmd, allow_errors=True) + + ready_nodes = [] + for line in nodes_output.strip().splitlines(): + parts = line.split() + if len(parts) >= 3 and parts[1] == "Ready" and parts[2] == "True": + ready_nodes.append(parts[0]) + + if not ready_nodes: + print(f"[ERROR] No Ready nodes found for MachinePool '{worker_mp_name}'. " + "Wait for the MP nodes to be Ready before draining the MD.") + sys.exit(1) + + print(f"[INFO] MachinePool '{worker_mp_name}' has {len(ready_nodes)} Ready node(s): {', '.join(ready_nodes)}") + + # Infer equivalent MD name (convention: worker_mp_name with -mp suffix → strip -mp) + worker_md_name = worker_mp_name.replace("-mp", "-md") if "-mp" in worker_mp_name else worker_mp_name + "-md" + + # Get MD nodes + cmd = (kubectl + f" get nodes -l cluster.x-k8s.io/deployment-name={worker_md_name}" + " --no-headers -o custom-columns=NAME:.metadata.name") + md_nodes_output, status = run_command(cmd, allow_errors=True) + md_nodes = [l.strip() for l in md_nodes_output.strip().splitlines() if l.strip()] if status == 0 else [] + + if not md_nodes: + print(f"[WARN] No nodes found for MachineDeployment '{worker_md_name}'. " + "Verify the MD worker name manually.") + else: + print(f"[INFO] MachineDeployment '{worker_md_name}' has {len(md_nodes)} node(s) to drain.") + + print("") + print("=" * 70) + print(f" CAPACITY CHECK: {len(ready_nodes)} MP nodes Ready — proceed with drain? (verify manually)") + print("=" * 70) + print("") + print(" Step 1 — Drain each MD node (run one at a time, verify pods after each):") + print("") + for node in md_nodes: + print(f" kubectl drain {node} --ignore-daemonsets --delete-emptydir-data --timeout=10m") + print("") + print(" Step 2 — Remove the MD worker from the KeosCluster descriptor:") + print(f" Delete worker entry with name='{worker_md_name}' from spec.worker_nodes") + print(f" Then: kubectl patch keoscluster -n cluster- --type=merge \\") + print(f" -p '{{\"spec\":{{\"worker_nodes\": [") + print(" 4. Execute the printed drain commands manually, one node at a time.") + print("=" * 70) + print("") + print("RESULT: OK") + + +if __name__ == "__main__": + main() From add2cbcf712bc2745c8aede0668a80978880e303 Mon Sep 17 00:00:00 2001 From: iamjanr Date: Wed, 13 May 2026 10:25:55 +0200 Subject: [PATCH 07/10] [PLT-3962] Make --cluster-operator-version optional with default Default is now TARGET_CLUSTER_OPERATOR_VERSION (0.9.0-PR907-SNAPSHOT) so clients can run the migration script without specifying the version manually. --- .../resources/scripts/migrate-workers-to-machinepool.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/upgrade/resources/scripts/migrate-workers-to-machinepool.py b/upgrade/resources/scripts/migrate-workers-to-machinepool.py index 046d0915a8..5c2c6f3c78 100644 --- a/upgrade/resources/scripts/migrate-workers-to-machinepool.py +++ b/upgrade/resources/scripts/migrate-workers-to-machinepool.py @@ -21,7 +21,8 @@ CAPA_CONTAINER_INDEX = 0 KEOS_CLUSTER_NAMESPACE_PREFIX = "cluster-" -# Minimum versions required to run this migration +# Target and minimum versions for this migration +TARGET_CLUSTER_OPERATOR_VERSION = "0.9.0-PR907-SNAPSHOT" MIN_CLUSTER_OPERATOR_VERSION = "0.6.1" MIN_CAPA_VERSION = "v2.9.2" @@ -385,8 +386,8 @@ def parse_args(): help="Path to kubeconfig file. Can also be set via $KUBECONFIG.", default="~/.kube/config") parser.add_argument("--cluster-operator-version", - help="Target cluster-operator version (e.g. 0.8.0-PLT-3962-MP-01.15).", - required=True) + help="Target cluster-operator version. Defaults to the version bundled with this release.", + default=TARGET_CLUSTER_OPERATOR_VERSION) parser.add_argument("--dry-run", action="store_true", help="Print actions without executing them.") parser.add_argument("--check-ready", metavar="WORKER_MP_NAME", From 08d102996533faea5415933f54f046a700860cc0 Mon Sep 17 00:00:00 2001 From: iamjanr Date: Wed, 13 May 2026 10:28:05 +0200 Subject: [PATCH 08/10] [PLT-3962] Add pre-flight warning about image and chart availability Before running preparation mode, the script now prints a reminder to verify that the cluster-operator image and Helm chart are accessible in the cluster's configured registry and Helm repository. --- .../scripts/migrate-workers-to-machinepool.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/upgrade/resources/scripts/migrate-workers-to-machinepool.py b/upgrade/resources/scripts/migrate-workers-to-machinepool.py index 5c2c6f3c78..70f4868b9f 100644 --- a/upgrade/resources/scripts/migrate-workers-to-machinepool.py +++ b/upgrade/resources/scripts/migrate-workers-to-machinepool.py @@ -411,6 +411,16 @@ def main(): return # Mode: preparation (S4 → S2 → S3) + print("=" * 70) + print(" IMPORTANT: Before continuing, verify that the following artifacts") + print(f" are available in the registry/repository configured for this cluster:") + print(f" - cluster-operator image: {args.cluster_operator_version}") + print(f" - cluster-operator Helm chart: {args.cluster_operator_version}") + print("") + print(" If the cluster uses a private registry or private Helm repository,") + print(" ensure those artifacts have been pushed before running this script.") + print("=" * 70) + print("") validate_prerequisites(args.dry_run) patch_capa_feature_gates(args.dry_run) update_cluster_operator(args.cluster_operator_version, args.dry_run) From 0cd29cebace214a4d8d9a4cf543c7533b7d5df1d Mon Sep 17 00:00:00 2001 From: iamjanr Date: Wed, 13 May 2026 11:55:37 +0200 Subject: [PATCH 09/10] [PLT-3962] Update migrate script target version to 0.7.0-PR317-SNAPSHOT --- upgrade/resources/scripts/migrate-workers-to-machinepool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade/resources/scripts/migrate-workers-to-machinepool.py b/upgrade/resources/scripts/migrate-workers-to-machinepool.py index 70f4868b9f..f70ffe9c0d 100644 --- a/upgrade/resources/scripts/migrate-workers-to-machinepool.py +++ b/upgrade/resources/scripts/migrate-workers-to-machinepool.py @@ -22,7 +22,7 @@ KEOS_CLUSTER_NAMESPACE_PREFIX = "cluster-" # Target and minimum versions for this migration -TARGET_CLUSTER_OPERATOR_VERSION = "0.9.0-PR907-SNAPSHOT" +TARGET_CLUSTER_OPERATOR_VERSION = "0.7.0-PR317-SNAPSHOT" MIN_CLUSTER_OPERATOR_VERSION = "0.6.1" MIN_CAPA_VERSION = "v2.9.2" From d4358c28f6a73a567ec4aab386f9c2f90c20ad10 Mon Sep 17 00:00:00 2001 From: iamjanr Date: Wed, 13 May 2026 12:08:38 +0200 Subject: [PATCH 10/10] [PLT-3962] Add confirmation prompt before executing migration --- upgrade/resources/scripts/migrate-workers-to-machinepool.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/upgrade/resources/scripts/migrate-workers-to-machinepool.py b/upgrade/resources/scripts/migrate-workers-to-machinepool.py index f70ffe9c0d..70940a9a8f 100644 --- a/upgrade/resources/scripts/migrate-workers-to-machinepool.py +++ b/upgrade/resources/scripts/migrate-workers-to-machinepool.py @@ -421,6 +421,11 @@ def main(): print(" ensure those artifacts have been pushed before running this script.") print("=" * 70) print("") + answer = input("Have you verified the artifacts are available? [y/N] ").strip().lower() + if answer != "y": + print("[INFO] Aborted by user.") + sys.exit(0) + print("") validate_prerequisites(args.dry_run) patch_capa_feature_gates(args.dry_run) update_cluster_operator(args.cluster_operator_version, args.dry_run)