Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
---
# Single attempt to create a metal machine pool with specific instance type
# Includes detection of stuck provisioning and automatic cleanup/retry

# One provisioning attempt for the instance type held in the loop variable
# rosa_metal_current_instance_type.
#
# Variables read: rosa_binary_path, rosa_cluster_name, rosa_metal_pool_name,
#   rosa_metal_replicas, rosa_metal_disk_size, rosa_metal_availability_zone,
#   rosa_metal_provision_timeout.
# Facts set: rosa_metal_pool_create_time, and - only when the pool ends up
#   with the desired replica count - rosa_metal_successful_instance_type.
#
# Control flow:
#   * Every task inherits the outer `when`. As soon as
#     rosa_metal_successful_instance_type is set (in this or an earlier
#     iteration) all remaining tasks are skipped. `meta: end_play` is not
#     used because it would end the whole play, not just this attempt.
#   * Any failure is handled by the `rescue`, so the host is not left in a
#     failed state and an including loop can go on to the next instance type.
- name: Attempt metal machine pool with {{ rosa_metal_current_instance_type }}
  when: rosa_metal_successful_instance_type | default('') | length == 0
  block:
    - name: Check if metal machine pool already exists
      ansible.builtin.command: >-
        {{ rosa_binary_path }}/rosa describe machinepool
        --cluster {{ rosa_cluster_name }}
        --machinepool {{ rosa_metal_pool_name }}
        --output json
      register: r_existing_machinepool
      # A non-zero rc is treated as "pool does not exist", not as an error.
      failed_when: false
      changed_when: false

    - name: Skip if pool exists and is healthy
      when:
        - r_existing_machinepool.rc == 0
        # Both sides are cast: rosa_metal_replicas may arrive as a string.
        - >-
          (r_existing_machinepool.stdout | from_json).status.current_replicas | default(0) | int
          == rosa_metal_replicas | int
      block:
        - name: Metal machine pool is already healthy
          ansible.builtin.debug:
            msg: >-
              Metal machine pool '{{ rosa_metal_pool_name }}' already exists and is healthy
              ({{ (r_existing_machinepool.stdout | from_json).status.current_replicas }}/{{ rosa_metal_replicas }} replicas)

        # Setting this fact turns the outer `when` false, which skips every
        # remaining task of this attempt.
        - name: Set successful instance type
          ansible.builtin.set_fact:
            rosa_metal_successful_instance_type: "{{ (r_existing_machinepool.stdout | from_json).aws_node_pool.instance_type }}"

    - name: Delete existing stuck/failed machine pool if it exists
      when: r_existing_machinepool.rc == 0
      ansible.builtin.command: >-
        {{ rosa_binary_path }}/rosa delete machinepool
        --cluster {{ rosa_cluster_name }}
        --machinepool {{ rosa_metal_pool_name }}
        --yes
      register: r_delete_machinepool

    - name: Wait for machine pool deletion to complete
      when: r_existing_machinepool.rc == 0
      ansible.builtin.pause:
        seconds: 10

    - name: Create metal machine pool with {{ rosa_metal_current_instance_type }}
      ansible.builtin.command: >-
        {{ rosa_binary_path }}/rosa create machinepool
        --cluster {{ rosa_cluster_name }}
        --name {{ rosa_metal_pool_name }}
        --instance-type {{ rosa_metal_current_instance_type }}
        --replicas {{ rosa_metal_replicas }}
        --disk-size {{ rosa_metal_disk_size }}
        {% if rosa_metal_availability_zone | default('') | length > 0 %}--availability-zone {{ rosa_metal_availability_zone }}{% endif %}
        --yes
        --output json
      register: r_create_machinepool

    # now() is evaluated when this task runs; ansible_date_time.epoch only
    # reflects the moment facts were gathered.
    - name: Record machine pool creation start time
      ansible.builtin.set_fact:
        rosa_metal_pool_create_time: "{{ now().timestamp() | int }}"

    - name: Wait and check if instances are provisioning
      ansible.builtin.command: >-
        {{ rosa_binary_path }}/rosa describe machinepool
        --cluster {{ rosa_cluster_name }}
        --machinepool {{ rosa_metal_pool_name }}
        --output json
      register: r_machinepool_status
      until:
        - r_machinepool_status.rc == 0
        - (r_machinepool_status.stdout | from_json).status.current_replicas | default(0) | int > 0
      retries: "{{ (rosa_metal_provision_timeout | int / 30) | int }}"
      delay: 30
      # Running out of retries is evaluated by the next task, not here.
      failed_when: false
      changed_when: false

    - name: Check if machine pool is stuck (0 replicas after timeout)
      when: _rosa_metal_seen_replicas | int == 0
      vars:
        # A describe that still fails after all retries counts as 0 replicas,
        # so stdout is only parsed when the command succeeded.
        _rosa_metal_seen_replicas: >-
          {{ (r_machinepool_status.stdout | from_json).status.current_replicas | default(0)
          if r_machinepool_status.rc == 0 else 0 }}
      block:
        - name: Machine pool stuck with {{ rosa_metal_current_instance_type }}
          ansible.builtin.debug:
            msg: >-
              Metal machine pool failed to provision any instances with {{ rosa_metal_current_instance_type }}.
              Current replicas: {{ _rosa_metal_seen_replicas }}
              Desired replicas: {{ rosa_metal_replicas }}
              This is likely due to AWS capacity constraints.

        - name: Delete failed machine pool
          ansible.builtin.command: >-
            {{ rosa_binary_path }}/rosa delete machinepool
            --cluster {{ rosa_cluster_name }}
            --machinepool {{ rosa_metal_pool_name }}
            --yes

        - name: Wait for deletion
          ansible.builtin.pause:
            seconds: 10

        # Handled by the rescue below; ends this attempt only.
        - name: Fail this attempt to try next instance type
          ansible.builtin.fail:
            msg: "Instance type {{ rosa_metal_current_instance_type }} unavailable, trying next option..."

    - name: Wait for all replicas to provision
      ansible.builtin.command: >-
        {{ rosa_binary_path }}/rosa describe machinepool
        --cluster {{ rosa_cluster_name }}
        --machinepool {{ rosa_metal_pool_name }}
        --output json
      register: r_machinepool_final_status
      until:
        - r_machinepool_final_status.rc == 0
        - (r_machinepool_final_status.stdout | from_json).status.current_replicas | int == rosa_metal_replicas | int
      retries: 40
      delay: 30
      failed_when: false
      changed_when: false

    - name: Fail if not all replicas provisioned
      when: _rosa_metal_final_replicas | int != rosa_metal_replicas | int
      vars:
        # Same guard as above: never parse stdout of a failed describe.
        _rosa_metal_final_replicas: >-
          {{ (r_machinepool_final_status.stdout | from_json).status.current_replicas | default(0)
          if r_machinepool_final_status.rc == 0 else 0 }}
      block:
        - name: Not all replicas provisioned with {{ rosa_metal_current_instance_type }}
          ansible.builtin.debug:
            msg: >-
              Only {{ _rosa_metal_final_replicas }}/{{ rosa_metal_replicas }}
              replicas provisioned. Trying next instance type...

        - name: Delete partially successful machine pool
          ansible.builtin.command: >-
            {{ rosa_binary_path }}/rosa delete machinepool
            --cluster {{ rosa_cluster_name }}
            --machinepool {{ rosa_metal_pool_name }}
            --yes

        - name: Fail to try next instance type
          ansible.builtin.fail:
            msg: "Partial provisioning with {{ rosa_metal_current_instance_type }}, trying next..."

    # Only reached when the task above did not fail, i.e. the pool has the
    # desired number of replicas.
    - name: Check final status
      ansible.builtin.set_fact:
        rosa_metal_successful_instance_type: "{{ rosa_metal_current_instance_type }}"

  rescue:
    # Swallow the failure on purpose: success is reported solely through
    # rosa_metal_successful_instance_type, which stays unset here.
    - name: Attempt with {{ rosa_metal_current_instance_type }} did not succeed
      ansible.builtin.debug:
        msg: >-
          Attempt with {{ rosa_metal_current_instance_type }} failed:
          {{ ansible_failed_result.msg | default('no error message available') }}
14 changes: 14 additions & 0 deletions ansible/configs/rosa-consolidated/default_vars.yml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,20 @@ rosa_setup_cluster_admin_delete_after_workloads: false
# Replicas also can not be set when autoscaling is enabled
# rosa_compute_replicas: 2

# Metal machine pool settings (all optional)
# Switch for metal machine pool creation (only for HCP clusters)
rosa_metal_deploy: false
# First-choice metal instance type (user's choice from AgnosticV)
rosa_metal_instance_type: "i4i.metal"
# How many metal worker nodes to request
rosa_metal_replicas: 3
# Disk size for the metal nodes
rosa_metal_disk_size: "250GiB"
# Name of the machine pool
rosa_metal_pool_name: "metal"
# Availability zone; keep the empty string to use the cluster default
rosa_metal_availability_zone: ""

# Enable Autoscaling. Further autoscaling options are only used if true
# HCP does not support autoscaling. Turning it on will have no effect
# rosa_compute_enable_autoscaling: false
Expand Down
55 changes: 55 additions & 0 deletions ansible/configs/rosa-consolidated/fix_metal_machinepool.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
---
# Self-healing metal machine pool creation with instance type fallback
# This task file handles metal machine pool provisioning with automatic
# fallback to alternative instance types when capacity is unavailable

# Resolve every metal-pool setting into a fact, falling back to a built-in
# default for anything that is not defined yet.
- name: Set metal machine pool configuration
  ansible.builtin.set_fact:
    # Pool identity, size and placement
    rosa_metal_pool_name: '{{ rosa_metal_pool_name | default("metal") }}'
    rosa_metal_replicas: '{{ rosa_metal_replicas | default(3) }}'
    rosa_metal_disk_size: '{{ rosa_metal_disk_size | default("250GiB") }}'
    rosa_metal_availability_zone: '{{ rosa_metal_availability_zone | default("") }}'
    # Instance type the user asked for (set via AgnosticV ordering)
    rosa_metal_instance_type_preferred: '{{ rosa_metal_instance_type | default("i4i.metal") }}'
    # Alternatives to try when the preferred type is unavailable
    rosa_metal_instance_type_fallbacks: [i4i.metal, i3en.metal, i3.metal]
    # Seconds to wait for instances to start provisioning
    rosa_metal_provision_timeout: 300
    # Seconds after which a pool is considered stuck
    rosa_metal_stuck_threshold: 180

# Attempt order: the preferred type, followed by every fallback that is not
# the preferred type itself.
- name: Build list of instance types to try (preferred first, then fallbacks)
  ansible.builtin.set_fact:
    rosa_metal_instance_types_to_try: >-
      {{ [rosa_metal_instance_type_preferred]
         + (rosa_metal_instance_type_fallbacks
            | select('ne', rosa_metal_instance_type_preferred)
            | list) }}

# Log the computed attempt order.
- name: Display instance type attempt order
  ansible.builtin.debug:
    msg: >-
      Will attempt metal instance types in order: {{ rosa_metal_instance_types_to_try }}

# Runs create_metal_machinepool_attempt.yml once per candidate instance type.
# The attempt file reports success by setting
# rosa_metal_successful_instance_type. Any failure inside the block, including
# the explicit check after the loop, lands in the rescue, which fails the host
# with a summary of the types that were tried.
- name: Attempt to create metal machine pool with fallback logic
  block:
    - name: Try each instance type until one succeeds
      ansible.builtin.include_tasks: create_metal_machinepool_attempt.yml
      loop: "{{ rosa_metal_instance_types_to_try }}"
      loop_control:
        loop_var: rosa_metal_current_instance_type
        label: "{{ rosa_metal_current_instance_type }}"

    # Guard against the loop finishing without a recorded success, so a
    # later reference to the fact cannot hit an undefined variable.
    - name: Verify that an instance type was recorded as successful
      when: rosa_metal_successful_instance_type | default('') | length == 0
      ansible.builtin.fail:
        msg: "No metal instance type produced a healthy machine pool"

  rescue:
    - name: All metal instance types failed
      ansible.builtin.fail:
        msg: >-
          Failed to provision metal machine pool after trying all instance types:
          {{ rosa_metal_instance_types_to_try }}

# Report which instance type the machine pool ended up using.
- name: Record successful instance type
  ansible.builtin.debug:
    msg: >-
      ✅ Successfully provisioned metal machine pool using {{ rosa_metal_successful_instance_type }}
159 changes: 143 additions & 16 deletions ansible/configs/rosa-consolidated/install_rosa_hcp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -123,18 +123,135 @@
retries: 3
delay: 10

# Poll `rosa describe cluster` until the reported status.state is "ready"
# (up to 120 retries, 60 seconds apart).
- name: Wait for ROSA HCP installer completion
  ansible.builtin.command:
    cmd: >-
      {{ rosa_binary_path }}/rosa describe cluster
      --cluster {{ rosa_cluster_name }}
      --output json
  register: r_rosa_installer_status
  retries: 120
  delay: 60
  until:
    - (r_rosa_installer_status.stdout | from_json).status is defined
    - (r_rosa_installer_status.stdout | from_json).status.state is defined
    - (r_rosa_installer_status.stdout | from_json).status.state == "ready"
# Store the wall-clock time (epoch seconds) at which this task runs.
# now() is evaluated at task execution; ansible_date_time.epoch is a gathered
# fact that keeps the fact-gathering time and needs gather_facts enabled.
- name: Record cluster creation start time for stuck detection
  ansible.builtin.set_fact:
    rosa_install_start_time: "{{ now().timestamp() | int }}"

# Waits for the HCP installation to reach state "ready". If the wait times
# out while the cluster is still "installing" with zero compute nodes more
# than 45 minutes (2700 s) after rosa_install_start_time, the cluster is
# deleted and recreated once, then waited on again.
#
# Variables read: rosa_binary_path, rosa_cluster_name, rosa_install_start_time,
#   aws_region, rosa_subnets, aws_billing_account_id, rosa_oidc_id,
#   cloud_tags_list and the optional rosa_* sizing/CIDR settings.
# Facts set: _rosa_install_ready, _rosa_install_stuck, and during recovery
#   rosa_subnets (only if the original subnets are gone) and
#   rosa_install_start_time.
# Any failure inside the block is reported through the rescue.
- name: Wait for ROSA HCP installer completion with stuck detection
  block:
    - name: Check ROSA HCP cluster status
      ansible.builtin.command: >-
        {{ rosa_binary_path }}/rosa describe cluster
        --cluster {{ rosa_cluster_name }}
        --output json
      register: r_rosa_installer_status
      until:
        - r_rosa_installer_status.rc == 0
        - (r_rosa_installer_status.stdout | from_json).status is defined
        - (r_rosa_installer_status.stdout | from_json).status.state is defined
        - (r_rosa_installer_status.stdout | from_json).status.state == "ready"
      retries: 90
      delay: 60
      # Running out of retries is classified by the tasks below.
      failed_when: false
      changed_when: false

    - name: Check for provision errors
      when:
        # stdout is only parsed when the describe command succeeded.
        - r_rosa_installer_status.rc == 0
        - (r_rosa_installer_status.stdout | from_json).status.provision_error_message | default("") | length > 0
      ansible.builtin.fail:
        msg: "ROSA cluster provisioning failed: {{ (r_rosa_installer_status.stdout | from_json).status.provision_error_message }}"

    # The outcome is frozen into facts once. A `when` on a block is
    # re-evaluated for every task inside it, and the recovery block below both
    # resets rosa_install_start_time and re-registers r_rosa_installer_status,
    # so a live condition would change halfway through the recovery.
    # Elapsed time uses now(); ansible_date_time.epoch does not advance.
    - name: Classify the installer outcome
      ansible.builtin.set_fact:
        _rosa_install_ready: >-
          {{ r_rosa_installer_status.rc == 0
             and (r_rosa_installer_status.stdout | from_json).status.state | default('') == 'ready' }}
        _rosa_install_stuck: >-
          {{ r_rosa_installer_status.rc == 0
             and (r_rosa_installer_status.stdout | from_json).status.state | default('') == 'installing'
             and (r_rosa_installer_status.stdout | from_json).status.current_compute | default(0) | int == 0
             and (now().timestamp() | int - rosa_install_start_time | int) > 2700 }}

    # Without this, a timed-out wait that does not match the stuck pattern
    # would be ignored because of `failed_when: false` above.
    - name: Fail if the cluster is neither ready nor recoverable
      when:
        - not (_rosa_install_ready | bool)
        - not (_rosa_install_stuck | bool)
      ansible.builtin.fail:
        msg: "ROSA cluster did not reach 'ready' state and does not match the stuck-installation pattern"

    - name: Detect and recover from stuck installation
      when: _rosa_install_stuck | bool
      block:
        - name: Log stuck cluster detection
          ansible.builtin.debug:
            msg: "Cluster stuck in 'installing' state for >45 min with 0 worker nodes. Attempting recovery..."

        - name: Delete stuck cluster
          ansible.builtin.command: >-
            {{ rosa_binary_path }}/rosa delete cluster
            --cluster {{ rosa_cluster_name }}
            --yes

        - name: Wait for cluster deletion
          ansible.builtin.command: >-
            {{ rosa_binary_path }}/rosa list clusters --output json
          register: r_list_clusters
          until: >-
            (r_list_clusters.stdout | from_json | selectattr('name', 'equalto', rosa_cluster_name) | list | length) == 0
          retries: 30
          delay: 10
          changed_when: false

        # The AWS CLI takes --subnet-ids as space-separated values, while
        # rosa_subnets is used comma-separated for `rosa create cluster`.
        # NOTE(review): assumes rosa_subnets is a comma-separated string -
        # confirm against where it is first set.
        - name: Verify subnets still exist after deletion
          ansible.builtin.command:
            cmd: >-
              aws ec2 describe-subnets
              --region {{ aws_region }}
              --subnet-ids {{ rosa_subnets | replace(',', ' ') }}
              --query 'Subnets[].SubnetId'
              --output text
          register: r_verify_subnets
          # A non-zero rc is used as the signal that the subnets are gone.
          failed_when: false
          changed_when: false

        - name: Get current available subnets if original were deleted
          when: r_verify_subnets.rc != 0
          ansible.builtin.command:
            cmd: >-
              aws ec2 describe-subnets
              --region {{ aws_region }}
              --filters 'Name=tag:Name,Values=*{{ rosa_cluster_name }}*'
              --query 'Subnets[?MapPublicIpOnLaunch==`false`].SubnetId'
              --output text
          register: r_current_subnets
          changed_when: false

        - name: Update subnet fact if needed
          when:
            - r_verify_subnets.rc != 0
            - r_current_subnets.stdout | length > 0
          ansible.builtin.set_fact:
            rosa_subnets: "{{ r_current_subnets.stdout.split() | join(',') }}"

        - name: Recreate cluster after stuck detection
          ansible.builtin.command: >-
            {{ rosa_binary_path }}/rosa create cluster
            --cluster-name {{ rosa_cluster_name }}
            --billing-account {{ aws_billing_account_id }}
            --sts
            --mode auto
            --yes
            --hosted-cp
            --region {{ aws_region }}
            --operator-roles-prefix {{ rosa_cluster_name }}
            --oidc-config-id {{ rosa_oidc_id }}
            --subnet-ids {{ rosa_subnets }}
            --tags "{{ cloud_tags_list }}"
            {% if _rosa_version_to_install | default("") | length > 0 %}--version {{ _rosa_version_to_install }}{% endif %}
            {% if rosa_compute_machine_type is defined %}--compute-machine-type {{ rosa_compute_machine_type }}{% endif %}
            {% if rosa_compute_worker_disk_size is defined %}--worker-disk-size {{ rosa_compute_worker_disk_size }}{% endif %}
            {% if rosa_compute_replicas is defined %}--replicas {{ rosa_compute_replicas | int }}{% endif %}
            {% if rosa_machine_cidr | default("") | length > 0 %}--machine-cidr {{ rosa_machine_cidr }}{% endif %}
            {% if rosa_service_cidr | default("") | length > 0 %}--service-cidr {{ rosa_service_cidr }}{% endif %}
            {% if rosa_pod_cidr | default("") | length > 0 %}--pod-cidr {{ rosa_pod_cidr }}{% endif %}
            {% if rosa_host_prefix | default("") | length > 0 %}--host-prefix {{ rosa_host_prefix | int }}{% endif %}
          register: r_rosa_recreate_status
          until: r_rosa_recreate_status.rc == 0
          retries: 3
          delay: 10

        - name: Reset start time for recreated cluster
          ansible.builtin.set_fact:
            rosa_install_start_time: "{{ now().timestamp() | int }}"

        # No failed_when override here: a timeout on the recreated cluster
        # fails the task and is reported by the rescue.
        - name: Wait for recreated cluster to be ready
          ansible.builtin.command: >-
            {{ rosa_binary_path }}/rosa describe cluster
            --cluster {{ rosa_cluster_name }}
            --output json
          register: r_rosa_installer_status
          until:
            - r_rosa_installer_status.rc == 0
            - (r_rosa_installer_status.stdout | from_json).status.state == "ready"
          retries: 60
          delay: 60
          changed_when: false

  rescue:
    # Include the failed task and its message so the real cause is kept.
    - name: Failed to install cluster after recovery attempts
      ansible.builtin.fail:
        msg: >-
          ROSA HCP cluster installation failed after stuck detection and recovery attempt
          (failed task: {{ ansible_failed_task.name | default('unknown') }};
          error: {{ ansible_failed_result.msg | default('see the failed task output') }})

# HCP install finishes well before all the cluster operators have
# rolled out. Need to wait for the console to be available before
Expand All @@ -146,9 +263,19 @@
--output json
register: r_rosa_installer_status
until:
- (r_rosa_installer_status.stdout | from_json).api is defined
- (r_rosa_installer_status.stdout | from_json).api.url is defined
- (r_rosa_installer_status.stdout | from_json).console is defined
- (r_rosa_installer_status.stdout | from_json).console.url is defined
- (r_rosa_installer_status.stdout | from_json).api is defined
- (r_rosa_installer_status.stdout | from_json).api.url is defined
- (r_rosa_installer_status.stdout | from_json).console is defined
- (r_rosa_installer_status.stdout | from_json).console.url is defined
retries: 60
delay: 60

# Optional metal machine pool, switched on with rosa_metal_deploy=true.
# NOTE: The ocp4_workload_rosa_machinepool workload handles metal pools
# automatically with similar fallback logic. Enable this only when that
# workload is NOT used, or when the metal pool should be provisioned during
# install.
- name: Create metal machine pool if requested
  ansible.builtin.include_tasks: fix_metal_machinepool.yml
  when: rosa_metal_deploy | default(false) | bool
Loading
Loading