Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
260 changes: 220 additions & 40 deletions pkg/util/cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,9 @@ type Cluster struct {
}

const (
GenerateSubnetMaxTries = 100
localDefaultURL string = "https://localhost:8443"
GenerateSubnetMaxTries = 100
localDefaultURL string = "https://localhost:8443"
aroClusterIdentityOperatorName = "aro-Cluster"
)

func DefaultMasterVmSizes() []string {
Expand Down Expand Up @@ -402,30 +403,19 @@ func (c *Cluster) GetPlatformWIRoles() ([]api.PlatformWorkloadIdentityRole, erro
return nil, fmt.Errorf("workload identity role sets for version %s not found", c.Config.OSClusterVersion)
}

func (c *Cluster) SetupWorkloadIdentity(ctx context.Context, vnetResourceGroup string) error {
func (c *Cluster) SetupWorkloadIdentity(ctx context.Context, vnetResourceGroup string, diskEncryptionSetID string) error {
platformWorkloadIdentityRoles, err := c.GetPlatformWIRoles()
if err != nil {
return fmt.Errorf("failed parsing platformWI Roles: %w", err)
}

platformWorkloadIdentityRoles = append(platformWorkloadIdentityRoles, api.PlatformWorkloadIdentityRole{
OperatorName: "aro-Cluster",
OperatorName: aroClusterIdentityOperatorName,
RoleDefinitionID: "/providers/Microsoft.Authorization/roleDefinitions/ef318e2a-8334-4a05-9e4a-295a196c6a6e",
})

c.log.Info("Assigning role to mock msi client")
c.roleassignments.Create(
ctx,
fmt.Sprintf("/subscriptions/%s/resourceGroups/%s", c.Config.SubscriptionID, vnetResourceGroup),
uuid.DefaultGenerator.Generate(),
mgmtauthorization.RoleAssignmentCreateParameters{
RoleAssignmentProperties: &mgmtauthorization.RoleAssignmentProperties{
RoleDefinitionID: pointerutils.ToPtr("/providers/Microsoft.Authorization/roleDefinitions/ef318e2a-8334-4a05-9e4a-295a196c6a6e"),
PrincipalID: &c.Config.MockMSIObjectID,
PrincipalType: mgmtauthorization.ServicePrincipal,
},
},
)
// Create managed identities and store their resource IDs for later federated credential role assignment
operatorIdentities := make(map[string]string) // operatorName -> resourceID

for _, wi := range platformWorkloadIdentityRoles {
c.log.Infof("creating WI: %s", wi.OperatorName)
Expand All @@ -435,32 +425,222 @@ func (c *Cluster) SetupWorkloadIdentity(ctx context.Context, vnetResourceGroup s
if err != nil {
return err
}
_, err = c.roleassignments.Create(
ctx,
fmt.Sprintf("/subscriptions/%s/resourceGroups/%s", c.Config.SubscriptionID, vnetResourceGroup),
uuid.DefaultGenerator.Generate(),
mgmtauthorization.RoleAssignmentCreateParameters{
RoleAssignmentProperties: &mgmtauthorization.RoleAssignmentProperties{
RoleDefinitionID: &wi.RoleDefinitionID,
PrincipalID: resp.Properties.PrincipalID,
PrincipalType: mgmtauthorization.ServicePrincipal,
},
},
)

operatorIdentities[wi.OperatorName] = *resp.ID
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

resp is a struct, but its ID/Properties/Properties.PrincipalID are all pointers. Do we need to bother with a nil check on these?


// Determine required scopes based on role permissions
scopes, err := c.determineRequiredPlatformWorkloadIdentityScopes(ctx, wi.RoleDefinitionID, vnetResourceGroup, diskEncryptionSetID)
if err != nil {
Comment on lines +431 to 433
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SetupWorkloadIdentity and determineRequiredPlatformWorkloadIdentityScopes introduce new logic for computing scopes and performing per-scope role assignments, but there are no unit tests covering scope determination and the resulting role assignment behavior. Consider adding tests (e.g., mocking RoleDefinitionsClient and asserting the expected scopes for representative roles) to prevent regressions as role definitions evolve.

Copilot uses AI. Check for mistakes.
return err
}
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If determineRequiredPlatformWorkloadIdentityScopes returns an empty slice, the subsequent loop will skip role assignment entirely (no error), leaving the managed identity without required permissions and likely causing later auth failures. It would be safer to either return an error when no scopes can be derived, or fall back to a known-safe scope set (e.g. the documented minimal resource scopes).

Suggested change
}
}
if len(scopes) == 0 {
return fmt.Errorf("no platform workload identity scopes could be determined for operator %s and role %s", wi.OperatorName, wi.RoleDefinitionID)
}

Copilot uses AI. Check for mistakes.

if wi.OperatorName != "aro-Cluster" {
// Assign role to each determined scope
for _, scope := range scopes {
c.log.Infof("assigning role %s to scope %s for principal %s", wi.RoleDefinitionID, scope, *resp.Properties.PrincipalID)
for i := 0; i < 5; i++ {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see this loop duplicated a few times - can we make it a function?

_, err := c.roleassignments.Create(
ctx,
scope,
uuid.DefaultGenerator.Generate(),
mgmtauthorization.RoleAssignmentCreateParameters{
RoleAssignmentProperties: &mgmtauthorization.RoleAssignmentProperties{
RoleDefinitionID: &wi.RoleDefinitionID,
PrincipalID: resp.Properties.PrincipalID,
PrincipalType: mgmtauthorization.ServicePrincipal,
},
},
)

// Ignore if the role assignment already exists
if detailedError, ok := err.(autorest.DetailedError); ok {
if detailedError.StatusCode == http.StatusConflict {
err = nil
}
}

if err != nil && i < 4 {
// Sometimes we see HashConflictOnDifferentRoleAssignmentIds.
// Retry a few times.
c.log.Print(err)
continue
}
if err != nil {
return fmt.Errorf("failed to assign role to scope %s: %w", scope, err)
}

break
}
}

if wi.OperatorName != aroClusterIdentityOperatorName {
c.workloadIdentities[wi.OperatorName] = api.PlatformWorkloadIdentity{
ResourceID: *resp.ID,
}
}
}

// Assign federated credential role from aro-Cluster identity to each operator identity
_, ok := operatorIdentities[aroClusterIdentityOperatorName]
if !ok {
return fmt.Errorf("%s identity not found", aroClusterIdentityOperatorName)
}

// Get the aro-Cluster identity to get its principalID
aroClusterIdentity, err := c.msiClient.Get(ctx, vnetResourceGroup, aroClusterIdentityOperatorName, nil)
if err != nil {
return fmt.Errorf("failed to get %s identity: %w", aroClusterIdentityOperatorName, err)
}
Comment on lines +482 to +492
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

idea: instead of making a call to CREATE this identity above, then GET it here - can we preserve the data on CREATE? We should have it above as resp.Properties.PrincipalID, right?


c.log.Infof("Assigning federated credential role from %s to operator identities", aroClusterIdentityOperatorName)
for operatorName, operatorResourceID := range operatorIdentities {
if operatorName == aroClusterIdentityOperatorName {
continue // Don't assign to itself
}

for i := 0; i < 5; i++ {
_, err := c.roleassignments.Create(
ctx,
operatorResourceID, // Scope to the operator's managed identity resource
uuid.DefaultGenerator.Generate(),
mgmtauthorization.RoleAssignmentCreateParameters{
RoleAssignmentProperties: &mgmtauthorization.RoleAssignmentProperties{
RoleDefinitionID: pointerutils.ToPtr("/providers/Microsoft.Authorization/roleDefinitions/" + rbac.RoleAzureRedHatOpenShiftFederatedCredentialRole),
PrincipalID: aroClusterIdentity.Properties.PrincipalID,
PrincipalType: mgmtauthorization.ServicePrincipal,
},
},
)

// Ignore if the role assignment already exists
if detailedError, ok := err.(autorest.DetailedError); ok {
if detailedError.StatusCode == http.StatusConflict {
err = nil
}
}

if err != nil && i < 4 {
// Sometimes we see HashConflictOnDifferentRoleAssignmentIds.
// Retry a few times.
c.log.Print(err)
continue
}
if err != nil {
return fmt.Errorf("failed to assign federated credential role to %s: %w", operatorName, err)
}

break
}
}

// Also assign federated credential role to mock MSI if configured (for testing)
if c.Config.MockMSIObjectID != "" {
c.log.Info("Assigning federated credential role to mock msi client")
for operatorName, operatorResourceID := range operatorIdentities {
if operatorName == aroClusterIdentityOperatorName {
continue
}

for i := 0; i < 5; i++ {
_, err := c.roleassignments.Create(
ctx,
operatorResourceID,
uuid.DefaultGenerator.Generate(),
mgmtauthorization.RoleAssignmentCreateParameters{
RoleAssignmentProperties: &mgmtauthorization.RoleAssignmentProperties{
RoleDefinitionID: pointerutils.ToPtr("/providers/Microsoft.Authorization/roleDefinitions/" + rbac.RoleAzureRedHatOpenShiftFederatedCredentialRole),
PrincipalID: &c.Config.MockMSIObjectID,
PrincipalType: mgmtauthorization.ServicePrincipal,
},
},
)

// Ignore if the role assignment already exists
if detailedError, ok := err.(autorest.DetailedError); ok {
if detailedError.StatusCode == http.StatusConflict {
err = nil
}
}

if err != nil && i < 4 {
// Sometimes we see HashConflictOnDifferentRoleAssignmentIds.
// Retry a few times.
c.log.Print(err)
continue
}
if err != nil {
return fmt.Errorf("failed to assign federated credential role to mock MSI for %s: %w", operatorName, err)
}

break
}
}
}

return nil
}

// determineRequiredPlatformWorkloadIdentityScopes analyzes a role definition's permissions
// and returns the list of resource scopes where the role should be assigned
func (c *Cluster) determineRequiredPlatformWorkloadIdentityScopes(ctx context.Context, roleDefinitionID string, vnetResourceGroup string, diskEncryptionSetID string) ([]string, error) {
// Extract the role GUID from the full resource ID
parts := strings.Split(roleDefinitionID, "/")
if len(parts) < 2 {
return nil, fmt.Errorf("invalid role definition ID format: %s", roleDefinitionID)
}
roleGUID := parts[len(parts)-1]

// Get the role definition to check its permissions
subscriptionScope := fmt.Sprintf("/subscriptions/%s", c.Config.SubscriptionID)
filter := fmt.Sprintf("roleName eq '%s'", roleGUID)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bug (E2E caught it too): need to use the Display Name rather than GUID here.

Alternatively, use Get or GetByID

roleDefs, err := c.roledefinitions.List(ctx, subscriptionScope, filter)
if err != nil {
return nil, fmt.Errorf("failed to list role definition %s: %w", roleDefinitionID, err)
}

if len(roleDefs) == 0 {
return nil, fmt.Errorf("role definition %s not found", roleDefinitionID)
}

roleDef := roleDefs[0]

Comment on lines +592 to +605
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The RoleDefinitions list filter is using roleName eq '<guid>', but roleName is the human-readable role name (e.g. "Reader"), not the role definition GUID from the ID. With the current code this will typically return an empty list and cause WI setup to fail with "role definition ... not found". Consider retrieving the role definition by ID (RoleDefinitionsClient.Get) or listing without this filter and matching by the role definition name/ID suffix.

Suggested change
// Get the role definition to check its permissions
subscriptionScope := fmt.Sprintf("/subscriptions/%s", c.Config.SubscriptionID)
filter := fmt.Sprintf("roleName eq '%s'", roleGUID)
roleDefs, err := c.roledefinitions.List(ctx, subscriptionScope, filter)
if err != nil {
return nil, fmt.Errorf("failed to list role definition %s: %w", roleDefinitionID, err)
}
if len(roleDefs) == 0 {
return nil, fmt.Errorf("role definition %s not found", roleDefinitionID)
}
roleDef := roleDefs[0]
// Get the role definition to check its permissions. The roleDefinitionID contains
// the role definition resource ID / GUID, while the RoleDefinitions list filter's
// roleName field expects the human-readable role name (for example, "Reader").
// List without that filter and match the returned role definition by ID instead.
subscriptionScope := fmt.Sprintf("/subscriptions/%s", c.Config.SubscriptionID)
roleDefs, err := c.roledefinitions.List(ctx, subscriptionScope, "")
if err != nil {
return nil, fmt.Errorf("failed to list role definition %s: %w", roleDefinitionID, err)
}
var roleDef *mgmtauthorization.RoleDefinition
for i := range roleDefs {
if roleDefs[i].ID == nil {
continue
}
id := *roleDefs[i].ID
if strings.EqualFold(id, roleDefinitionID) || strings.HasSuffix(strings.ToLower(id), "/"+strings.ToLower(roleGUID)) {
roleDef = &roleDefs[i]
break
}
}
if roleDef == nil {
return nil, fmt.Errorf("role definition %s not found", roleDefinitionID)
}

Copilot uses AI. Check for mistakes.
// Build the list of scopes based on the role's permissions
var scopes []string

if roleDef.Permissions != nil {
for _, perm := range *roleDef.Permissions {
if perm.Actions == nil {
continue
}

for _, action := range *perm.Actions {
// Check for subnet-specific permissions
if strings.Contains(action, "Microsoft.Network/virtualNetworks/subnets/") {
// Assign to both master and worker subnets
masterSubnet := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/dev-vnet/subnets/%s-master", c.Config.SubscriptionID, vnetResourceGroup, c.Config.ClusterName)
workerSubnet := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/dev-vnet/subnets/%s-worker", c.Config.SubscriptionID, vnetResourceGroup, c.Config.ClusterName)
scopes = append(scopes, masterSubnet, workerSubnet)
Comment on lines +617 to +621
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The scope-building logic can append duplicates (e.g. if multiple permission blocks include subnet-related actions), leading to redundant role assignment calls. Consider de-duplicating scopes before returning/assigning to reduce API calls and avoid relying on conflict handling for idempotency.

Copilot uses AI. Check for mistakes.
break
}

// Check for VNet-level permissions
if strings.Contains(action, "Microsoft.Network/virtualNetworks/") && !strings.Contains(action, "Microsoft.Network/virtualNetworks/subnets/") {
vnetID := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/dev-vnet", c.Config.SubscriptionID, vnetResourceGroup)
scopes = append(scopes, vnetID)
break
}

// Check for DES permissions
if strings.Contains(action, "Microsoft.Compute/diskEncryptionSets") {
scopes = append(scopes, diskEncryptionSetID)
break
}
}
}
}

return scopes, nil
Comment on lines +615 to +641
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens if none of the cases in the for _, action loop land, and we return [], nil here? Is that possible? Should we log it or return an error instead?

}

func (c *Cluster) Create(ctx context.Context) error {
c.log.Info("Creating cluster")
clusterGet, err := c.openshiftclusters.Get(ctx, c.Config.VnetResourceGroup, c.Config.ClusterName)
Expand Down Expand Up @@ -622,13 +802,6 @@ func (c *Cluster) Create(ctx context.Context) error {
diskEncryptionSetName,
)

if c.Config.UseWorkloadIdentity {
c.log.Info("creating WIs")
if err := c.SetupWorkloadIdentity(ctx, c.Config.VnetResourceGroup); err != nil {
return fmt.Errorf("error setting up Workload Identity Roles: %w", err)
}
}

principalIds := []string{
c.Config.FPServicePrincipalID,
}
Expand All @@ -645,6 +818,13 @@ func (c *Cluster) Create(ctx context.Context) error {
return err
}

if c.Config.UseWorkloadIdentity {
c.log.Info("creating WIs")
if err := c.SetupWorkloadIdentity(ctx, c.Config.VnetResourceGroup, diskEncryptionSetID); err != nil {
return fmt.Errorf("error setting up Workload Identity Roles: %w", err)
}
}

fipsMode := c.Config.IsCI || !c.Config.IsLocalDevelopmentMode()

// Don't install with FIPS in a local dev, non-CI environment
Expand Down Expand Up @@ -869,7 +1049,7 @@ func (c *Cluster) deleteWI(ctx context.Context, resourceGroup string) error {
return fmt.Errorf("failure parsing Platform WI Roles, unable to remove them: %w", err)
}
platformWorkloadIdentityRoles = append(platformWorkloadIdentityRoles, api.PlatformWorkloadIdentityRole{
OperatorName: "aro-Cluster",
OperatorName: aroClusterIdentityOperatorName,
RoleDefinitionID: "/providers/Microsoft.Authorization/roleDefinitions/ef318e2a-8334-4a05-9e4a-295a196c6a6e",
})
for _, wi := range platformWorkloadIdentityRoles {
Expand Down Expand Up @@ -943,7 +1123,7 @@ func (c *Cluster) createCluster(ctx context.Context, vnetResourceGroup, clusterN
Type: api.ManagedServiceIdentityUserAssigned,
TenantID: c.Config.TenantID,
UserAssignedIdentities: map[string]api.UserAssignedIdentity{
fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.ManagedIdentity/userAssignedIdentities/%s", c.Config.SubscriptionID, vnetResourceGroup, "aro-Cluster"): {},
fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.ManagedIdentity/userAssignedIdentities/%s", c.Config.SubscriptionID, vnetResourceGroup, aroClusterIdentityOperatorName): {},
},
}
} else {
Expand Down Expand Up @@ -1323,7 +1503,7 @@ func (c *Cluster) deleteMiwiRoleAssignments(ctx context.Context, vnetResourceGro
return fmt.Errorf("failed to parse JSON: %w", err)
}
platformWorkloadIdentityRoles := append(wiRoleSets[0].PlatformWorkloadIdentityRoles, api.PlatformWorkloadIdentityRole{
OperatorName: "aro-Cluster",
OperatorName: aroClusterIdentityOperatorName,
RoleDefinitionID: "/providers/Microsoft.Authorization/roleDefinitions/ef318e2a-8334-4a05-9e4a-295a196c6a6e",
})
for _, wi := range platformWorkloadIdentityRoles {
Expand Down
Loading