diff --git a/pkg/controllers/nodeclass/ami.go b/pkg/controllers/nodeclass/ami.go index 334a2c5ff000..cc964f07bb47 100644 --- a/pkg/controllers/nodeclass/ami.go +++ b/pkg/controllers/nodeclass/ami.go @@ -47,7 +47,7 @@ func NewAMIReconciler(provider amifamily.Provider) *AMI { func (a *AMI) Reconcile(ctx context.Context, nodeClass *v1.EC2NodeClass) (reconcile.Result, error) { amis, err := a.amiProvider.List(ctx, nodeClass) if err != nil { - if amifamily.IsAl2DeprecationError(err) || amifamily.IsWS2025UnsupportedVersionError(err) { + if amifamily.IsAl2DeprecationError(err) || amifamily.IsWS2025UnsupportedVersionError(err) || amifamily.IsAMINotFoundForAliasError(err) { nodeClass.StatusConditions().SetFalse(v1.ConditionTypeAMIsReady, "UnsupportedAlias", err.Error()) return reconcile.Result{}, reconcile.TerminalError(fmt.Errorf("getting amis, %w", err)) } diff --git a/pkg/controllers/nodeclass/ami_test.go b/pkg/controllers/nodeclass/ami_test.go index 06b3ba27c6e3..54b863eea203 100644 --- a/pkg/controllers/nodeclass/ami_test.go +++ b/pkg/controllers/nodeclass/ami_test.go @@ -618,6 +618,36 @@ var _ = Describe("NodeClass AMI Status Controller", func() { nodeClass = ExpectExists(ctx, env.Client, nodeClass) Expect(nodeClass.StatusConditions().IsTrue(v1.ConditionTypeAMIsReady)).To(BeFalse()) }) + DescribeTable("should set UnsupportedAlias condition when no AMIs are found for alias", func(alias string) { + awsEnv.SSMAPI.Parameters = map[string]string{"not-real": "not-real"} + + nodeClass.Spec.AMISelectorTerms = []v1.AMISelectorTerm{{Alias: alias}} + ExpectApplied(ctx, env.Client, nodeClass) + _ = ExpectObjectReconcileFailed(ctx, env.Client, controller, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + + Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeAMIsReady).IsFalse()).To(BeTrue()) + Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeAMIsReady).Reason).To(Equal("UnsupportedAlias")) + Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeAMIsReady).Message).To(ContainSubstring("failed to discover any AMIs for alias")) + }, + Entry("AL2023 with invalid version", "al2023@v20240101"), + Entry("Bottlerocket with invalid version", "bottlerocket@v20240101"), + Entry("Windows with invalid version", "windows2022@latest"), + ) + DescribeTable("should retry when SSM returns transient errors", func(alias string) { + awsEnv.SSMAPI.WantErr = fmt.Errorf("RequestLimitExceeded: Rate exceeded") + + nodeClass.Spec.AMISelectorTerms = []v1.AMISelectorTerm{{Alias: alias}} + ExpectApplied(ctx, env.Client, nodeClass) + _ = ExpectObjectReconcileFailed(ctx, env.Client, controller, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + + Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeAMIsReady).Reason).ToNot(Equal("UnsupportedAlias")) + }, + Entry("AL2023 with invalid version", "al2023@v20240101"), + Entry("Bottlerocket with invalid version", "bottlerocket@v20240101"), + Entry("Windows with invalid version", "windows2022@latest"), + ) Context("NodeClass AMI Status", func() { BeforeEach(func() { // Set time using the injectable/fake clock to now diff --git a/pkg/errors/errors.go b/pkg/errors/errors.go index 1f0ff0379cb9..bfaa5de59297 100644 --- a/pkg/errors/errors.go +++ b/pkg/errors/errors.go @@ -44,6 +44,7 @@ var ( "QueueDoesNotExist", "NoSuchEntity", "ParameterNotFound", + "ParameterVersionNotFound", ) alreadyExistsErrorCodes = sets.New( "EntityAlreadyExists", diff --git a/pkg/fake/ssmapi.go b/pkg/fake/ssmapi.go index 03e13cd43fab..dbe54cf26cef 100644 --- a/pkg/fake/ssmapi.go +++ b/pkg/fake/ssmapi.go @@ -19,11 +19,11 @@ import ( "fmt" "github.com/Pallinder/go-randomdata" - "github.com/awslabs/operatorpkg/serrors" "github.com/samber/lo" "github.com/aws/aws-sdk-go-v2/service/ssm" ssmtypes "github.com/aws/aws-sdk-go-v2/service/ssm/types" + "github.com/aws/smithy-go" sdk "github.com/aws/karpenter-provider-aws/pkg/aws" ) @@ -54,7 +54,10 @@ func (a SSMAPI) GetParameter(_ context.Context, input *ssm.GetParameterInput, _ if len(a.Parameters) != 0 { value, ok := a.Parameters[parameter] if !ok { - return &ssm.GetParameterOutput{}, serrors.Wrap(fmt.Errorf("parameter not found"), "parameter", lo.FromPtr(input.Name)) + return &ssm.GetParameterOutput{}, &smithy.GenericAPIError{ + Code: "ParameterNotFound", + Message: fmt.Sprintf("parameter %s not found", parameter), + } } return &ssm.GetParameterOutput{ Parameter: &ssmtypes.Parameter{ diff --git a/pkg/providers/amifamily/al2.go b/pkg/providers/amifamily/al2.go index f0fe069a90b7..3cdbd5798401 100644 --- a/pkg/providers/amifamily/al2.go +++ b/pkg/providers/amifamily/al2.go @@ -28,6 +28,7 @@ import ( "sigs.k8s.io/karpenter/pkg/scheduling" v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" + "github.com/aws/karpenter-provider-aws/pkg/errors" "sigs.k8s.io/karpenter/pkg/cloudprovider" @@ -42,6 +43,7 @@ type AL2 struct { func (a AL2) DescribeImageQuery(ctx context.Context, ssmProvider ssm.Provider, k8sVersion string, amiVersion string) (DescribeImageQuery, error) { ids := map[string][]Variant{} + isAllNotFoundErrors := true for path, variants := range map[string][]Variant{ fmt.Sprintf("/aws/service/eks/optimized-ami/%s/amazon-linux-2/%s/image_id", k8sVersion, lo.Ternary( amiVersion == v1.AliasVersionLatest, @@ -64,12 +66,18 @@ func (a AL2) DescribeImageQuery(ctx context.Context, ssmProvider ssm.Provider, k IsMutable: amiVersion == v1.AliasVersionLatest, }) if err != nil { + isAllNotFoundErrors = isAllNotFoundErrors && errors.IsNotFound(err) continue } ids[imageID] = variants } // Failed to discover any AMIs, we should short circuit AMI discovery if len(ids) == 0 { + if isAllNotFoundErrors { + return DescribeImageQuery{}, &AMINotFoundForAliasError{ + error: serrors.Wrap(fmt.Errorf("failed to discover any AMIs for alias"), "alias", fmt.Sprintf("al2@%s", amiVersion)), + } + } return DescribeImageQuery{}, serrors.Wrap(fmt.Errorf("failed to discover any AMIs for alias"), "alias", fmt.Sprintf("al2@%s", amiVersion)) } diff --git a/pkg/providers/amifamily/al2023.go b/pkg/providers/amifamily/al2023.go index 6d629534a20e..1db7fea585ab 100644 --- a/pkg/providers/amifamily/al2023.go +++ b/pkg/providers/amifamily/al2023.go @@ -27,6 +27,7 @@ import ( ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" + "github.com/aws/karpenter-provider-aws/pkg/errors" "github.com/aws/karpenter-provider-aws/pkg/providers/amifamily/bootstrap" "github.com/aws/karpenter-provider-aws/pkg/providers/ssm" ) @@ -38,6 +39,7 @@ type AL2023 struct { func (a AL2023) DescribeImageQuery(ctx context.Context, ssmProvider ssm.Provider, k8sVersion string, amiVersion string) (DescribeImageQuery, error) { ids := map[string]Variant{} + isAllNotFoundErrors := true for arch, variants := range map[string][]Variant{ "x86_64": {VariantStandard, VariantNvidia, VariantNeuron}, "arm64": {VariantStandard, VariantNvidia}, @@ -49,6 +51,7 @@ func (a AL2023) DescribeImageQuery(ctx context.Context, ssmProvider ssm.Provider IsMutable: amiVersion == v1.AliasVersionLatest, }) if err != nil { + isAllNotFoundErrors = isAllNotFoundErrors && errors.IsNotFound(err) continue } ids[imageID] = variant @@ -56,6 +59,11 @@ func (a AL2023) DescribeImageQuery(ctx context.Context, ssmProvider ssm.Provider } // Failed to discover any AMIs, we should short circuit AMI discovery if len(ids) == 0 { + if isAllNotFoundErrors { + return DescribeImageQuery{}, &AMINotFoundForAliasError{ + error: serrors.Wrap(fmt.Errorf("failed to discover any AMIs for alias"), "alias", fmt.Sprintf("al2023@%s", amiVersion)), + } + } return DescribeImageQuery{}, serrors.Wrap(fmt.Errorf("failed to discover any AMIs for alias"), "alias", fmt.Sprintf("al2023@%s", amiVersion)) } diff --git a/pkg/providers/amifamily/bottlerocket.go b/pkg/providers/amifamily/bottlerocket.go index d2b92603951d..9237256116ab 100644 --- a/pkg/providers/amifamily/bottlerocket.go +++ b/pkg/providers/amifamily/bottlerocket.go @@ -23,6 +23,7 @@ import ( "github.com/samber/lo" v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" + "github.com/aws/karpenter-provider-aws/pkg/errors" "github.com/aws/karpenter-provider-aws/pkg/providers/amifamily/bootstrap" "github.com/aws/karpenter-provider-aws/pkg/providers/ssm" "github.com/aws/karpenter-provider-aws/pkg/providers/version" @@ -45,6 +46,7 @@ func (b Bottlerocket) DescribeImageQuery(ctx context.Context, ssmProvider ssm.Pr // Bottlerocket AMIs versions are prefixed with a v on GitHub, but not in the SSM path. We should accept both. trimmedAMIVersion := strings.TrimLeft(amiVersion, "v") ids := map[string][]Variant{} + isAllNotFoundErrors := true for path, variants := range map[string][]Variant{ fmt.Sprintf("/aws/service/bottlerocket/aws-k8s-%s/x86_64/%s/image_id", k8sVersion, trimmedAMIVersion): {VariantStandard, VariantNeuron}, fmt.Sprintf("/aws/service/bottlerocket/aws-k8s-%s/arm64/%s/image_id", k8sVersion, trimmedAMIVersion): {VariantStandard}, @@ -56,12 +58,18 @@ func (b Bottlerocket) DescribeImageQuery(ctx context.Context, ssmProvider ssm.Pr IsMutable: amiVersion == v1.AliasVersionLatest, }) if err != nil { + isAllNotFoundErrors = isAllNotFoundErrors && errors.IsNotFound(err) continue } ids[imageID] = variants } // Failed to discover any AMIs, we should short circuit AMI discovery if len(ids) == 0 { + if isAllNotFoundErrors { + return DescribeImageQuery{}, &AMINotFoundForAliasError{ + error: serrors.Wrap(fmt.Errorf(`failed to discover any AMIs for alias`), "alias", fmt.Sprintf("bottlerocket@%s", amiVersion)), + } + } return DescribeImageQuery{}, serrors.Wrap(fmt.Errorf(`failed to discover any AMIs for alias`), "alias", fmt.Sprintf("bottlerocket@%s", amiVersion)) } diff --git a/pkg/providers/amifamily/types.go b/pkg/providers/amifamily/types.go index eec329d32ee3..ddf6f4d6d444 100644 --- a/pkg/providers/amifamily/types.go +++ b/pkg/providers/amifamily/types.go @@ -153,3 +153,15 @@ func IsWS2025UnsupportedVersionError(err error) bool { var ws2025Err *WS2025UnsupportedVersionError return errors.As(err, &ws2025Err) } + +type AMINotFoundForAliasError struct { + error +} + +func IsAMINotFoundForAliasError(err error) bool { + if err == nil { + return false + } + var notFoundErr *AMINotFoundForAliasError + return errors.As(err, ¬FoundErr) +} diff --git a/pkg/providers/amifamily/windows.go b/pkg/providers/amifamily/windows.go index 3b9a353ea676..15f00f66d1cb 100644 --- a/pkg/providers/amifamily/windows.go +++ b/pkg/providers/amifamily/windows.go @@ -22,6 +22,7 @@ import ( "sigs.k8s.io/karpenter/pkg/scheduling" v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" + "github.com/aws/karpenter-provider-aws/pkg/errors" "github.com/samber/lo" "k8s.io/apimachinery/pkg/api/resource" @@ -53,6 +54,11 @@ func (w Windows) DescribeImageQuery(ctx context.Context, ssmProvider ssm.Provide IsMutable: true, }) if err != nil { + if errors.IsNotFound(err) { + return DescribeImageQuery{}, &AMINotFoundForAliasError{ + error: serrors.Wrap(fmt.Errorf("failed to discover any AMIs for alias"), "alias", fmt.Sprintf("windows%s@%s", w.Version, amiVersion)), + } + } return DescribeImageQuery{}, serrors.Wrap(fmt.Errorf("failed to discover any AMIs for alias"), "alias", fmt.Sprintf("windows%s@%s", w.Version, amiVersion)) } return DescribeImageQuery{