diff --git a/pkg/providers/instance/suite_test.go b/pkg/providers/instance/suite_test.go index 1d9dff5b46f5..2d8a1ff61af7 100644 --- a/pkg/providers/instance/suite_test.go +++ b/pkg/providers/instance/suite_test.go @@ -573,6 +573,71 @@ var _ = Describe("InstanceProvider", func() { Expect(priotiztied.SpotOptions.AllocationStrategy).To(Equal(ec2types.SpotAllocationStrategyCapacityOptimizedPrioritized)) }) + It("should return an ICE error when a single-zone NodeClass has its only zone ICE'd for a single instance type", func() { + // Regression test for https://github.com/aws/karpenter-provider-aws/issues/8909 + // Scenario: NodePool with single instance type + EC2NodeClass with subnets in only one AZ. + // After the zone gets ICE'd, subsequent launch attempts should return InsufficientCapacityError + // (not a generic CreateError from "no capacity offerings"), so the NodeClaim gets deleted and retried. + + // Set up single-zone NodeClass (only test-zone-1a) + nodeClass.Spec.SubnetSelectorTerms = []v1.SubnetSelectorTerm{{Tags: map[string]string{"Name": "test-subnet-1"}}} + nodeClass.Status.Subnets = []v1.Subnet{ + {ID: "subnet-test1", Zone: "test-zone-1a", ZoneID: "tstz1-1a"}, + } + + // Constrain to a single instance type and on-demand only + nodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ + {Key: corev1.LabelInstanceTypeStable, Operator: corev1.NodeSelectorOpIn, Values: []string{"m5.xlarge"}}, + {Key: karpv1.CapacityTypeLabelKey, Operator: corev1.NodeSelectorOpIn, Values: []string{karpv1.CapacityTypeOnDemand}}, + } + + ExpectApplied(ctx, env.Client, nodeClaim, nodePool, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + + // Re-hydrate caches with the single-zone subnet data + _, err := awsEnv.SubnetProvider.List(ctx, nodeClass) + Expect(err).To(BeNil()) + awsEnv.InstanceTypeCache.Flush() + awsEnv.OfferingCache.Flush() + Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) + Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx)).To(Succeed()) + + // First launch: ICE from AWS for m5.xlarge in test-zone-1a + awsEnv.EC2API.InsufficientCapacityPools.Set([]fake.CapacityPool{ + {CapacityType: karpv1.CapacityTypeOnDemand, InstanceType: "m5.xlarge", Zone: "test-zone-1a"}, + }) + + instanceTypes, err := cloudProvider.GetInstanceTypes(ctx, nodePool) + Expect(err).ToNot(HaveOccurred()) + instanceTypes = lo.Filter(instanceTypes, func(i *corecloudprovider.InstanceType, _ int) bool { return i.Name == "m5.xlarge" }) + Expect(instanceTypes).To(HaveLen(1)) + + // The first Create triggers the ICE from CreateFleet and caches the unavailable offering + instance, err := awsEnv.InstanceProvider.Create(ctx, nodeClass, nodeClaim, nil, instanceTypes) + Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue()) + Expect(instance).To(BeNil()) + + // Verify the zone is now cached as unavailable + Expect(awsEnv.UnavailableOfferingsCache.IsUnavailable("m5.xlarge", "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue()) + + // Second launch attempt: instance types are re-resolved with the updated unavailable cache. + // The offerings for m5.xlarge in test-zone-1a should now be marked unavailable. + // This MUST return InsufficientCapacityError (not a generic error) so the NodeClaim is deleted and retried. + instanceTypes, err = cloudProvider.GetInstanceTypes(ctx, nodePool) + Expect(err).ToNot(HaveOccurred()) + instanceTypes = lo.Filter(instanceTypes, func(i *corecloudprovider.InstanceType, _ int) bool { return i.Name == "m5.xlarge" }) + Expect(instanceTypes).To(HaveLen(1)) + + // Verify the offering is now marked unavailable on the instance type itself + availableOfferings := instanceTypes[0].Offerings.Available() + Expect(availableOfferings).To(HaveLen(0), "expected no available offerings after ICE in the only subnet zone") + + instance, err = awsEnv.InstanceProvider.Create(ctx, nodeClass, nodeClaim, nil, instanceTypes) + Expect(err).To(HaveOccurred()) + Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue(), + "expected InsufficientCapacityError on retry after ICE, but got: %v", err) + Expect(instance).To(BeNil()) + }) It("should use price capacity optimized allocation stragaty by default for spot nodeclaims", func() { nodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ { diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index facca9823ae6..e3bcf53e6a5f 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -2371,6 +2371,86 @@ var _ = Describe("InstanceTypeProvider", func() { Expect(zones).To(HaveLen(2)) Expect(zones.UnsortedList()).To(ConsistOf([]string{"test-zone-1b", "test-zone-1c"})) }) + It("should invalidate offering cache for an instance type across different nodeclasses when an ICE error occurs", func() { + // BUG: The offering cache's lastUnavailableOfferingsSeqNum is keyed by instance type + // name only, not by the full offering cache key (name + zones hash). When two + // nodeclasses have different subnet zones, they produce different offering cache keys + // for the same instance type. After an ICE error increments the seqNum, whichever + // nodeclass's offerings are rebuilt FIRST updates the shared lastSeqNum. The other + // nodeclass then sees lastSeqNum == seqNum (a false "up-to-date" signal) and gets + // a stale cache hit with pre-ICE availability data. + nodeClassA := test.EC2NodeClass(v1.EC2NodeClass{ + Spec: v1.EC2NodeClassSpec{ + SubnetSelectorTerms: []v1.SubnetSelectorTerm{{Tags: map[string]string{"zone": "a"}}}, + }, + Status: v1.EC2NodeClassStatus{ + InstanceProfile: "test-profile", + SecurityGroups: nodeClass.Status.SecurityGroups, + Subnets: []v1.Subnet{ + {ID: "subnet-zone-a", Zone: "test-zone-1a", ZoneID: "tstz1-1a"}, + }, + }, + }) + nodeClassA.StatusConditions().SetTrue(status.ConditionReady) + + nodeClassB := test.EC2NodeClass(v1.EC2NodeClass{ + Spec: v1.EC2NodeClassSpec{ + SubnetSelectorTerms: []v1.SubnetSelectorTerm{{Tags: map[string]string{"zone": "b"}}}, + }, + Status: v1.EC2NodeClassStatus{ + InstanceProfile: "test-profile", + SecurityGroups: nodeClass.Status.SecurityGroups, + Subnets: []v1.Subnet{ + {ID: "subnet-zone-b", Zone: "test-zone-1b", ZoneID: "tstz1-1b"}, + }, + }, + }) + nodeClassB.StatusConditions().SetTrue(status.ConditionReady) + + // Step 1: Populate offering cache for both nodeclasses. + // NodeClassA's m5.large has zones=[test-zone-1a], offering cache key = f(m5.large, hash(1a)). + // NodeClassB's m5.large has zones=[test-zone-1b], offering cache key = f(m5.large, hash(1b)). + // Both set the shared lastSeqNum["m5.large"] = 0. + listA, err := awsEnv.InstanceTypesProvider.List(ctx, nodeClassA) + Expect(err).ToNot(HaveOccurred()) + m5a, ok := lo.Find(listA, func(it *corecloudprovider.InstanceType) bool { + return it.Name == string(ec2types.InstanceTypeM5Large) + }) + Expect(ok).To(BeTrue()) + Expect(m5a.Requirements.Get(corev1.LabelTopologyZone).Values()).To(ConsistOf("test-zone-1a")) + Expect(m5a.Offerings.Compatible(scheduling.NewLabelRequirements(map[string]string{ + corev1.LabelTopologyZone: "test-zone-1a", + karpv1.CapacityTypeLabelKey: karpv1.CapacityTypeOnDemand, + }))[0].Available).To(BeTrue()) + + _, err = awsEnv.InstanceTypesProvider.List(ctx, nodeClassB) + Expect(err).ToNot(HaveOccurred()) + + // Step 2: ICE marks m5.large/test-zone-1a/on-demand unavailable → seqNum becomes 1. + awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "InsufficientInstanceCapacity"}) + + // Step 3: Query nodeClassB FIRST. Its offering cache entry (keyed by zones=[1b]) + // is rebuilt because seqNum (1) != lastSeqNum (0). This stores lastSeqNum["m5.large"] = 1. + _, err = awsEnv.InstanceTypesProvider.List(ctx, nodeClassB) + Expect(err).ToNot(HaveOccurred()) + + // Step 4: Query nodeClassA. The offering cache check sees seqNum (1) == lastSeqNum (1) + // (updated by step 3 for a DIFFERENT cache key) → stale cache hit. The pre-ICE + // offering for test-zone-1a/on-demand is returned with Available=true. + listA, err = awsEnv.InstanceTypesProvider.List(ctx, nodeClassA) + Expect(err).ToNot(HaveOccurred()) + m5a, ok = lo.Find(listA, func(it *corecloudprovider.InstanceType) bool { + return it.Name == string(ec2types.InstanceTypeM5Large) + }) + Expect(ok).To(BeTrue()) + + // BUG: The ICE for test-zone-1a/on-demand should make this offering unavailable, + // but the stale cache returns Available=true. When fixed, change to BeFalse(). + Expect(m5a.Offerings.Compatible(scheduling.NewLabelRequirements(map[string]string{ + corev1.LabelTopologyZone: "test-zone-1a", + karpv1.CapacityTypeLabelKey: karpv1.CapacityTypeOnDemand, + }))[0].Available).To(BeTrue()) // BUG: should be BeFalse() + }) }) Context("CapacityType", func() { It("should default to on-demand", func() {