Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions pkg/providers/instance/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,71 @@ var _ = Describe("InstanceProvider", func() {

Expect(priotiztied.SpotOptions.AllocationStrategy).To(Equal(ec2types.SpotAllocationStrategyCapacityOptimizedPrioritized))
})
It("should return an ICE error when a single-zone NodeClass has its only zone ICE'd for a single instance type", func() {
// Regression test for https://github.com/aws/karpenter-provider-aws/issues/8909
// Scenario: NodePool with single instance type + EC2NodeClass with subnets in only one AZ.
// After the zone gets ICE'd, subsequent launch attempts should return InsufficientCapacityError
// (not a generic CreateError from "no capacity offerings"), so the NodeClaim gets deleted and retried.

// Set up single-zone NodeClass (only test-zone-1a)
nodeClass.Spec.SubnetSelectorTerms = []v1.SubnetSelectorTerm{{Tags: map[string]string{"Name": "test-subnet-1"}}}
nodeClass.Status.Subnets = []v1.Subnet{
{ID: "subnet-test1", Zone: "test-zone-1a", ZoneID: "tstz1-1a"},
}

// Constrain to a single instance type and on-demand only
nodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{
{Key: corev1.LabelInstanceTypeStable, Operator: corev1.NodeSelectorOpIn, Values: []string{"m5.xlarge"}},
{Key: karpv1.CapacityTypeLabelKey, Operator: corev1.NodeSelectorOpIn, Values: []string{karpv1.CapacityTypeOnDemand}},
}

ExpectApplied(ctx, env.Client, nodeClaim, nodePool, nodeClass)
nodeClass = ExpectExists(ctx, env.Client, nodeClass)

// Re-hydrate caches with the single-zone subnet data so the provider only
// sees offerings in test-zone-1a.
_, err := awsEnv.SubnetProvider.List(ctx, nodeClass)
Expect(err).ToNot(HaveOccurred())
awsEnv.InstanceTypeCache.Flush()
awsEnv.OfferingCache.Flush()
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed())
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx)).To(Succeed())

// First launch: ICE from AWS for m5.xlarge in test-zone-1a
awsEnv.EC2API.InsufficientCapacityPools.Set([]fake.CapacityPool{
{CapacityType: karpv1.CapacityTypeOnDemand, InstanceType: "m5.xlarge", Zone: "test-zone-1a"},
})

instanceTypes, err := cloudProvider.GetInstanceTypes(ctx, nodePool)
Expect(err).ToNot(HaveOccurred())
instanceTypes = lo.Filter(instanceTypes, func(i *corecloudprovider.InstanceType, _ int) bool { return i.Name == "m5.xlarge" })
Expect(instanceTypes).To(HaveLen(1))

// The first Create triggers the ICE from CreateFleet and caches the unavailable offering
instance, err := awsEnv.InstanceProvider.Create(ctx, nodeClass, nodeClaim, nil, instanceTypes)
Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue())
Expect(instance).To(BeNil())

// Verify the zone is now cached as unavailable
Expect(awsEnv.UnavailableOfferingsCache.IsUnavailable("m5.xlarge", "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue())

// Second launch attempt: instance types are re-resolved with the updated unavailable cache.
// The offerings for m5.xlarge in test-zone-1a should now be marked unavailable.
// This MUST return InsufficientCapacityError (not a generic error) so the NodeClaim is deleted and retried.
instanceTypes, err = cloudProvider.GetInstanceTypes(ctx, nodePool)
Expect(err).ToNot(HaveOccurred())
instanceTypes = lo.Filter(instanceTypes, func(i *corecloudprovider.InstanceType, _ int) bool { return i.Name == "m5.xlarge" })
Expect(instanceTypes).To(HaveLen(1))

// Verify the offering is now marked unavailable on the instance type itself
availableOfferings := instanceTypes[0].Offerings.Available()
Expect(availableOfferings).To(HaveLen(0), "expected no available offerings after ICE in the only subnet zone")

instance, err = awsEnv.InstanceProvider.Create(ctx, nodeClass, nodeClaim, nil, instanceTypes)
Expect(err).To(HaveOccurred())
Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue(),
"expected InsufficientCapacityError on retry after ICE, but got: %v", err)
Expect(instance).To(BeNil())
})
It("should use price capacity optimized allocation stragaty by default for spot nodeclaims", func() {
nodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{
{
Expand Down
80 changes: 80 additions & 0 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2371,6 +2371,86 @@ var _ = Describe("InstanceTypeProvider", func() {
Expect(zones).To(HaveLen(2))
Expect(zones.UnsortedList()).To(ConsistOf([]string{"test-zone-1b", "test-zone-1c"}))
})
It("should invalidate offering cache for an instance type across different nodeclasses when an ICE error occurs", func() {
// BUG: The offering cache's lastUnavailableOfferingsSeqNum is keyed by instance type
// name only, not by the full offering cache key (name + zones hash). When two
// nodeclasses have different subnet zones, they produce different offering cache keys
// for the same instance type. After an ICE error increments the seqNum, whichever
// nodeclass's offerings are rebuilt FIRST updates the shared lastSeqNum. The other
// nodeclass then sees lastSeqNum == seqNum (a false "up-to-date" signal) and gets
// a stale cache hit with pre-ICE availability data.
//
// Two NodeClasses differing ONLY in their subnet zone (1a vs 1b) so that the
// same instance type resolves to distinct offering cache keys.
nodeClassA := test.EC2NodeClass(v1.EC2NodeClass{
Spec: v1.EC2NodeClassSpec{
SubnetSelectorTerms: []v1.SubnetSelectorTerm{{Tags: map[string]string{"zone": "a"}}},
},
Status: v1.EC2NodeClassStatus{
InstanceProfile: "test-profile",
SecurityGroups: nodeClass.Status.SecurityGroups,
Subnets: []v1.Subnet{
{ID: "subnet-zone-a", Zone: "test-zone-1a", ZoneID: "tstz1-1a"},
},
},
})
// Mark ready so the instance-type provider will resolve offerings for it.
nodeClassA.StatusConditions().SetTrue(status.ConditionReady)

nodeClassB := test.EC2NodeClass(v1.EC2NodeClass{
Spec: v1.EC2NodeClassSpec{
SubnetSelectorTerms: []v1.SubnetSelectorTerm{{Tags: map[string]string{"zone": "b"}}},
},
Status: v1.EC2NodeClassStatus{
InstanceProfile: "test-profile",
SecurityGroups: nodeClass.Status.SecurityGroups,
Subnets: []v1.Subnet{
{ID: "subnet-zone-b", Zone: "test-zone-1b", ZoneID: "tstz1-1b"},
},
},
})
nodeClassB.StatusConditions().SetTrue(status.ConditionReady)

// Step 1: Populate offering cache for both nodeclasses.
// NodeClassA's m5.large has zones=[test-zone-1a], offering cache key = f(m5.large, hash(1a)).
// NodeClassB's m5.large has zones=[test-zone-1b], offering cache key = f(m5.large, hash(1b)).
// Both set the shared lastSeqNum["m5.large"] = 0.
listA, err := awsEnv.InstanceTypesProvider.List(ctx, nodeClassA)
Expect(err).ToNot(HaveOccurred())
m5a, ok := lo.Find(listA, func(it *corecloudprovider.InstanceType) bool {
return it.Name == string(ec2types.InstanceTypeM5Large)
})
Expect(ok).To(BeTrue())
// Sanity: NodeClassA's m5.large only offers test-zone-1a, and the
// on-demand offering there starts out available (pre-ICE baseline).
Expect(m5a.Requirements.Get(corev1.LabelTopologyZone).Values()).To(ConsistOf("test-zone-1a"))
Expect(m5a.Offerings.Compatible(scheduling.NewLabelRequirements(map[string]string{
corev1.LabelTopologyZone: "test-zone-1a",
karpv1.CapacityTypeLabelKey: karpv1.CapacityTypeOnDemand,
}))[0].Available).To(BeTrue())

_, err = awsEnv.InstanceTypesProvider.List(ctx, nodeClassB)
Expect(err).ToNot(HaveOccurred())

// Step 2: ICE marks m5.large/test-zone-1a/on-demand unavailable → seqNum becomes 1.
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "InsufficientInstanceCapacity"})

// Step 3: Query nodeClassB FIRST. Its offering cache entry (keyed by zones=[1b])
// is rebuilt because seqNum (1) != lastSeqNum (0). This stores lastSeqNum["m5.large"] = 1.
// Ordering matters: this is what poisons the subsequent nodeClassA lookup.
_, err = awsEnv.InstanceTypesProvider.List(ctx, nodeClassB)
Expect(err).ToNot(HaveOccurred())

// Step 4: Query nodeClassA. The offering cache check sees seqNum (1) == lastSeqNum (1)
// (updated by step 3 for a DIFFERENT cache key) → stale cache hit. The pre-ICE
// offering for test-zone-1a/on-demand is returned with Available=true.
listA, err = awsEnv.InstanceTypesProvider.List(ctx, nodeClassA)
Expect(err).ToNot(HaveOccurred())
m5a, ok = lo.Find(listA, func(it *corecloudprovider.InstanceType) bool {
return it.Name == string(ec2types.InstanceTypeM5Large)
})
Expect(ok).To(BeTrue())

// BUG: The ICE for test-zone-1a/on-demand should make this offering unavailable,
// but the stale cache returns Available=true. When fixed, change to BeFalse().
// This assertion intentionally pins the CURRENT (buggy) behavior so the test
// fails loudly — prompting an update — once the cache keying is corrected.
Expect(m5a.Offerings.Compatible(scheduling.NewLabelRequirements(map[string]string{
corev1.LabelTopologyZone: "test-zone-1a",
karpv1.CapacityTypeLabelKey: karpv1.CapacityTypeOnDemand,
}))[0].Available).To(BeTrue()) // BUG: should be BeFalse()
})
})
Context("CapacityType", func() {
It("should default to on-demand", func() {
Expand Down
Loading