From af5329219e66a3ef52408e892d3cfcfa292ff337 Mon Sep 17 00:00:00 2001 From: Abhinav Dahiya Date: Thu, 5 Mar 2026 09:41:17 -0500 Subject: [PATCH] test: reproduce offering cache invalidation bug across nodeclasses after ICE When two EC2NodeClasses have subnets in different availability zones, the offering cache's lastUnavailableOfferingsSeqNum (keyed by instance type name only, not by the full cache key including zones hash) causes stale cache hits. After an InsufficientInstanceCapacity error, whichever nodeclass is queried first updates the shared seqNum, causing the other nodeclass to skip cache rebuild and return pre-ICE availability data. Additionally, when a single-zone NodeClass has its only zone ICE'd, the launch path returns a generic CreateError instead of InsufficientCapacityError, causing NodeClaims to stay stuck instead of being deleted and retried. These tests reproduce both issues observed after upgrading from v1.5.1 to v1.8.x. --- pkg/providers/instance/suite_test.go | 65 +++++++++++++++++++ pkg/providers/instancetype/suite_test.go | 80 ++++++++++++++++++++++++ 2 files changed, 145 insertions(+) diff --git a/pkg/providers/instance/suite_test.go b/pkg/providers/instance/suite_test.go index 1d9dff5b46f5..2d8a1ff61af7 100644 --- a/pkg/providers/instance/suite_test.go +++ b/pkg/providers/instance/suite_test.go @@ -573,6 +573,71 @@ var _ = Describe("InstanceProvider", func() { Expect(priotiztied.SpotOptions.AllocationStrategy).To(Equal(ec2types.SpotAllocationStrategyCapacityOptimizedPrioritized)) }) + It("should return an ICE error when a single-zone NodeClass has its only zone ICE'd for a single instance type", func() { + // Regression test for https://github.com/aws/karpenter-provider-aws/issues/8909 + // Scenario: NodePool with single instance type + EC2NodeClass with subnets in only one AZ. + // After the zone gets ICE'd, subsequent launch attempts should return InsufficientCapacityError + // (not a generic CreateError from "no capacity offerings"), so the NodeClaim gets deleted and retried. + + // Set up single-zone NodeClass (only test-zone-1a) + nodeClass.Spec.SubnetSelectorTerms = []v1.SubnetSelectorTerm{{Tags: map[string]string{"Name": "test-subnet-1"}}} + nodeClass.Status.Subnets = []v1.Subnet{ + {ID: "subnet-test1", Zone: "test-zone-1a", ZoneID: "tstz1-1a"}, + } + + // Constrain to a single instance type and on-demand only + nodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ + {Key: corev1.LabelInstanceTypeStable, Operator: corev1.NodeSelectorOpIn, Values: []string{"m5.xlarge"}}, + {Key: karpv1.CapacityTypeLabelKey, Operator: corev1.NodeSelectorOpIn, Values: []string{karpv1.CapacityTypeOnDemand}}, + } + + ExpectApplied(ctx, env.Client, nodeClaim, nodePool, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + + // Re-hydrate caches with the single-zone subnet data + _, err := awsEnv.SubnetProvider.List(ctx, nodeClass) + Expect(err).To(BeNil()) + awsEnv.InstanceTypeCache.Flush() + awsEnv.OfferingCache.Flush() + Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) + Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx)).To(Succeed()) + + // First launch: ICE from AWS for m5.xlarge in test-zone-1a + awsEnv.EC2API.InsufficientCapacityPools.Set([]fake.CapacityPool{ + {CapacityType: karpv1.CapacityTypeOnDemand, InstanceType: "m5.xlarge", Zone: "test-zone-1a"}, + }) + + instanceTypes, err := cloudProvider.GetInstanceTypes(ctx, nodePool) + Expect(err).ToNot(HaveOccurred()) + instanceTypes = lo.Filter(instanceTypes, func(i *corecloudprovider.InstanceType, _ int) bool { return i.Name == "m5.xlarge" }) + Expect(instanceTypes).To(HaveLen(1)) + + // The first Create triggers the ICE from CreateFleet and caches the unavailable offering + instance, err := awsEnv.InstanceProvider.Create(ctx, nodeClass, nodeClaim, nil, instanceTypes) + Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue()) + Expect(instance).To(BeNil()) + + // Verify the zone is now cached as unavailable + Expect(awsEnv.UnavailableOfferingsCache.IsUnavailable("m5.xlarge", "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue()) + + // Second launch attempt: instance types are re-resolved with the updated unavailable cache. + // The offerings for m5.xlarge in test-zone-1a should now be marked unavailable. + // This MUST return InsufficientCapacityError (not a generic error) so the NodeClaim is deleted and retried. + instanceTypes, err = cloudProvider.GetInstanceTypes(ctx, nodePool) + Expect(err).ToNot(HaveOccurred()) + instanceTypes = lo.Filter(instanceTypes, func(i *corecloudprovider.InstanceType, _ int) bool { return i.Name == "m5.xlarge" }) + Expect(instanceTypes).To(HaveLen(1)) + + // Verify the offering is now marked unavailable on the instance type itself + availableOfferings := instanceTypes[0].Offerings.Available() + Expect(availableOfferings).To(HaveLen(0), "expected no available offerings after ICE in the only subnet zone") + + instance, err = awsEnv.InstanceProvider.Create(ctx, nodeClass, nodeClaim, nil, instanceTypes) + Expect(err).To(HaveOccurred()) + Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue(), + "expected InsufficientCapacityError on retry after ICE, but got: %v", err) + Expect(instance).To(BeNil()) + }) It("should use price capacity optimized allocation stragaty by default for spot nodeclaims", func() { nodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{ { diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index facca9823ae6..e3bcf53e6a5f 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -2371,6 +2371,86 @@ var _ = Describe("InstanceTypeProvider", func() { Expect(zones).To(HaveLen(2)) Expect(zones.UnsortedList()).To(ConsistOf([]string{"test-zone-1b", "test-zone-1c"})) }) + It("should invalidate offering cache for an instance type across different nodeclasses when an ICE error occurs", func() { + // BUG: The offering cache's lastUnavailableOfferingsSeqNum is keyed by instance type + // name only, not by the full offering cache key (name + zones hash). When two + // nodeclasses have different subnet zones, they produce different offering cache keys + // for the same instance type. After an ICE error increments the seqNum, whichever + // nodeclass's offerings are rebuilt FIRST updates the shared lastSeqNum. The other + // nodeclass then sees lastSeqNum == seqNum (a false "up-to-date" signal) and gets + // a stale cache hit with pre-ICE availability data. + nodeClassA := test.EC2NodeClass(v1.EC2NodeClass{ + Spec: v1.EC2NodeClassSpec{ + SubnetSelectorTerms: []v1.SubnetSelectorTerm{{Tags: map[string]string{"zone": "a"}}}, + }, + Status: v1.EC2NodeClassStatus{ + InstanceProfile: "test-profile", + SecurityGroups: nodeClass.Status.SecurityGroups, + Subnets: []v1.Subnet{ + {ID: "subnet-zone-a", Zone: "test-zone-1a", ZoneID: "tstz1-1a"}, + }, + }, + }) + nodeClassA.StatusConditions().SetTrue(status.ConditionReady) + + nodeClassB := test.EC2NodeClass(v1.EC2NodeClass{ + Spec: v1.EC2NodeClassSpec{ + SubnetSelectorTerms: []v1.SubnetSelectorTerm{{Tags: map[string]string{"zone": "b"}}}, + }, + Status: v1.EC2NodeClassStatus{ + InstanceProfile: "test-profile", + SecurityGroups: nodeClass.Status.SecurityGroups, + Subnets: []v1.Subnet{ + {ID: "subnet-zone-b", Zone: "test-zone-1b", ZoneID: "tstz1-1b"}, + }, + }, + }) + nodeClassB.StatusConditions().SetTrue(status.ConditionReady) + + // Step 1: Populate offering cache for both nodeclasses. + // NodeClassA's m5.large has zones=[test-zone-1a], offering cache key = f(m5.large, hash(1a)). + // NodeClassB's m5.large has zones=[test-zone-1b], offering cache key = f(m5.large, hash(1b)). + // Both set the shared lastSeqNum["m5.large"] = 0. + listA, err := awsEnv.InstanceTypesProvider.List(ctx, nodeClassA) + Expect(err).ToNot(HaveOccurred()) + m5a, ok := lo.Find(listA, func(it *corecloudprovider.InstanceType) bool { + return it.Name == string(ec2types.InstanceTypeM5Large) + }) + Expect(ok).To(BeTrue()) + Expect(m5a.Requirements.Get(corev1.LabelTopologyZone).Values()).To(ConsistOf("test-zone-1a")) + Expect(m5a.Offerings.Compatible(scheduling.NewLabelRequirements(map[string]string{ + corev1.LabelTopologyZone: "test-zone-1a", + karpv1.CapacityTypeLabelKey: karpv1.CapacityTypeOnDemand, + }))[0].Available).To(BeTrue()) + + _, err = awsEnv.InstanceTypesProvider.List(ctx, nodeClassB) + Expect(err).ToNot(HaveOccurred()) + + // Step 2: ICE marks m5.large/test-zone-1a/on-demand unavailable → seqNum becomes 1. + awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "InsufficientInstanceCapacity"}) + + // Step 3: Query nodeClassB FIRST. Its offering cache entry (keyed by zones=[1b]) + // is rebuilt because seqNum (1) != lastSeqNum (0). This stores lastSeqNum["m5.large"] = 1. + _, err = awsEnv.InstanceTypesProvider.List(ctx, nodeClassB) + Expect(err).ToNot(HaveOccurred()) + + // Step 4: Query nodeClassA. The offering cache check sees seqNum (1) == lastSeqNum (1) + // (updated by step 3 for a DIFFERENT cache key) → stale cache hit. The pre-ICE + // offering for test-zone-1a/on-demand is returned with Available=true. + listA, err = awsEnv.InstanceTypesProvider.List(ctx, nodeClassA) + Expect(err).ToNot(HaveOccurred()) + m5a, ok = lo.Find(listA, func(it *corecloudprovider.InstanceType) bool { + return it.Name == string(ec2types.InstanceTypeM5Large) + }) + Expect(ok).To(BeTrue()) + + // BUG: The ICE for test-zone-1a/on-demand should make this offering unavailable, + // but the stale cache returns Available=true. When fixed, change to BeFalse(). + Expect(m5a.Offerings.Compatible(scheduling.NewLabelRequirements(map[string]string{ + corev1.LabelTopologyZone: "test-zone-1a", + karpv1.CapacityTypeLabelKey: karpv1.CapacityTypeOnDemand, + }))[0].Available).To(BeTrue()) // BUG: should be BeFalse() + }) }) Context("CapacityType", func() { It("should default to on-demand", func() {