Skip to content

Commit 88bfce8

Browse files
committed
test: reproduce offering cache invalidation bug across nodeclasses after ICE
When two EC2NodeClasses have subnets in different availability zones, the offering cache's lastUnavailableOfferingsSeqNum (keyed by instance type name only, not by the full cache key including the zones hash) causes stale cache hits. After an InsufficientInstanceCapacity (ICE) error, whichever nodeclass is queried first updates the shared last-seen seqNum for that instance type, causing the other nodeclass to skip its cache rebuild and return pre-ICE availability data. Additionally, when a single-zone NodeClass has its only zone ICE'd, the launch path returns a generic CreateError instead of an InsufficientCapacityError, causing NodeClaims to stay stuck instead of being deleted and retried. These tests reproduce both issues, which were observed after upgrading from v1.5.1 to v1.8.x.
1 parent 7caf2f2 commit 88bfce8

File tree

2 files changed

+145
-0
lines changed

2 files changed

+145
-0
lines changed

pkg/providers/instance/suite_test.go

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,71 @@ var _ = Describe("InstanceProvider", func() {
497497

498498
Expect(priotiztied.SpotOptions.AllocationStrategy).To(Equal(ec2types.SpotAllocationStrategyCapacityOptimizedPrioritized))
499499
})
500+
It("should return an ICE error when a single-zone NodeClass has its only zone ICE'd for a single instance type", func() {
501+
// Regression test for https://github.com/aws/karpenter-provider-aws/issues/8909
502+
// Scenario: NodePool with single instance type + EC2NodeClass with subnets in only one AZ.
503+
// After the zone gets ICE'd, subsequent launch attempts should return InsufficientCapacityError
504+
// (not a generic CreateError from "no capacity offerings"), so the NodeClaim gets deleted and retried.
505+
506+
// Set up single-zone NodeClass (only test-zone-1a)
507+
nodeClass.Spec.SubnetSelectorTerms = []v1.SubnetSelectorTerm{{Tags: map[string]string{"Name": "test-subnet-1"}}}
508+
nodeClass.Status.Subnets = []v1.Subnet{
509+
{ID: "subnet-test1", Zone: "test-zone-1a", ZoneID: "tstz1-1a"},
510+
}
511+
512+
// Constrain to a single instance type and on-demand only
513+
nodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{
514+
{Key: corev1.LabelInstanceTypeStable, Operator: corev1.NodeSelectorOpIn, Values: []string{"m5.xlarge"}},
515+
{Key: karpv1.CapacityTypeLabelKey, Operator: corev1.NodeSelectorOpIn, Values: []string{karpv1.CapacityTypeOnDemand}},
516+
}
517+
518+
ExpectApplied(ctx, env.Client, nodeClaim, nodePool, nodeClass)
519+
nodeClass = ExpectExists(ctx, env.Client, nodeClass)
520+
521+
// Re-hydrate caches with the single-zone subnet data
522+
_, err := awsEnv.SubnetProvider.List(ctx, nodeClass)
523+
Expect(err).To(BeNil())
524+
awsEnv.InstanceTypeCache.Flush()
525+
awsEnv.OfferingCache.Flush()
526+
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed())
527+
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx)).To(Succeed())
528+
529+
// First launch: ICE from AWS for m5.xlarge in test-zone-1a
530+
awsEnv.EC2API.InsufficientCapacityPools.Set([]fake.CapacityPool{
531+
{CapacityType: karpv1.CapacityTypeOnDemand, InstanceType: "m5.xlarge", Zone: "test-zone-1a"},
532+
})
533+
534+
instanceTypes, err := cloudProvider.GetInstanceTypes(ctx, nodePool)
535+
Expect(err).ToNot(HaveOccurred())
536+
instanceTypes = lo.Filter(instanceTypes, func(i *corecloudprovider.InstanceType, _ int) bool { return i.Name == "m5.xlarge" })
537+
Expect(instanceTypes).To(HaveLen(1))
538+
539+
// The first Create triggers the ICE from CreateFleet and caches the unavailable offering
540+
instance, err := awsEnv.InstanceProvider.Create(ctx, nodeClass, nodeClaim, nil, instanceTypes)
541+
Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue())
542+
Expect(instance).To(BeNil())
543+
544+
// Verify the zone is now cached as unavailable
545+
Expect(awsEnv.UnavailableOfferingsCache.IsUnavailable("m5.xlarge", "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue())
546+
547+
// Second launch attempt: instance types are re-resolved with the updated unavailable cache.
548+
// The offerings for m5.xlarge in test-zone-1a should now be marked unavailable.
549+
// This MUST return InsufficientCapacityError (not a generic error) so the NodeClaim is deleted and retried.
550+
instanceTypes, err = cloudProvider.GetInstanceTypes(ctx, nodePool)
551+
Expect(err).ToNot(HaveOccurred())
552+
instanceTypes = lo.Filter(instanceTypes, func(i *corecloudprovider.InstanceType, _ int) bool { return i.Name == "m5.xlarge" })
553+
Expect(instanceTypes).To(HaveLen(1))
554+
555+
// Verify the offering is now marked unavailable on the instance type itself
556+
availableOfferings := instanceTypes[0].Offerings.Available()
557+
Expect(availableOfferings).To(HaveLen(0), "expected no available offerings after ICE in the only subnet zone")
558+
559+
instance, err = awsEnv.InstanceProvider.Create(ctx, nodeClass, nodeClaim, nil, instanceTypes)
560+
Expect(err).To(HaveOccurred())
561+
Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue(),
562+
"expected InsufficientCapacityError on retry after ICE, but got: %v", err)
563+
Expect(instance).To(BeNil())
564+
})
500565
It("should use price capacity optimized allocation stragaty by default for spot nodeclaims", func() {
501566
nodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{
502567
{

pkg/providers/instancetype/suite_test.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2371,6 +2371,86 @@ var _ = Describe("InstanceTypeProvider", func() {
23712371
Expect(zones).To(HaveLen(2))
23722372
Expect(zones.UnsortedList()).To(ConsistOf([]string{"test-zone-1b", "test-zone-1c"}))
23732373
})
2374+
It("should invalidate offering cache for an instance type across different nodeclasses when an ICE error occurs", func() {
2375+
// BUG: The offering cache's lastUnavailableOfferingsSeqNum is keyed by instance type
2376+
// name only, not by the full offering cache key (name + zones hash). When two
2377+
// nodeclasses have different subnet zones, they produce different offering cache keys
2378+
// for the same instance type. After an ICE error increments the seqNum, whichever
2379+
// nodeclass's offerings are rebuilt FIRST updates the shared lastSeqNum. The other
2380+
// nodeclass then sees lastSeqNum == seqNum (a false "up-to-date" signal) and gets
2381+
// a stale cache hit with pre-ICE availability data.
2382+
nodeClassA := test.EC2NodeClass(v1.EC2NodeClass{
2383+
Spec: v1.EC2NodeClassSpec{
2384+
SubnetSelectorTerms: []v1.SubnetSelectorTerm{{Tags: map[string]string{"zone": "a"}}},
2385+
},
2386+
Status: v1.EC2NodeClassStatus{
2387+
InstanceProfile: "test-profile",
2388+
SecurityGroups: nodeClass.Status.SecurityGroups,
2389+
Subnets: []v1.Subnet{
2390+
{ID: "subnet-zone-a", Zone: "test-zone-1a", ZoneID: "tstz1-1a"},
2391+
},
2392+
},
2393+
})
2394+
nodeClassA.StatusConditions().SetTrue(status.ConditionReady)
2395+
2396+
nodeClassB := test.EC2NodeClass(v1.EC2NodeClass{
2397+
Spec: v1.EC2NodeClassSpec{
2398+
SubnetSelectorTerms: []v1.SubnetSelectorTerm{{Tags: map[string]string{"zone": "b"}}},
2399+
},
2400+
Status: v1.EC2NodeClassStatus{
2401+
InstanceProfile: "test-profile",
2402+
SecurityGroups: nodeClass.Status.SecurityGroups,
2403+
Subnets: []v1.Subnet{
2404+
{ID: "subnet-zone-b", Zone: "test-zone-1b", ZoneID: "tstz1-1b"},
2405+
},
2406+
},
2407+
})
2408+
nodeClassB.StatusConditions().SetTrue(status.ConditionReady)
2409+
2410+
// Step 1: Populate offering cache for both nodeclasses.
2411+
// NodeClassA's m5.large has zones=[test-zone-1a], offering cache key = f(m5.large, hash(1a)).
2412+
// NodeClassB's m5.large has zones=[test-zone-1b], offering cache key = f(m5.large, hash(1b)).
2413+
// Both set the shared lastSeqNum["m5.large"] = 0.
2414+
listA, err := awsEnv.InstanceTypesProvider.List(ctx, nodeClassA)
2415+
Expect(err).ToNot(HaveOccurred())
2416+
m5a, ok := lo.Find(listA, func(it *corecloudprovider.InstanceType) bool {
2417+
return it.Name == string(ec2types.InstanceTypeM5Large)
2418+
})
2419+
Expect(ok).To(BeTrue())
2420+
Expect(m5a.Requirements.Get(corev1.LabelTopologyZone).Values()).To(ConsistOf("test-zone-1a"))
2421+
Expect(m5a.Offerings.Compatible(scheduling.NewLabelRequirements(map[string]string{
2422+
corev1.LabelTopologyZone: "test-zone-1a",
2423+
karpv1.CapacityTypeLabelKey: karpv1.CapacityTypeOnDemand,
2424+
}))[0].Available).To(BeTrue())
2425+
2426+
_, err = awsEnv.InstanceTypesProvider.List(ctx, nodeClassB)
2427+
Expect(err).ToNot(HaveOccurred())
2428+
2429+
// Step 2: ICE marks m5.large/test-zone-1a/on-demand unavailable → seqNum becomes 1.
2430+
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "InsufficientInstanceCapacity"})
2431+
2432+
// Step 3: Query nodeClassB FIRST. Its offering cache entry (keyed by zones=[1b])
2433+
// is rebuilt because seqNum (1) != lastSeqNum (0). This stores lastSeqNum["m5.large"] = 1.
2434+
_, err = awsEnv.InstanceTypesProvider.List(ctx, nodeClassB)
2435+
Expect(err).ToNot(HaveOccurred())
2436+
2437+
// Step 4: Query nodeClassA. The offering cache check sees seqNum (1) == lastSeqNum (1)
2438+
// (updated by step 3 for a DIFFERENT cache key) → stale cache hit. The pre-ICE
2439+
// offering for test-zone-1a/on-demand is returned with Available=true.
2440+
listA, err = awsEnv.InstanceTypesProvider.List(ctx, nodeClassA)
2441+
Expect(err).ToNot(HaveOccurred())
2442+
m5a, ok = lo.Find(listA, func(it *corecloudprovider.InstanceType) bool {
2443+
return it.Name == string(ec2types.InstanceTypeM5Large)
2444+
})
2445+
Expect(ok).To(BeTrue())
2446+
2447+
// BUG: The ICE for test-zone-1a/on-demand should make this offering unavailable,
2448+
// but the stale cache returns Available=true. When fixed, change to BeFalse().
2449+
Expect(m5a.Offerings.Compatible(scheduling.NewLabelRequirements(map[string]string{
2450+
corev1.LabelTopologyZone: "test-zone-1a",
2451+
karpv1.CapacityTypeLabelKey: karpv1.CapacityTypeOnDemand,
2452+
}))[0].Available).To(BeTrue()) // BUG: should be BeFalse()
2453+
})
23742454
})
23752455
Context("CapacityType", func() {
23762456
It("should default to on-demand", func() {

0 commit comments

Comments
 (0)