Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions pkg/providers/instance/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,71 @@ var _ = Describe("InstanceProvider", func() {

Expect(priotiztied.SpotOptions.AllocationStrategy).To(Equal(ec2types.SpotAllocationStrategyCapacityOptimizedPrioritized))
})
It("should return an ICE error when a single-zone NodeClass has its only zone ICE'd for a single instance type", func() {
// Regression test for https://github.com/aws/karpenter-provider-aws/issues/8909
// Scenario: NodePool with single instance type + EC2NodeClass with subnets in only one AZ.
// After the zone gets ICE'd, subsequent launch attempts should return InsufficientCapacityError
// (not a generic CreateError from "no capacity offerings"), so the NodeClaim gets deleted and retried.

// Set up single-zone NodeClass (only test-zone-1a)
nodeClass.Spec.SubnetSelectorTerms = []v1.SubnetSelectorTerm{{Tags: map[string]string{"Name": "test-subnet-1"}}}
nodeClass.Status.Subnets = []v1.Subnet{
{ID: "subnet-test1", Zone: "test-zone-1a", ZoneID: "tstz1-1a"},
}

// Constrain to a single instance type and on-demand only
nodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{
{Key: corev1.LabelInstanceTypeStable, Operator: corev1.NodeSelectorOpIn, Values: []string{"m5.xlarge"}},
{Key: karpv1.CapacityTypeLabelKey, Operator: corev1.NodeSelectorOpIn, Values: []string{karpv1.CapacityTypeOnDemand}},
}

ExpectApplied(ctx, env.Client, nodeClaim, nodePool, nodeClass)
nodeClass = ExpectExists(ctx, env.Client, nodeClass)

// Re-hydrate caches with the single-zone subnet data so the provider only
// sees offerings in test-zone-1a.
_, err := awsEnv.SubnetProvider.List(ctx, nodeClass)
Expect(err).ToNot(HaveOccurred())
awsEnv.InstanceTypeCache.Flush()
awsEnv.OfferingCache.Flush()
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed())
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx)).To(Succeed())

// First launch: ICE from AWS for m5.xlarge in test-zone-1a
awsEnv.EC2API.InsufficientCapacityPools.Set([]fake.CapacityPool{
{CapacityType: karpv1.CapacityTypeOnDemand, InstanceType: "m5.xlarge", Zone: "test-zone-1a"},
})

instanceTypes, err := cloudProvider.GetInstanceTypes(ctx, nodePool)
Expect(err).ToNot(HaveOccurred())
instanceTypes = lo.Filter(instanceTypes, func(i *corecloudprovider.InstanceType, _ int) bool { return i.Name == "m5.xlarge" })
Expect(instanceTypes).To(HaveLen(1))

// The first Create triggers the ICE from CreateFleet and caches the unavailable offering
instance, err := awsEnv.InstanceProvider.Create(ctx, nodeClass, nodeClaim, nil, instanceTypes)
Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue())
Expect(instance).To(BeNil())

// Verify the zone is now cached as unavailable
Expect(awsEnv.UnavailableOfferingsCache.IsUnavailable("m5.xlarge", "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue())

// Second launch attempt: instance types are re-resolved with the updated unavailable cache.
// The offerings for m5.xlarge in test-zone-1a should now be marked unavailable.
// This MUST return InsufficientCapacityError (not a generic error) so the NodeClaim is deleted and retried.
instanceTypes, err = cloudProvider.GetInstanceTypes(ctx, nodePool)
Expect(err).ToNot(HaveOccurred())
instanceTypes = lo.Filter(instanceTypes, func(i *corecloudprovider.InstanceType, _ int) bool { return i.Name == "m5.xlarge" })
Expect(instanceTypes).To(HaveLen(1))

// Verify the offering is now marked unavailable on the instance type itself
availableOfferings := instanceTypes[0].Offerings.Available()
Expect(availableOfferings).To(HaveLen(0), "expected no available offerings after ICE in the only subnet zone")

instance, err = awsEnv.InstanceProvider.Create(ctx, nodeClass, nodeClaim, nil, instanceTypes)
Expect(err).To(HaveOccurred())
Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue(),
"expected InsufficientCapacityError on retry after ICE, but got: %v", err)
Expect(instance).To(BeNil())
})
It("should use price capacity optimized allocation stragaty by default for spot nodeclaims", func() {
nodeClaim.Spec.Requirements = []karpv1.NodeSelectorRequirementWithMinValues{
{
Expand Down
80 changes: 80 additions & 0 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2371,6 +2371,86 @@ var _ = Describe("InstanceTypeProvider", func() {
Expect(zones).To(HaveLen(2))
Expect(zones.UnsortedList()).To(ConsistOf([]string{"test-zone-1b", "test-zone-1c"}))
})
It("should invalidate offering cache for an instance type across different nodeclasses when an ICE error occurs", func() {
// BUG: The offering cache's lastUnavailableOfferingsSeqNum is keyed by instance type
// name only, not by the full offering cache key (name + zones hash). When two
// nodeclasses have different subnet zones, they produce different offering cache keys
// for the same instance type. After an ICE error increments the seqNum, whichever
// nodeclass's offerings are rebuilt FIRST updates the shared lastSeqNum. The other
// nodeclass then sees lastSeqNum == seqNum (a false "up-to-date" signal) and gets
// a stale cache hit with pre-ICE availability data.
//
// Two NodeClasses differing ONLY in their subnet zone (1a vs 1b) so that the
// same instance type resolves to distinct offering cache keys.
nodeClassA := test.EC2NodeClass(v1.EC2NodeClass{
Spec: v1.EC2NodeClassSpec{
SubnetSelectorTerms: []v1.SubnetSelectorTerm{{Tags: map[string]string{"zone": "a"}}},
},
Status: v1.EC2NodeClassStatus{
InstanceProfile: "test-profile",
SecurityGroups: nodeClass.Status.SecurityGroups,
Subnets: []v1.Subnet{
{ID: "subnet-zone-a", Zone: "test-zone-1a", ZoneID: "tstz1-1a"},
},
},
})
// Mark ready so the instance-type provider will resolve offerings for it.
nodeClassA.StatusConditions().SetTrue(status.ConditionReady)

nodeClassB := test.EC2NodeClass(v1.EC2NodeClass{
Spec: v1.EC2NodeClassSpec{
SubnetSelectorTerms: []v1.SubnetSelectorTerm{{Tags: map[string]string{"zone": "b"}}},
},
Status: v1.EC2NodeClassStatus{
InstanceProfile: "test-profile",
SecurityGroups: nodeClass.Status.SecurityGroups,
Subnets: []v1.Subnet{
{ID: "subnet-zone-b", Zone: "test-zone-1b", ZoneID: "tstz1-1b"},
},
},
})
nodeClassB.StatusConditions().SetTrue(status.ConditionReady)

// Step 1: Populate offering cache for both nodeclasses.
// NodeClassA's m5.large has zones=[test-zone-1a], offering cache key = f(m5.large, hash(1a)).
// NodeClassB's m5.large has zones=[test-zone-1b], offering cache key = f(m5.large, hash(1b)).
// Both set the shared lastSeqNum["m5.large"] = 0.
listA, err := awsEnv.InstanceTypesProvider.List(ctx, nodeClassA)
Expect(err).ToNot(HaveOccurred())
m5a, ok := lo.Find(listA, func(it *corecloudprovider.InstanceType) bool {
return it.Name == string(ec2types.InstanceTypeM5Large)
})
Expect(ok).To(BeTrue())
// Sanity: NodeClassA's m5.large only offers test-zone-1a, and the
// on-demand offering there starts out available (pre-ICE baseline).
Expect(m5a.Requirements.Get(corev1.LabelTopologyZone).Values()).To(ConsistOf("test-zone-1a"))
Expect(m5a.Offerings.Compatible(scheduling.NewLabelRequirements(map[string]string{
corev1.LabelTopologyZone: "test-zone-1a",
karpv1.CapacityTypeLabelKey: karpv1.CapacityTypeOnDemand,
}))[0].Available).To(BeTrue())

_, err = awsEnv.InstanceTypesProvider.List(ctx, nodeClassB)
Expect(err).ToNot(HaveOccurred())

// Step 2: ICE marks m5.large/test-zone-1a/on-demand unavailable → seqNum becomes 1.
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "InsufficientInstanceCapacity"})

// Step 3: Query nodeClassB FIRST. Its offering cache entry (keyed by zones=[1b])
// is rebuilt because seqNum (1) != lastSeqNum (0). This stores lastSeqNum["m5.large"] = 1.
// Ordering matters: this is what poisons the subsequent nodeClassA lookup.
_, err = awsEnv.InstanceTypesProvider.List(ctx, nodeClassB)
Expect(err).ToNot(HaveOccurred())

// Step 4: Query nodeClassA. The offering cache check sees seqNum (1) == lastSeqNum (1)
// (updated by step 3 for a DIFFERENT cache key) → stale cache hit. The pre-ICE
// offering for test-zone-1a/on-demand is returned with Available=true.
listA, err = awsEnv.InstanceTypesProvider.List(ctx, nodeClassA)
Expect(err).ToNot(HaveOccurred())
m5a, ok = lo.Find(listA, func(it *corecloudprovider.InstanceType) bool {
return it.Name == string(ec2types.InstanceTypeM5Large)
})
Expect(ok).To(BeTrue())

// BUG: The ICE for test-zone-1a/on-demand should make this offering unavailable,
// but the stale cache returns Available=true. When fixed, change to BeFalse().
// This assertion intentionally pins the CURRENT (buggy) behavior so the test
// fails loudly — prompting an update — once the cache keying is corrected.
Expect(m5a.Offerings.Compatible(scheduling.NewLabelRequirements(map[string]string{
corev1.LabelTopologyZone: "test-zone-1a",
karpv1.CapacityTypeLabelKey: karpv1.CapacityTypeOnDemand,
}))[0].Available).To(BeTrue()) // BUG: should be BeFalse()
})
})
Context("CapacityType", func() {
It("should default to on-demand", func() {
Expand Down
Loading