@@ -2371,6 +2371,86 @@ var _ = Describe("InstanceTypeProvider", func() {
23712371 Expect (zones ).To (HaveLen (2 ))
23722372 Expect (zones .UnsortedList ()).To (ConsistOf ([]string {"test-zone-1b" , "test-zone-1c" }))
23732373 })
2374+ It ("should invalidate offering cache for an instance type across different nodeclasses when an ICE error occurs" , func () {
2375+ // BUG: The offering cache's lastUnavailableOfferingsSeqNum is keyed by instance type
2376+ // name only, not by the full offering cache key (name + zones hash). When two
2377+ // nodeclasses have different subnet zones, they produce different offering cache keys
2378+ // for the same instance type. After an ICE error increments the seqNum, whichever
2379+ // nodeclass's offerings are rebuilt FIRST updates the shared lastSeqNum. The other
2380+ // nodeclass then sees lastSeqNum == seqNum (a false "up-to-date" signal) and gets
2381+ // a stale cache hit with pre-ICE availability data.
2382+ nodeClassA := test .EC2NodeClass (v1.EC2NodeClass {
2383+ Spec : v1.EC2NodeClassSpec {
2384+ SubnetSelectorTerms : []v1.SubnetSelectorTerm {{Tags : map [string ]string {"zone" : "a" }}},
2385+ },
2386+ Status : v1.EC2NodeClassStatus {
2387+ InstanceProfile : "test-profile" ,
2388+ SecurityGroups : nodeClass .Status .SecurityGroups ,
2389+ Subnets : []v1.Subnet {
2390+ {ID : "subnet-zone-a" , Zone : "test-zone-1a" , ZoneID : "tstz1-1a" },
2391+ },
2392+ },
2393+ })
2394+ nodeClassA .StatusConditions ().SetTrue (status .ConditionReady )
2395+
2396+ nodeClassB := test .EC2NodeClass (v1.EC2NodeClass {
2397+ Spec : v1.EC2NodeClassSpec {
2398+ SubnetSelectorTerms : []v1.SubnetSelectorTerm {{Tags : map [string ]string {"zone" : "b" }}},
2399+ },
2400+ Status : v1.EC2NodeClassStatus {
2401+ InstanceProfile : "test-profile" ,
2402+ SecurityGroups : nodeClass .Status .SecurityGroups ,
2403+ Subnets : []v1.Subnet {
2404+ {ID : "subnet-zone-b" , Zone : "test-zone-1b" , ZoneID : "tstz1-1b" },
2405+ },
2406+ },
2407+ })
2408+ nodeClassB .StatusConditions ().SetTrue (status .ConditionReady )
2409+
2410+ // Step 1: Populate offering cache for both nodeclasses.
2411+ // NodeClassA's m5.large has zones=[test-zone-1a], offering cache key = f(m5.large, hash(1a)).
2412+ // NodeClassB's m5.large has zones=[test-zone-1b], offering cache key = f(m5.large, hash(1b)).
2413+ // Both set the shared lastSeqNum["m5.large"] = 0.
2414+ listA , err := awsEnv .InstanceTypesProvider .List (ctx , nodeClassA )
2415+ Expect (err ).ToNot (HaveOccurred ())
2416+ m5a , ok := lo .Find (listA , func (it * corecloudprovider.InstanceType ) bool {
2417+ return it .Name == string (ec2types .InstanceTypeM5Large )
2418+ })
2419+ Expect (ok ).To (BeTrue ())
2420+ Expect (m5a .Requirements .Get (corev1 .LabelTopologyZone ).Values ()).To (ConsistOf ("test-zone-1a" ))
2421+ Expect (m5a .Offerings .Compatible (scheduling .NewLabelRequirements (map [string ]string {
2422+ corev1 .LabelTopologyZone : "test-zone-1a" ,
2423+ karpv1 .CapacityTypeLabelKey : karpv1 .CapacityTypeOnDemand ,
2424+ }))[0 ].Available ).To (BeTrue ())
2425+
2426+ _ , err = awsEnv .InstanceTypesProvider .List (ctx , nodeClassB )
2427+ Expect (err ).ToNot (HaveOccurred ())
2428+
2429+ // Step 2: ICE marks m5.large/test-zone-1a/on-demand unavailable → seqNum becomes 1.
2430+ awsEnv .UnavailableOfferingsCache .MarkUnavailable (ctx , ec2types .InstanceTypeM5Large , "test-zone-1a" , karpv1 .CapacityTypeOnDemand , map [string ]string {"reason" : "InsufficientInstanceCapacity" })
2431+
2432+ // Step 3: Query nodeClassB FIRST. Its offering cache entry (keyed by zones=[1b])
2433+ // is rebuilt because seqNum (1) != lastSeqNum (0). This stores lastSeqNum["m5.large"] = 1.
2434+ _ , err = awsEnv .InstanceTypesProvider .List (ctx , nodeClassB )
2435+ Expect (err ).ToNot (HaveOccurred ())
2436+
2437+ // Step 4: Query nodeClassA. The offering cache check sees seqNum (1) == lastSeqNum (1)
2438+ // (updated by step 3 for a DIFFERENT cache key) → stale cache hit. The pre-ICE
2439+ // offering for test-zone-1a/on-demand is returned with Available=true.
2440+ listA , err = awsEnv .InstanceTypesProvider .List (ctx , nodeClassA )
2441+ Expect (err ).ToNot (HaveOccurred ())
2442+ m5a , ok = lo .Find (listA , func (it * corecloudprovider.InstanceType ) bool {
2443+ return it .Name == string (ec2types .InstanceTypeM5Large )
2444+ })
2445+ Expect (ok ).To (BeTrue ())
2446+
2447+ // BUG: The ICE for test-zone-1a/on-demand should make this offering unavailable,
2448+ // but the stale cache returns Available=true. When fixed, change to BeFalse().
2449+ Expect (m5a .Offerings .Compatible (scheduling .NewLabelRequirements (map [string ]string {
2450+ corev1 .LabelTopologyZone : "test-zone-1a" ,
2451+ karpv1 .CapacityTypeLabelKey : karpv1 .CapacityTypeOnDemand ,
2452+ }))[0 ].Available ).To (BeTrue ()) // BUG: should be BeFalse()
2453+ })
23742454 })
23752455 Context ("CapacityType" , func () {
23762456 It ("should default to on-demand" , func () {
0 commit comments