fix: release stale IP assignments from crashed containers in NodeSubnet mode #4350
base: master
cns/restserver/internalapi_test.go
@@ -1758,3 +1758,154 @@
	// Return cleanup function
	return func() { svc.imdsClient = originalIMDS }
}

// TestReconcileIPAMStateForNodeSubnetReleasesStaleIPs verifies that
// ReconcileIPAMStateForNodeSubnet releases IPs that are Assigned to pods
// that are no longer running on the node (i.e. their key is not in podInfoByIP).
func TestReconcileIPAMStateForNodeSubnetReleasesStaleIPs(t *testing.T) {
	restartService()
	setEnv(t)
	setOrchestratorTypeInternal(cns.KubernetesCRD)

	// Create an NC with 4 secondary IPs
	secondaryIPConfigs := make(map[string]cns.SecondaryIPConfig)
	ipIDs := make([]string, 4)
	for i := 0; i < 4; i++ {
		ipaddress := fmt.Sprintf("10.0.0.%d", 6+i)
		secIPConfig := newSecondaryIPConfig(ipaddress, -1)
		ipID := uuid.New().String()
		ipIDs[i] = ipID
		secondaryIPConfigs[ipID] = secIPConfig
	}
	ncReq := generateNetworkContainerRequest(secondaryIPConfigs, "nodesubnet-nc1", "-1")

	// Simulate: 2 pods are currently running, and 1 stale pod (old InfraContainerID) has leaked IPs.
	// Pod1 (running) has IP 10.0.0.6
	// Pod2 (running) has IP 10.0.0.7
	// Pod1's OLD container (stale) had IP 10.0.0.8 — this should be released.
	// 10.0.0.9 is available
	activePodInfo := map[string]cns.PodInfo{
		"10.0.0.6": cns.NewPodInfo("newcontainer1", "newcont1-eth0", "pod1", "ns1"),
		"10.0.0.7": cns.NewPodInfo("container2", "cont2-eth0", "pod2", "ns1"),
	}

	returnCode := svc.ReconcileIPAMStateForNodeSubnet(
		[]*cns.CreateNetworkContainerRequest{ncReq}, activePodInfo,
	)
	require.Equal(t, types.Success, returnCode)

	// Verify: 2 IPs should be Assigned, rest should be Available
	assignedCount := 0
	availableCount := 0
	for _, ipConfig := range svc.PodIPConfigState { //nolint:gocritic // ignore copy
		switch ipConfig.GetState() {
		case types.Assigned:
			assignedCount++
		case types.Available:
			availableCount++
		}
	}
	assert.Equal(t, 2, assignedCount, "only the 2 active pods should have Assigned IPs")
	assert.Equal(t, 2, availableCount, "remaining 2 IPs should be Available")
}

// TestReconcileIPAMStateForNodeSubnetReleasesStaleFromPreviousContainer
// simulates the exact crash-loop scenario: a pod previously had an IP under an
// old InfraContainerID, and now has a new InfraContainerID. The reconciliation
// should assign the IP for the current container and release the stale one.
func TestReconcileIPAMStateForNodeSubnetReleasesStaleFromPreviousContainer(t *testing.T) {
	restartService()
	setEnv(t)
	setOrchestratorTypeInternal(cns.KubernetesCRD)

	// Create NC with 2 IPs
	secondaryIPConfigs := make(map[string]cns.SecondaryIPConfig)
	ipID1 := uuid.New().String()
	ipID2 := uuid.New().String()
	secondaryIPConfigs[ipID1] = newSecondaryIPConfig("10.0.0.6", -1)
	secondaryIPConfigs[ipID2] = newSecondaryIPConfig("10.0.0.7", -1)
	ncReq := generateNetworkContainerRequest(secondaryIPConfigs, "nodesubnet-nc2", "-1")

	// First reconcile: pod1 with OLD container gets 10.0.0.6
	oldPodInfo := map[string]cns.PodInfo{
		"10.0.0.6": cns.NewPodInfo("oldcontainer", "oldcont-eth0", "pod1", "ns1"),
	}

	returnCode := svc.ReconcileIPAMStateForNodeSubnet(
		[]*cns.CreateNetworkContainerRequest{ncReq}, oldPodInfo,
	)
	require.Equal(t, types.Success, returnCode)

	// Verify old container has 1 assigned IP
	assert.Len(t, svc.PodIPIDByPodInterfaceKey, 1)

	// Second reconcile: pod1 has restarted with a NEW container, and the IP is now 10.0.0.6 again.
	// The old container's key is no longer active.
	newPodInfo := map[string]cns.PodInfo{
		"10.0.0.6": cns.NewPodInfo("newcontainer", "newcont-eth0", "pod1", "ns1"),
	}

	returnCode = svc.ReconcileIPAMStateForNodeSubnet(
		[]*cns.CreateNetworkContainerRequest{ncReq}, newPodInfo,
	)
	require.Equal(t, types.Success, returnCode)

	// The old container key should be gone, only the new one should have an IP
	assert.Len(t, svc.PodIPIDByPodInterfaceKey, 1,
		"only the new container key should remain")

	_, hasNewKey := svc.PodIPIDByPodInterfaceKey[cns.NewPodInfo("newcontainer", "newcont-eth0", "pod1", "ns1").Key()]
	assert.True(t, hasNewKey, "new container key should be present")

	_, hasOldKey := svc.PodIPIDByPodInterfaceKey[cns.NewPodInfo("oldcontainer", "oldcont-eth0", "pod1", "ns1").Key()]
	assert.False(t, hasOldKey, "old container key should have been cleaned up")

	// Verify IP states: 1 Assigned, 1 Available
	assignedCount := 0
	availableCount := 0
	for _, ipConfig := range svc.PodIPConfigState { //nolint:gocritic // ignore copy
		switch ipConfig.GetState() {
		case types.Assigned:
			assignedCount++
		case types.Available:
			availableCount++
		}
	}
	assert.Equal(t, 1, assignedCount)
	assert.Equal(t, 1, availableCount)
}

// TestReleaseStaleAssignedIPsNoOp verifies releaseStaleAssignedIPs does nothing
// when all Assigned IPs belong to active pods.
func TestReleaseStaleAssignedIPsNoOp(t *testing.T) {
	restartService()
	setEnv(t)
	setOrchestratorTypeInternal(cns.KubernetesCRD)

	// Create NC with 2 IPs
	secondaryIPConfigs := make(map[string]cns.SecondaryIPConfig)
	ipID1 := uuid.New().String()
	ipID2 := uuid.New().String()
	secondaryIPConfigs[ipID1] = newSecondaryIPConfig("10.0.0.6", -1)
	secondaryIPConfigs[ipID2] = newSecondaryIPConfig("10.0.0.7", -1)
	ncReq := generateNetworkContainerRequest(secondaryIPConfigs, "nodesubnet-nc3", "-1")

	// Both pods are active
	podInfo := map[string]cns.PodInfo{
		"10.0.0.6": cns.NewPodInfo("container1", "cont1-eth0", "pod1", "ns1"),
		"10.0.0.7": cns.NewPodInfo("container2", "cont2-eth0", "pod2", "ns1"),
	}

	returnCode := svc.ReconcileIPAMStateForNodeSubnet(
		[]*cns.CreateNetworkContainerRequest{ncReq}, podInfo,
	)
	require.Equal(t, types.Success, returnCode)

	// Both IPs should be Assigned, neither should be released
	assert.Len(t, svc.PodIPIDByPodInterfaceKey, 2)
	for _, ipConfig := range svc.PodIPConfigState { //nolint:gocritic // ignore copy
		if ipConfig.PodInfo != nil {
			assert.Equal(t, types.Assigned, ipConfig.GetState())
		}
	}
}
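
The helper these tests exercise, releaseStaleAssignedIPs, is not itself part of this excerpt. What follows is a minimal sketch of what the reconcile-path cleanup would need to do to satisfy the assertions above; the receiver signature and the markIPAvailable helper are assumptions for illustration, not the PR's actual code.

// Sketch only: release every Assigned IP whose pod interface key is absent from
// the active-pod set handed to ReconcileIPAMStateForNodeSubnet.
func (service *HTTPRestService) releaseStaleAssignedIPs(podInfoByIP map[string]cns.PodInfo) {
	// Collect the interface keys of pods reported as active.
	activePodKeys := make(map[string]struct{}, len(podInfoByIP))
	for _, podInfo := range podInfoByIP {
		activePodKeys[podInfo.Key()] = struct{}{}
	}

	for ipID, ipConfig := range service.PodIPConfigState { //nolint:gocritic // ignore copy
		if ipConfig.GetState() != types.Assigned || ipConfig.PodInfo == nil {
			continue
		}
		if _, stillActive := activePodKeys[ipConfig.PodInfo.Key()]; stillActive {
			continue
		}
		// Stale assignment: drop the pod-to-IP binding and return the IP to the pool.
		delete(service.PodIPIDByPodInterfaceKey, ipConfig.PodInfo.Key())
		service.markIPAvailable(ipID) // hypothetical helper; the real state transition may differ
	}
}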

A second test file:
@@ -2485,3 +2485,135 @@ func TestStatelessCNIStateFile(t *testing.T) {
		})
	}
}

// TestIPAMReleaseStaleIPOnPodRestart verifies that when a pod restarts with a new
// InfraContainerID, the stale IP from the previous container is released and a new
// IP is successfully allocated. This tests the fix for the crash-loop IP exhaustion bug.
func TestIPAMReleaseStaleIPOnPodRestart(t *testing.T) {
	svc := getTestService(cns.KubernetesCRD)

	// Set up NC with 2 IPs
	ipconfigs := make(map[string]cns.IPConfigurationStatus)
	state1 := newPodState(testIP1, testIPID1, testNCID, types.Available, 0)
	ipconfigs[state1.ID] = state1
	state2 := newPodState(testIP2, testIPID2, testNCID, types.Available, 0)
	ipconfigs[state2.ID] = state2
	err := updatePodIPConfigState(t, svc, ipconfigs, testNCID)
	require.NoError(t, err)

	// Simulate first container of "mypod" getting IP1
	oldContainerID := "aaaa1111bbbb2222cccc3333dddd4444eeee5555ffff6666"
	oldInterfaceID := oldContainerID[:8] + "-eth0"
	oldPodInfo := cns.NewPodInfo(oldInterfaceID, oldContainerID, "mypod", "default")

	req1 := cns.IPConfigsRequest{
		PodInterfaceID:   oldPodInfo.InterfaceID(),
		InfraContainerID: oldPodInfo.InfraContainerID(),
	}
	b, _ := oldPodInfo.OrchestratorContext()
	req1.OrchestratorContext = b

	podIPInfo1, err := requestIPConfigsHelper(svc, req1)
	require.NoError(t, err)
	require.Len(t, podIPInfo1, 1)
	assignedIP := podIPInfo1[0].PodIPConfig.IPAddress

	// Verify the old container got an IP assigned
	assert.Contains(t, svc.PodIPIDByPodInterfaceKey, oldPodInfo.Key())

	// Simulate pod restart with a NEW InfraContainerID (no CNI DEL for old container)
	newContainerID := "1111aaaa2222bbbb3333cccc4444dddd5555eeee6666ffff"
	newInterfaceID := newContainerID[:8] + "-eth0"
	newPodInfo := cns.NewPodInfo(newInterfaceID, newContainerID, "mypod", "default")

	req2 := cns.IPConfigsRequest{
		PodInterfaceID:   newPodInfo.InterfaceID(),
		InfraContainerID: newPodInfo.InfraContainerID(),
	}
	b2, _ := newPodInfo.OrchestratorContext()
	req2.OrchestratorContext = b2

	// This should succeed: the stale IP from the old container should be released first
	podIPInfo2, err := requestIPConfigsHelper(svc, req2)
	require.NoError(t, err)
	require.Len(t, podIPInfo2, 1)

	// The released stale IP should be available and reassigned to the new container
	assert.Equal(t, assignedIP, podIPInfo2[0].PodIPConfig.IPAddress,
		"new container should get the same IP that was released from the old container")

	// Verify the old key's IP was released and the new key got an IP
	assert.NotContains(t, svc.PodIPIDByPodInterfaceKey, oldPodInfo.Key(),
		"old container key should be removed from PodIPIDByPodInterfaceKey")
	assert.Contains(t, svc.PodIPIDByPodInterfaceKey, newPodInfo.Key(),
		"new container key should exist in PodIPIDByPodInterfaceKey")
}

// TestIPAMReleaseStaleIPOnPodRestartPreventsExhaustion verifies that repeated
// pod restarts (crash-loop) do not exhaust the IP pool because each new allocation
// releases the stale IP from the previous container.
func TestIPAMReleaseStaleIPOnPodRestartPreventsExhaustion(t *testing.T) {
	svc := getTestService(cns.KubernetesCRD)

	// Set up NC with only 1 IP — any leak will cause exhaustion
	ipconfigs := make(map[string]cns.IPConfigurationStatus)
	state1 := newPodState(testIP1, testIPID1, testNCID, types.Available, 0)
	ipconfigs[state1.ID] = state1
	err := updatePodIPConfigState(t, svc, ipconfigs, testNCID)
	require.NoError(t, err)

	podName := "crashloop-pod"
	podNamespace := "default"

	// Simulate 5 consecutive container restarts (crash-loop), each with no CNI DEL
	for i := 0; i < 5; i++ {
		containerID := fmt.Sprintf("%08d0000000000000000000000000000000000000000", i)
		interfaceID := containerID[:8] + "-eth0"
		podInfo := cns.NewPodInfo(interfaceID, containerID, podName, podNamespace)

		req := cns.IPConfigsRequest{
			PodInterfaceID:   podInfo.InterfaceID(),
			InfraContainerID: podInfo.InfraContainerID(),
		}
		b, _ := podInfo.OrchestratorContext()
		req.OrchestratorContext = b

		podIPInfo, err := requestIPConfigsHelper(svc, req)
		require.NoError(t, err, "iteration %d should succeed", i)
		require.Len(t, podIPInfo, 1, "iteration %d should get 1 IP", i)
		assert.Equal(t, testIP1, podIPInfo[0].PodIPConfig.IPAddress,
			"iteration %d should get the same IP", i)
	}

	// Verify only 1 IP is assigned (no leaks)
	assignedCount := 0
	for _, ipConfig := range svc.PodIPConfigState { //nolint:gocritic // ignore copy
		if ipConfig.GetState() == types.Assigned {
			assignedCount++
		}
	}
	assert.Equal(t, 1, assignedCount, "only 1 IP should be assigned after crash-loop")
}

// TestIPAMReleaseStaleIPConfigsForPodNoStale verifies that releaseStaleIPConfigsForPod
// is a no-op when there are no stale IPs.
func TestIPAMReleaseStaleIPConfigsForPodNoStale(t *testing.T) {
	svc := getTestService(cns.KubernetesCRD)

	// Set up with a normally assigned IP
	ipconfigs := make(map[string]cns.IPConfigurationStatus)
	state1, _ := newPodStateWithOrchestratorContext(testIP1, testIPID1, testNCID, types.Assigned, ipPrefixBitsv4, 0, testPod1Info)
	ipconfigs[state1.ID] = state1
	state2 := newPodState(testIP2, testIPID2, testNCID, types.Available, 0)
	ipconfigs[state2.ID] = state2
	err := updatePodIPConfigState(t, svc, ipconfigs, testNCID)
	require.NoError(t, err)

	// Call with a different pod — should not affect testPod1's IP
	svc.releaseStaleIPConfigsForPod(testPod2Info)

	// testPod1's IP should still be assigned
	ip1State := svc.PodIPConfigState[testIPID1]
	assert.Equal(t, types.Assigned, ip1State.GetState())
	assert.Contains(t, svc.PodIPIDByPodInterfaceKey, testPod1Info.Key())
}
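
Similarly, releaseStaleIPConfigsForPod is only called by these tests, not defined in this excerpt. A sketch of the allocation-path cleanup consistent with the assertions above follows; the Name/Namespace accessors on cns.PodInfo and the markIPAvailable helper are assumptions, and the PR's actual implementation may differ.

// Sketch only: before assigning IPs for a new request, release any IP still
// Assigned to the same pod (name + namespace) under a different infra container,
// i.e. a previous container that crashed without a CNI DEL.
func (service *HTTPRestService) releaseStaleIPConfigsForPod(podInfo cns.PodInfo) {
	for ipID, ipConfig := range service.PodIPConfigState { //nolint:gocritic // ignore copy
		if ipConfig.GetState() != types.Assigned || ipConfig.PodInfo == nil {
			continue
		}
		owner := ipConfig.PodInfo
		if owner.Name() == podInfo.Name() &&
			owner.Namespace() == podInfo.Namespace() &&
			owner.Key() != podInfo.Key() {
			// Stale assignment from the pod's previous container: release it so the
			// incoming request can be served even from a fully assigned pool.
			delete(service.PodIPIDByPodInterfaceKey, owner.Key())
			service.markIPAvailable(ipID) // hypothetical helper, as above
		}
	}
}

Keying the stale check on pod name and namespace rather than on InfraContainerID is what lets the crash-loop test reuse the single available IP across restarts, while the no-op test pins down that unrelated pods' assignments are left untouched.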
A review comment on the new cleanup raises a caveat:

The new reconciliation cleanup builds activePodKeys from the podInfoByIP argument and then releases any Assigned IPs whose pod key isn't in that set. In the NodeSubnet initialization path, podInfoByIP is sourced from the CNS endpoint state store (previously saved endpoints), which may include stale entries (so leaks won't be released) or may be incomplete (risking release of IPs for still-running pods). The comment here also says "currently running pods", which isn't necessarily true for that provider. Please clarify the contract for podInfoByIP (and/or wire this to a provider that enumerates actual active pods), and consider guarding the release to avoid releasing IPs when the active set is known to be unreliable.
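
One way to picture the guard the reviewer asks for, as a sketch only: the activeSetReliable flag below does not exist in the PR and stands in for whatever signal the pod-info provider could give about the freshness of podInfoByIP.

// Sketch only: skip the stale-IP release entirely when the active-pod set may be
// stale or incomplete (e.g. when it was rebuilt from the endpoint state store),
// so that IPs of still-running pods are never reclaimed by mistake.
func (service *HTTPRestService) releaseStaleAssignedIPsGuarded(podInfoByIP map[string]cns.PodInfo, activeSetReliable bool) {
	if !activeSetReliable {
		return
	}
	service.releaseStaleAssignedIPs(podInfoByIP) // the sketch shown earlier in this section
}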
activePodKeysfrom thepodInfoByIPargument and then releases any Assigned IPs whose pod key isn’t in that set. In the NodeSubnet initialization path,podInfoByIPis sourced from the CNS endpoint state store (previous saved endpoints), which may include stale entries (so leaks won’t be released) or may be incomplete (risking release of IPs for still-running pods). The comment here also says “currently running pods”, which isn’t necessarily true for that provider. Please clarify the contract forpodInfoByIP(and/or wire this to a provider that enumerates actual active pods) and consider guarding the release to avoid releasing IPs when the active set is known to be unreliable.