diff --git a/cns/restserver/internalapi.go b/cns/restserver/internalapi.go
index 9f855ec64f..a857bda9a8 100644
--- a/cns/restserver/internalapi.go
+++ b/cns/restserver/internalapi.go
@@ -459,9 +459,47 @@ func (service *HTTPRestService) ReconcileIPAMStateForNodeSubnet(ncReqs []*cns.Cr
         return returnCode
     }
 
+    // Build set of active pod keys and release any stale Assigned IPs that
+    // don't correspond to currently running pods. This handles cases where
+    // CNS state was persisted with orphaned IP assignments from previous
+    // container restarts.
+    activePodKeys := make(map[string]bool, len(podInfoByIP))
+    for _, pi := range podInfoByIP {
+        activePodKeys[pi.Key()] = true
+    }
+    service.releaseStaleAssignedIPs(activePodKeys)
+
     return types.Success
 }
 
+// releaseStaleAssignedIPs releases any IPs that are marked as Assigned but whose
+// pod key is not in the set of active pod keys. This is used during startup
+// reconciliation to clean up leaked IP assignments from previous container sandboxes.
+func (service *HTTPRestService) releaseStaleAssignedIPs(activePodKeys map[string]bool) {
+    // Collect stale pod infos under read lock.
+    service.RLock()
+    stalePodInfos := make(map[string]cns.PodInfo)
+    for _, ipConfig := range service.PodIPConfigState {
+        if ipConfig.GetState() != types.Assigned || ipConfig.PodInfo == nil {
+            continue
+        }
+        if !activePodKeys[ipConfig.PodInfo.Key()] {
+            stalePodInfos[ipConfig.PodInfo.Key()] = ipConfig.PodInfo
+        }
+    }
+    service.RUnlock()
+
+    // Release stale IPs. Each releaseIPConfigs call acquires its own write lock.
+    for _, stalePodInfo := range stalePodInfos {
+        logger.Printf("[releaseStaleAssignedIPs] Releasing stale IP for key %s (pod %s/%s)",
+            stalePodInfo.Key(), stalePodInfo.Name(), stalePodInfo.Namespace())
+        if err := service.releaseIPConfigs(stalePodInfo); err != nil {
+            logger.Errorf("[releaseStaleAssignedIPs] Failed to release stale IPs for key %s: %v",
+                stalePodInfo.Key(), err)
+        }
+    }
+}
+
 var (
     errIPParse             = errors.New("parse IP")
     errMultipleIPPerFamily = errors.New("multiple IPs per family")
diff --git a/cns/restserver/internalapi_test.go b/cns/restserver/internalapi_test.go
index 440e3b4e61..5f7bd66d9a 100644
--- a/cns/restserver/internalapi_test.go
+++ b/cns/restserver/internalapi_test.go
@@ -1758,3 +1758,154 @@ func setupIMDSMockAPIsWithCustomIDs(svc *HTTPRestService, interfaceIDs []string)
     // Return cleanup function
     return func() { svc.imdsClient = originalIMDS }
 }
+
+// TestReconcileIPAMStateForNodeSubnetReleasesStaleIPs verifies that
+// ReconcileIPAMStateForNodeSubnet releases IPs that are Assigned to pods
+// that are no longer running on the node (i.e. their key is not in podInfoByIP).
+func TestReconcileIPAMStateForNodeSubnetReleasesStaleIPs(t *testing.T) {
+    restartService()
+    setEnv(t)
+    setOrchestratorTypeInternal(cns.KubernetesCRD)
+
+    // Create an NC with 4 secondary IPs
+    secondaryIPConfigs := make(map[string]cns.SecondaryIPConfig)
+    ipIDs := make([]string, 4)
+    for i := 0; i < 4; i++ {
+        ipaddress := fmt.Sprintf("10.0.0.%d", 6+i)
+        secIPConfig := newSecondaryIPConfig(ipaddress, -1)
+        ipID := uuid.New().String()
+        ipIDs[i] = ipID
+        secondaryIPConfigs[ipID] = secIPConfig
+    }
+    ncReq := generateNetworkContainerRequest(secondaryIPConfigs, "nodesubnet-nc1", "-1")
+
+    // Simulate: 2 pods are currently running, and 1 stale pod (old InfraContainerID) has leaked IPs.
+    // Pod1 (running) has IP 10.0.0.6
+    // Pod2 (running) has IP 10.0.0.7
+    // Pod1's OLD container (stale) had IP 10.0.0.8 — this should be released.
+    // 10.0.0.9 is available
+    activePodInfo := map[string]cns.PodInfo{
+        "10.0.0.6": cns.NewPodInfo("newcontainer1", "newcont1-eth0", "pod1", "ns1"),
+        "10.0.0.7": cns.NewPodInfo("container2", "cont2-eth0", "pod2", "ns1"),
+    }
+
+    returnCode := svc.ReconcileIPAMStateForNodeSubnet(
+        []*cns.CreateNetworkContainerRequest{ncReq}, activePodInfo,
+    )
+    require.Equal(t, types.Success, returnCode)
+
+    // Verify: 2 IPs should be Assigned, rest should be Available
+    assignedCount := 0
+    availableCount := 0
+    for _, ipConfig := range svc.PodIPConfigState { //nolint:gocritic // ignore copy
+        switch ipConfig.GetState() {
+        case types.Assigned:
+            assignedCount++
+        case types.Available:
+            availableCount++
+        }
+    }
+    assert.Equal(t, 2, assignedCount, "only the 2 active pods should have Assigned IPs")
+    assert.Equal(t, 2, availableCount, "remaining 2 IPs should be Available")
+}
+
+// TestReconcileIPAMStateForNodeSubnetReleasesStaleFromPreviousContainer
+// simulates the exact crash-loop scenario: a pod previously had an IP under an
+// old InfraContainerID, and now has a new InfraContainerID. The reconciliation
+// should assign the IP for the current container and release the stale one.
+func TestReconcileIPAMStateForNodeSubnetReleasesStaleFromPreviousContainer(t *testing.T) {
+    restartService()
+    setEnv(t)
+    setOrchestratorTypeInternal(cns.KubernetesCRD)
+
+    // Create NC with 2 IPs
+    secondaryIPConfigs := make(map[string]cns.SecondaryIPConfig)
+    ipID1 := uuid.New().String()
+    ipID2 := uuid.New().String()
+    secondaryIPConfigs[ipID1] = newSecondaryIPConfig("10.0.0.6", -1)
+    secondaryIPConfigs[ipID2] = newSecondaryIPConfig("10.0.0.7", -1)
+    ncReq := generateNetworkContainerRequest(secondaryIPConfigs, "nodesubnet-nc2", "-1")
+
+    // First reconcile: pod1 with OLD container gets 10.0.0.6
+    oldPodInfo := map[string]cns.PodInfo{
+        "10.0.0.6": cns.NewPodInfo("oldcontainer", "oldcont-eth0", "pod1", "ns1"),
+    }
+
+    returnCode := svc.ReconcileIPAMStateForNodeSubnet(
+        []*cns.CreateNetworkContainerRequest{ncReq}, oldPodInfo,
+    )
+    require.Equal(t, types.Success, returnCode)
+
+    // Verify old container has 1 assigned IP
+    assert.Len(t, svc.PodIPIDByPodInterfaceKey, 1)
+
+    // Second reconcile: pod1 has restarted with NEW container, IP is now 10.0.0.6 again
+    // The old container's key is no longer active.
+    newPodInfo := map[string]cns.PodInfo{
+        "10.0.0.6": cns.NewPodInfo("newcontainer", "newcont-eth0", "pod1", "ns1"),
+    }
+
+    returnCode = svc.ReconcileIPAMStateForNodeSubnet(
+        []*cns.CreateNetworkContainerRequest{ncReq}, newPodInfo,
+    )
+    require.Equal(t, types.Success, returnCode)
+
+    // The old container key should be gone, only the new one should have an IP
+    assert.Len(t, svc.PodIPIDByPodInterfaceKey, 1,
+        "only the new container key should remain")
+
+    _, hasNewKey := svc.PodIPIDByPodInterfaceKey[cns.NewPodInfo("newcontainer", "newcont-eth0", "pod1", "ns1").Key()]
+    assert.True(t, hasNewKey, "new container key should be present")
+
+    _, hasOldKey := svc.PodIPIDByPodInterfaceKey[cns.NewPodInfo("oldcontainer", "oldcont-eth0", "pod1", "ns1").Key()]
+    assert.False(t, hasOldKey, "old container key should have been cleaned up")
+
+    // Verify IP states: 1 Assigned, 1 Available
+    assignedCount := 0
+    availableCount := 0
+    for _, ipConfig := range svc.PodIPConfigState { //nolint:gocritic // ignore copy
+        switch ipConfig.GetState() {
+        case types.Assigned:
+            assignedCount++
+        case types.Available:
+            availableCount++
+        }
+    }
+    assert.Equal(t, 1, assignedCount)
+    assert.Equal(t, 1, availableCount)
+}
+
+// TestReleaseStaleAssignedIPsNoOp verifies releaseStaleAssignedIPs does nothing
+// when all Assigned IPs belong to active pods.
+func TestReleaseStaleAssignedIPsNoOp(t *testing.T) {
+    restartService()
+    setEnv(t)
+    setOrchestratorTypeInternal(cns.KubernetesCRD)
+
+    // Create NC with 2 IPs
+    secondaryIPConfigs := make(map[string]cns.SecondaryIPConfig)
+    ipID1 := uuid.New().String()
+    ipID2 := uuid.New().String()
+    secondaryIPConfigs[ipID1] = newSecondaryIPConfig("10.0.0.6", -1)
+    secondaryIPConfigs[ipID2] = newSecondaryIPConfig("10.0.0.7", -1)
+    ncReq := generateNetworkContainerRequest(secondaryIPConfigs, "nodesubnet-nc3", "-1")
+
+    // Both pods are active
+    podInfo := map[string]cns.PodInfo{
+        "10.0.0.6": cns.NewPodInfo("container1", "cont1-eth0", "pod1", "ns1"),
+        "10.0.0.7": cns.NewPodInfo("container2", "cont2-eth0", "pod2", "ns1"),
+    }
+
+    returnCode := svc.ReconcileIPAMStateForNodeSubnet(
+        []*cns.CreateNetworkContainerRequest{ncReq}, podInfo,
+    )
+    require.Equal(t, types.Success, returnCode)
+
+    // Both IPs should be Assigned, neither should be released
+    assert.Len(t, svc.PodIPIDByPodInterfaceKey, 2)
+    for _, ipConfig := range svc.PodIPConfigState { //nolint:gocritic // ignore copy
+        if ipConfig.PodInfo != nil {
+            assert.Equal(t, types.Assigned, ipConfig.GetState())
+        }
+    }
+}
diff --git a/cns/restserver/ipam.go b/cns/restserver/ipam.go
index f2f18e2b91..b751b102b8 100644
--- a/cns/restserver/ipam.go
+++ b/cns/restserver/ipam.go
@@ -856,6 +856,43 @@ func (service *HTTPRestService) releaseIPConfigs(podInfo cns.PodInfo) error {
     return nil
 }
 
+// releaseStaleIPConfigsForPod finds and releases IP configs that are assigned
+// to the same pod (by name and namespace) but with a different interface key.
+// This cleans up leaked IPs from previous container sandboxes of a restarted pod,
+// where each restart created a new InfraContainerID and thus a new key, but the
+// old IP assignment was never released via CNI DEL.
+func (service *HTTPRestService) releaseStaleIPConfigsForPod(podInfo cns.PodInfo) {
+    if podInfo.Name() == "" || podInfo.Namespace() == "" {
+        return
+    }
+
+    // Collect stale pod infos under read lock. A stale entry is one that is
+    // Assigned to the same pod name+namespace but under a different interface key
+    // (i.e. a previous sandbox's InfraContainerID).
+    service.RLock()
+    stalePodKeys := make(map[string]cns.PodInfo)
+    for _, ipConfig := range service.PodIPConfigState {
+        if ipConfig.GetState() != types.Assigned || ipConfig.PodInfo == nil {
+            continue
+        }
+        if ipConfig.PodInfo.Name() == podInfo.Name() &&
+            ipConfig.PodInfo.Namespace() == podInfo.Namespace() &&
+            ipConfig.PodInfo.Key() != podInfo.Key() {
+            stalePodKeys[ipConfig.PodInfo.Key()] = ipConfig.PodInfo
+        }
+    }
+    service.RUnlock()
+
+    // Release stale IPs. Each releaseIPConfigs call acquires its own write lock.
+    for _, stalePodInfo := range stalePodKeys {
+        logger.Printf("[releaseStaleIPConfigsForPod] Releasing stale IPs for pod %s/%s with old key %s (current key: %s)",
+            stalePodInfo.Name(), stalePodInfo.Namespace(), stalePodInfo.Key(), podInfo.Key())
+        if err := service.releaseIPConfigs(stalePodInfo); err != nil {
+            logger.Errorf("[releaseStaleIPConfigsForPod] Failed to release stale IPs for key %s: %v", stalePodInfo.Key(), err)
+        }
+    }
+}
+
 // MarkExistingIPsAsPendingRelease is called when CNS is starting up and there are existing ipconfigs in the CRD that are marked as pending.
 func (service *HTTPRestService) MarkExistingIPsAsPendingRelease(pendingIPIDs []string) error {
     service.Lock()
@@ -1126,6 +1163,12 @@ func requestIPConfigsHelper(service *HTTPRestService, req cns.IPConfigsRequest)
         return podIPInfo, err
     }
 
+    // Release any stale IP assignments from previous containers of the same pod.
+    // When a pod's container restarts, it gets a new InfraContainerID and thus a
+    // new interface key. The old IP assignment (under the old key) is orphaned
+    // if CNI DEL was not called or failed. Clean those up before allocating.
+    service.releaseStaleIPConfigsForPod(podInfo)
+
     // if the desired IP configs are not specified, assign any free IPConfigs
     if len(req.DesiredIPAddresses) == 0 {
         return service.AssignAvailableIPConfigs(podInfo)
diff --git a/cns/restserver/ipam_test.go b/cns/restserver/ipam_test.go
index 87032d33ca..5211e96851 100644
--- a/cns/restserver/ipam_test.go
+++ b/cns/restserver/ipam_test.go
@@ -2485,3 +2485,135 @@ func TestStatelessCNIStateFile(t *testing.T) {
         })
     }
 }
+
+// TestIPAMReleaseStaleIPOnPodRestart verifies that when a pod restarts with a new
+// InfraContainerID, the stale IP from the previous container is released and a new
+// IP is successfully allocated. This tests the fix for the crash-loop IP exhaustion bug.
+func TestIPAMReleaseStaleIPOnPodRestart(t *testing.T) {
+    svc := getTestService(cns.KubernetesCRD)
+
+    // Set up NC with 2 IPs
+    ipconfigs := make(map[string]cns.IPConfigurationStatus)
+    state1 := newPodState(testIP1, testIPID1, testNCID, types.Available, 0)
+    ipconfigs[state1.ID] = state1
+    state2 := newPodState(testIP2, testIPID2, testNCID, types.Available, 0)
+    ipconfigs[state2.ID] = state2
+    err := updatePodIPConfigState(t, svc, ipconfigs, testNCID)
+    require.NoError(t, err)
+
+    // Simulate first container of "mypod" getting IP1
+    oldContainerID := "aaaa1111bbbb2222cccc3333dddd4444eeee5555ffff6666"
+    oldInterfaceID := oldContainerID[:8] + "-eth0"
+    oldPodInfo := cns.NewPodInfo(oldContainerID, oldInterfaceID, "mypod", "default")
+
+    req1 := cns.IPConfigsRequest{
+        PodInterfaceID:   oldPodInfo.InterfaceID(),
+        InfraContainerID: oldPodInfo.InfraContainerID(),
+    }
+    b, _ := oldPodInfo.OrchestratorContext()
+    req1.OrchestratorContext = b
+
+    podIPInfo1, err := requestIPConfigsHelper(svc, req1)
+    require.NoError(t, err)
+    require.Len(t, podIPInfo1, 1)
+    assignedIP := podIPInfo1[0].PodIPConfig.IPAddress
+
+    // Verify the old container got an IP assigned
+    assert.Contains(t, svc.PodIPIDByPodInterfaceKey, oldPodInfo.Key())
+
+    // Simulate pod restart with a NEW InfraContainerID (no CNI DEL for old container)
+    newContainerID := "1111aaaa2222bbbb3333cccc4444dddd5555eeee6666ffff"
+    newInterfaceID := newContainerID[:8] + "-eth0"
+    newPodInfo := cns.NewPodInfo(newContainerID, newInterfaceID, "mypod", "default")
+
+    req2 := cns.IPConfigsRequest{
+        PodInterfaceID:   newPodInfo.InterfaceID(),
+        InfraContainerID: newPodInfo.InfraContainerID(),
+    }
+    b2, _ := newPodInfo.OrchestratorContext()
+    req2.OrchestratorContext = b2
+
+    // This should succeed: the stale IP from the old container should be released first
+    podIPInfo2, err := requestIPConfigsHelper(svc, req2)
+    require.NoError(t, err)
+    require.Len(t, podIPInfo2, 1)
+
+    // The released stale IP should be available and reassigned to the new container
+    assert.Equal(t, assignedIP, podIPInfo2[0].PodIPConfig.IPAddress,
+        "new container should get the same IP that was released from the old container")
+
+    // Verify the old key's IP was released and the new key got an IP
+    assert.NotContains(t, svc.PodIPIDByPodInterfaceKey, oldPodInfo.Key(),
+        "old container key should be removed from PodIPIDByPodInterfaceKey")
+    assert.Contains(t, svc.PodIPIDByPodInterfaceKey, newPodInfo.Key(),
+        "new container key should exist in PodIPIDByPodInterfaceKey")
+}
+
+// TestIPAMReleaseStaleIPOnPodRestartPreventsExhaustion verifies that repeated
+// pod restarts (crash-loop) do not exhaust the IP pool because each new allocation
+// releases the stale IP from the previous container.
+func TestIPAMReleaseStaleIPOnPodRestartPreventsExhaustion(t *testing.T) {
+    svc := getTestService(cns.KubernetesCRD)
+
+    // Set up NC with only 1 IP — any leak will cause exhaustion
+    ipconfigs := make(map[string]cns.IPConfigurationStatus)
+    state1 := newPodState(testIP1, testIPID1, testNCID, types.Available, 0)
+    ipconfigs[state1.ID] = state1
+    err := updatePodIPConfigState(t, svc, ipconfigs, testNCID)
+    require.NoError(t, err)
+
+    podName := "crashloop-pod"
+    podNamespace := "default"
+
+    // Simulate 5 consecutive container restarts (crash-loop), each with no CNI DEL
+    for i := 0; i < 5; i++ {
+        containerID := fmt.Sprintf("%08d0000000000000000000000000000000000000000", i)
+        interfaceID := containerID[:8] + "-eth0"
+        podInfo := cns.NewPodInfo(containerID, interfaceID, podName, podNamespace)
+
+        req := cns.IPConfigsRequest{
+            PodInterfaceID:   podInfo.InterfaceID(),
+            InfraContainerID: podInfo.InfraContainerID(),
+        }
+        b, _ := podInfo.OrchestratorContext()
+        req.OrchestratorContext = b
+
+        podIPInfo, err := requestIPConfigsHelper(svc, req)
+        require.NoError(t, err, "iteration %d should succeed", i)
+        require.Len(t, podIPInfo, 1, "iteration %d should get 1 IP", i)
+        assert.Equal(t, testIP1, podIPInfo[0].PodIPConfig.IPAddress,
+            "iteration %d should get the same IP", i)
+    }
+
+    // Verify only 1 IP is assigned (no leaks)
+    assignedCount := 0
+    for _, ipConfig := range svc.PodIPConfigState { //nolint:gocritic // ignore copy
+        if ipConfig.GetState() == types.Assigned {
+            assignedCount++
+        }
+    }
+    assert.Equal(t, 1, assignedCount, "only 1 IP should be assigned after crash-loop")
+}
+
+// TestIPAMReleaseStaleIPConfigsForPodNoStale verifies that releaseStaleIPConfigsForPod
+// is a no-op when there are no stale IPs.
+func TestIPAMReleaseStaleIPConfigsForPodNoStale(t *testing.T) {
+    svc := getTestService(cns.KubernetesCRD)
+
+    // Set up with a normally assigned IP
+    ipconfigs := make(map[string]cns.IPConfigurationStatus)
+    state1, _ := newPodStateWithOrchestratorContext(testIP1, testIPID1, testNCID, types.Assigned, ipPrefixBitsv4, 0, testPod1Info)
+    ipconfigs[state1.ID] = state1
+    state2 := newPodState(testIP2, testIPID2, testNCID, types.Available, 0)
+    ipconfigs[state2.ID] = state2
+    err := updatePodIPConfigState(t, svc, ipconfigs, testNCID)
+    require.NoError(t, err)
+
+    // Call with a different pod — should not affect testPod1's IP
+    svc.releaseStaleIPConfigsForPod(testPod2Info)
+
+    // testPod1's IP should still be assigned
+    ip1State := svc.PodIPConfigState[testIPID1]
+    assert.Equal(t, types.Assigned, ip1State.GetState())
+    assert.Contains(t, svc.PodIPIDByPodInterfaceKey, testPod1Info.Key())
+}
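
Reviewer note (illustrative sketch, not part of the patch): both new helpers key IP assignments by the sandbox-specific interface key (derived from the InfraContainerID), while the decision about what is stale is made on pod identity, either name/namespace or the reconciled active-key set. The standalone Go sketch below uses only plain maps and hypothetical names (assignment, ipamState, releaseStaleForPod) to show the collect-then-release pattern the patch relies on: stale entries are gathered under the read lock, then released afterwards so the release path is free to take the write lock itself.

package main

import (
    "fmt"
    "sync"
)

// assignment is a simplified, hypothetical stand-in for an Assigned IP config:
// it records which pod (name/namespace) and which sandbox key currently owns an IP.
type assignment struct {
    podName, podNamespace, key string
}

// ipamState is a toy model of an IP assignment table guarded by an RWMutex.
type ipamState struct {
    mu           sync.RWMutex
    assignedByIP map[string]assignment // IP address -> current owner
}

// releaseStaleForPod mirrors the collect-then-release pattern from the patch:
// gather entries that belong to the same pod but a different sandbox key under
// the read lock, then release them one by one under the write lock afterwards.
func (s *ipamState) releaseStaleForPod(name, namespace, currentKey string) {
    s.mu.RLock()
    var staleIPs []string
    for ip, a := range s.assignedByIP {
        if a.podName == name && a.podNamespace == namespace && a.key != currentKey {
            staleIPs = append(staleIPs, ip)
        }
    }
    s.mu.RUnlock()

    for _, ip := range staleIPs {
        s.mu.Lock()
        fmt.Printf("releasing stale IP %s held by old key %s\n", ip, s.assignedByIP[ip].key)
        delete(s.assignedByIP, ip)
        s.mu.Unlock()
    }
}

func main() {
    s := &ipamState{assignedByIP: map[string]assignment{
        // Leaked assignment from a previous sandbox of the same pod (no CNI DEL).
        "10.0.0.6": {podName: "mypod", podNamespace: "default", key: "oldcontainer"},
    }}
    // A new sandbox of the same pod requests an IP: sweep stale keys first,
    // then the freed IP can be assigned to "newcontainer".
    s.releaseStaleForPod("mypod", "default", "newcontainer")
    fmt.Println("assigned IPs remaining:", len(s.assignedByIP)) // 0
}

Releasing outside the read lock leaves a small window in which state can change between collection and release; in the patch this appears to be absorbed by releaseIPConfigs, which takes its own write lock and re-reads state by key before acting.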