fix: release stale IP assignments from crashed containers in NodeSubnet mode #4350
base: master
cns/restserver/internalapi_test.go
@@ -1758,3 +1758,154 @@
	// Return cleanup function
	return func() { svc.imdsClient = originalIMDS }
}

// TestReconcileIPAMStateForNodeSubnetReleasesStaleIPs verifies that
// ReconcileIPAMStateForNodeSubnet releases IPs that are Assigned to pods
// that are no longer running on the node (i.e. their key is not in podInfoByIP).
func TestReconcileIPAMStateForNodeSubnetReleasesStaleIPs(t *testing.T) {
	restartService()
	setEnv(t)
	setOrchestratorTypeInternal(cns.KubernetesCRD)

	// Create an NC with 4 secondary IPs
	secondaryIPConfigs := make(map[string]cns.SecondaryIPConfig)
	ipIDs := make([]string, 4)
	for i := 0; i < 4; i++ {
		ipaddress := fmt.Sprintf("10.0.0.%d", 6+i)
		secIPConfig := newSecondaryIPConfig(ipaddress, -1)
		ipID := uuid.New().String()
		ipIDs[i] = ipID
		secondaryIPConfigs[ipID] = secIPConfig
	}
	ncReq := generateNetworkContainerRequest(secondaryIPConfigs, "nodesubnet-nc1", "-1")

	// Simulate: 2 pods are currently running, and 1 stale pod (old InfraContainerID) has leaked IPs.
	// Pod1 (running) has IP 10.0.0.6
	// Pod2 (running) has IP 10.0.0.7
	// Pod1's OLD container (stale) had IP 10.0.0.8 — this should be released.
	// 10.0.0.9 is available
	activePodInfo := map[string]cns.PodInfo{
		"10.0.0.6": cns.NewPodInfo("newcontainer1", "newcont1-eth0", "pod1", "ns1"),
		"10.0.0.7": cns.NewPodInfo("container2", "cont2-eth0", "pod2", "ns1"),
	}

	returnCode := svc.ReconcileIPAMStateForNodeSubnet(
		[]*cns.CreateNetworkContainerRequest{ncReq}, activePodInfo,
	)
	require.Equal(t, types.Success, returnCode)

	// Verify: 2 IPs should be Assigned, rest should be Available
	assignedCount := 0
	availableCount := 0
	for _, ipConfig := range svc.PodIPConfigState { //nolint:gocritic // ignore copy
		switch ipConfig.GetState() {
		case types.Assigned:
			assignedCount++
		case types.Available:
			availableCount++
		}
	}
	assert.Equal(t, 2, assignedCount, "only the 2 active pods should have Assigned IPs")
	assert.Equal(t, 2, availableCount, "remaining 2 IPs should be Available")
}

// TestReconcileIPAMStateForNodeSubnetReleasesStaleFromPreviousContainer
// simulates the exact crash-loop scenario: a pod previously had an IP under an
// old InfraContainerID, and now has a new InfraContainerID. The reconciliation
// should assign the IP for the current container and release the stale one.
func TestReconcileIPAMStateForNodeSubnetReleasesStaleFromPreviousContainer(t *testing.T) {
	restartService()
	setEnv(t)
	setOrchestratorTypeInternal(cns.KubernetesCRD)

	// Create NC with 2 IPs
	secondaryIPConfigs := make(map[string]cns.SecondaryIPConfig)
	ipID1 := uuid.New().String()
	ipID2 := uuid.New().String()
	secondaryIPConfigs[ipID1] = newSecondaryIPConfig("10.0.0.6", -1)
	secondaryIPConfigs[ipID2] = newSecondaryIPConfig("10.0.0.7", -1)
	ncReq := generateNetworkContainerRequest(secondaryIPConfigs, "nodesubnet-nc2", "-1")

	// First reconcile: pod1 with OLD container gets 10.0.0.6
	oldPodInfo := map[string]cns.PodInfo{
		"10.0.0.6": cns.NewPodInfo("oldcontainer", "oldcont-eth0", "pod1", "ns1"),
	}

	returnCode := svc.ReconcileIPAMStateForNodeSubnet(
		[]*cns.CreateNetworkContainerRequest{ncReq}, oldPodInfo,
	)
	require.Equal(t, types.Success, returnCode)

	// Verify old container has 1 assigned IP
	assert.Len(t, svc.PodIPIDByPodInterfaceKey, 1)

	// Second reconcile: pod1 has restarted with a NEW container, and the IP is now 10.0.0.6 again.
	// The old container's key is no longer active.
	newPodInfo := map[string]cns.PodInfo{
		"10.0.0.6": cns.NewPodInfo("newcontainer", "newcont-eth0", "pod1", "ns1"),
	}

	returnCode = svc.ReconcileIPAMStateForNodeSubnet(
		[]*cns.CreateNetworkContainerRequest{ncReq}, newPodInfo,
	)
	require.Equal(t, types.Success, returnCode)

	// The old container key should be gone, only the new one should have an IP
	assert.Len(t, svc.PodIPIDByPodInterfaceKey, 1,
		"only the new container key should remain")

	_, hasNewKey := svc.PodIPIDByPodInterfaceKey[cns.NewPodInfo("newcontainer", "newcont-eth0", "pod1", "ns1").Key()]
	assert.True(t, hasNewKey, "new container key should be present")

	_, hasOldKey := svc.PodIPIDByPodInterfaceKey[cns.NewPodInfo("oldcontainer", "oldcont-eth0", "pod1", "ns1").Key()]
	assert.False(t, hasOldKey, "old container key should have been cleaned up")

	// Verify IP states: 1 Assigned, 1 Available
	assignedCount := 0
	availableCount := 0
	for _, ipConfig := range svc.PodIPConfigState { //nolint:gocritic // ignore copy
		switch ipConfig.GetState() {
		case types.Assigned:
			assignedCount++
		case types.Available:
			availableCount++
		}
	}
	assert.Equal(t, 1, assignedCount)
	assert.Equal(t, 1, availableCount)
}

// TestReleaseStaleAssignedIPsNoOp verifies releaseStaleAssignedIPs does nothing
// when all Assigned IPs belong to active pods.
func TestReleaseStaleAssignedIPsNoOp(t *testing.T) {
	restartService()
	setEnv(t)
	setOrchestratorTypeInternal(cns.KubernetesCRD)

	// Create NC with 2 IPs
	secondaryIPConfigs := make(map[string]cns.SecondaryIPConfig)
	ipID1 := uuid.New().String()
	ipID2 := uuid.New().String()
	secondaryIPConfigs[ipID1] = newSecondaryIPConfig("10.0.0.6", -1)
	secondaryIPConfigs[ipID2] = newSecondaryIPConfig("10.0.0.7", -1)
	ncReq := generateNetworkContainerRequest(secondaryIPConfigs, "nodesubnet-nc3", "-1")

	// Both pods are active
	podInfo := map[string]cns.PodInfo{
		"10.0.0.6": cns.NewPodInfo("container1", "cont1-eth0", "pod1", "ns1"),
		"10.0.0.7": cns.NewPodInfo("container2", "cont2-eth0", "pod2", "ns1"),
	}

	returnCode := svc.ReconcileIPAMStateForNodeSubnet(
		[]*cns.CreateNetworkContainerRequest{ncReq}, podInfo,
	)
	require.Equal(t, types.Success, returnCode)

	// Both IPs should be Assigned, neither should be released
	assert.Len(t, svc.PodIPIDByPodInterfaceKey, 2)
	for _, ipConfig := range svc.PodIPConfigState { //nolint:gocritic // ignore copy
		if ipConfig.PodInfo != nil {
			assert.Equal(t, types.Assigned, ipConfig.GetState())
		}
	}
}
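
The helper these tests exercise, releaseStaleAssignedIPs, is not itself part of this excerpt. What follows is a minimal sketch of what the reconcile-path cleanup would need to do to satisfy the assertions above; the receiver signature and the markIPAvailable helper are assumptions for illustration, not the PR's actual code.

// Sketch only: release every Assigned IP whose pod interface key is absent from
// the active-pod set handed to ReconcileIPAMStateForNodeSubnet.
func (service *HTTPRestService) releaseStaleAssignedIPs(podInfoByIP map[string]cns.PodInfo) {
	// Collect the interface keys of pods reported as active.
	activePodKeys := make(map[string]struct{}, len(podInfoByIP))
	for _, podInfo := range podInfoByIP {
		activePodKeys[podInfo.Key()] = struct{}{}
	}

	for ipID, ipConfig := range service.PodIPConfigState { //nolint:gocritic // ignore copy
		if ipConfig.GetState() != types.Assigned || ipConfig.PodInfo == nil {
			continue
		}
		if _, stillActive := activePodKeys[ipConfig.PodInfo.Key()]; stillActive {
			continue
		}
		// Stale assignment: drop the pod-to-IP binding and return the IP to the pool.
		delete(service.PodIPIDByPodInterfaceKey, ipConfig.PodInfo.Key())
		service.markIPAvailable(ipID) // hypothetical helper; the real state transition may differ
	}
}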

A second test file:
@@ -2485,3 +2485,135 @@ func TestStatelessCNIStateFile(t *testing.T) {
		})
	}
}

// TestIPAMReleaseStaleIPOnPodRestart verifies that when a pod restarts with a new
// InfraContainerID, the stale IP from the previous container is released and a new
// IP is successfully allocated. This tests the fix for the crash-loop IP exhaustion bug.
func TestIPAMReleaseStaleIPOnPodRestart(t *testing.T) {
	svc := getTestService(cns.KubernetesCRD)

	// Set up NC with 2 IPs
	ipconfigs := make(map[string]cns.IPConfigurationStatus)
	state1 := newPodState(testIP1, testIPID1, testNCID, types.Available, 0)
	ipconfigs[state1.ID] = state1
	state2 := newPodState(testIP2, testIPID2, testNCID, types.Available, 0)
	ipconfigs[state2.ID] = state2
	err := updatePodIPConfigState(t, svc, ipconfigs, testNCID)
	require.NoError(t, err)

	// Simulate first container of "mypod" getting IP1
	oldContainerID := "aaaa1111bbbb2222cccc3333dddd4444eeee5555ffff6666"
	oldInterfaceID := oldContainerID[:8] + "-eth0"
	oldPodInfo := cns.NewPodInfo(oldInterfaceID, oldContainerID, "mypod", "default")

	req1 := cns.IPConfigsRequest{
		PodInterfaceID:   oldPodInfo.InterfaceID(),
		InfraContainerID: oldPodInfo.InfraContainerID(),
	}
	b, _ := oldPodInfo.OrchestratorContext()
	req1.OrchestratorContext = b

	podIPInfo1, err := requestIPConfigsHelper(svc, req1)
	require.NoError(t, err)
	require.Len(t, podIPInfo1, 1)
	assignedIP := podIPInfo1[0].PodIPConfig.IPAddress

	// Verify the old container got an IP assigned
	assert.Contains(t, svc.PodIPIDByPodInterfaceKey, oldPodInfo.Key())

	// Simulate pod restart with a NEW InfraContainerID (no CNI DEL for old container)
	newContainerID := "1111aaaa2222bbbb3333cccc4444dddd5555eeee6666ffff"
	newInterfaceID := newContainerID[:8] + "-eth0"
	newPodInfo := cns.NewPodInfo(newInterfaceID, newContainerID, "mypod", "default")

	req2 := cns.IPConfigsRequest{
		PodInterfaceID:   newPodInfo.InterfaceID(),
		InfraContainerID: newPodInfo.InfraContainerID(),
	}
	b2, _ := newPodInfo.OrchestratorContext()
	req2.OrchestratorContext = b2

	// This should succeed: the stale IP from the old container should be released first
	podIPInfo2, err := requestIPConfigsHelper(svc, req2)
	require.NoError(t, err)
	require.Len(t, podIPInfo2, 1)

	// The released stale IP should be available and reassigned to the new container
	assert.Equal(t, assignedIP, podIPInfo2[0].PodIPConfig.IPAddress,
		"new container should get the same IP that was released from the old container")

	// Verify the old key's IP was released and the new key got an IP
	assert.NotContains(t, svc.PodIPIDByPodInterfaceKey, oldPodInfo.Key(),
		"old container key should be removed from PodIPIDByPodInterfaceKey")
	assert.Contains(t, svc.PodIPIDByPodInterfaceKey, newPodInfo.Key(),
		"new container key should exist in PodIPIDByPodInterfaceKey")
}

// TestIPAMReleaseStaleIPOnPodRestartPreventsExhaustion verifies that repeated
// pod restarts (crash-loop) do not exhaust the IP pool because each new allocation
// releases the stale IP from the previous container.
func TestIPAMReleaseStaleIPOnPodRestartPreventsExhaustion(t *testing.T) {
	svc := getTestService(cns.KubernetesCRD)

	// Set up NC with only 1 IP — any leak will cause exhaustion
	ipconfigs := make(map[string]cns.IPConfigurationStatus)
	state1 := newPodState(testIP1, testIPID1, testNCID, types.Available, 0)
	ipconfigs[state1.ID] = state1
	err := updatePodIPConfigState(t, svc, ipconfigs, testNCID)
	require.NoError(t, err)

	podName := "crashloop-pod"
	podNamespace := "default"

	// Simulate 5 consecutive container restarts (crash-loop), each with no CNI DEL
	for i := 0; i < 5; i++ {
		containerID := fmt.Sprintf("%08d0000000000000000000000000000000000000000", i)
		interfaceID := containerID[:8] + "-eth0"
		podInfo := cns.NewPodInfo(interfaceID, containerID, podName, podNamespace)

		req := cns.IPConfigsRequest{
			PodInterfaceID:   podInfo.InterfaceID(),
			InfraContainerID: podInfo.InfraContainerID(),
		}
		b, _ := podInfo.OrchestratorContext()
		req.OrchestratorContext = b

		podIPInfo, err := requestIPConfigsHelper(svc, req)
		require.NoError(t, err, "iteration %d should succeed", i)
		require.Len(t, podIPInfo, 1, "iteration %d should get 1 IP", i)
		assert.Equal(t, testIP1, podIPInfo[0].PodIPConfig.IPAddress,
			"iteration %d should get the same IP", i)
	}

	// Verify only 1 IP is assigned (no leaks)
	assignedCount := 0
	for _, ipConfig := range svc.PodIPConfigState { //nolint:gocritic // ignore copy
		if ipConfig.GetState() == types.Assigned {
			assignedCount++
		}
	}
	assert.Equal(t, 1, assignedCount, "only 1 IP should be assigned after crash-loop")
}

// TestIPAMReleaseStaleIPConfigsForPodNoStale verifies that releaseStaleIPConfigsForPod
// is a no-op when there are no stale IPs.
func TestIPAMReleaseStaleIPConfigsForPodNoStale(t *testing.T) {
	svc := getTestService(cns.KubernetesCRD)

	// Set up with a normally assigned IP
	ipconfigs := make(map[string]cns.IPConfigurationStatus)
	state1, _ := newPodStateWithOrchestratorContext(testIP1, testIPID1, testNCID, types.Assigned, ipPrefixBitsv4, 0, testPod1Info)
	ipconfigs[state1.ID] = state1
	state2 := newPodState(testIP2, testIPID2, testNCID, types.Available, 0)
	ipconfigs[state2.ID] = state2
	err := updatePodIPConfigState(t, svc, ipconfigs, testNCID)
	require.NoError(t, err)

	// Call with a different pod — should not affect testPod1's IP
	svc.releaseStaleIPConfigsForPod(testPod2Info)

	// testPod1's IP should still be assigned
	ip1State := svc.PodIPConfigState[testIPID1]
	assert.Equal(t, types.Assigned, ip1State.GetState())
	assert.Contains(t, svc.PodIPIDByPodInterfaceKey, testPod1Info.Key())
}
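
Similarly, releaseStaleIPConfigsForPod is only called by these tests, not defined in this excerpt. A sketch of the allocation-path cleanup consistent with the assertions above follows; the Name/Namespace accessors on cns.PodInfo and the markIPAvailable helper are assumptions, and the PR's actual implementation may differ.

// Sketch only: before assigning IPs for a new request, release any IP still
// Assigned to the same pod (name + namespace) under a different infra container,
// i.e. a previous container that crashed without a CNI DEL.
func (service *HTTPRestService) releaseStaleIPConfigsForPod(podInfo cns.PodInfo) {
	for ipID, ipConfig := range service.PodIPConfigState { //nolint:gocritic // ignore copy
		if ipConfig.GetState() != types.Assigned || ipConfig.PodInfo == nil {
			continue
		}
		owner := ipConfig.PodInfo
		if owner.Name() == podInfo.Name() &&
			owner.Namespace() == podInfo.Namespace() &&
			owner.Key() != podInfo.Key() {
			// Stale assignment from the pod's previous container: release it so the
			// incoming request can be served even from a fully assigned pool.
			delete(service.PodIPIDByPodInterfaceKey, owner.Key())
			service.markIPAvailable(ipID) // hypothetical helper, as above
		}
	}
}

Keying the stale check on pod name and namespace rather than on InfraContainerID is what lets the crash-loop test reuse the single available IP across restarts, while the no-op test pins down that unrelated pods' assignments are left untouched.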
A review comment on the new cleanup raises a caveat:

The new reconciliation cleanup builds activePodKeys from the podInfoByIP argument and then releases any Assigned IPs whose pod key isn't in that set. In the NodeSubnet initialization path, podInfoByIP is sourced from the CNS endpoint state store (previously saved endpoints), which may include stale entries (so leaks won't be released) or may be incomplete (risking release of IPs for still-running pods). The comment here also says "currently running pods", which isn't necessarily true for that provider. Please clarify the contract for podInfoByIP (and/or wire this to a provider that enumerates actual active pods), and consider guarding the release to avoid releasing IPs when the active set is known to be unreliable.
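
One way to picture the guard the reviewer asks for, as a sketch only: the activeSetReliable flag below does not exist in the PR and stands in for whatever signal the pod-info provider could give about the freshness of podInfoByIP.

// Sketch only: skip the stale-IP release entirely when the active-pod set may be
// stale or incomplete (e.g. when it was rebuilt from the endpoint state store),
// so that IPs of still-running pods are never reclaimed by mistake.
func (service *HTTPRestService) releaseStaleAssignedIPsGuarded(podInfoByIP map[string]cns.PodInfo, activeSetReliable bool) {
	if !activeSetReliable {
		return
	}
	service.releaseStaleAssignedIPs(podInfoByIP) // the sketch shown earlier in this section
}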
activePodKeysfrom thepodInfoByIPargument and then releases any Assigned IPs whose pod key isn’t in that set. In the NodeSubnet initialization path,podInfoByIPis sourced from the CNS endpoint state store (previous saved endpoints), which may include stale entries (so leaks won’t be released) or may be incomplete (risking release of IPs for still-running pods). The comment here also says “currently running pods”, which isn’t necessarily true for that provider. Please clarify the contract forpodInfoByIP(and/or wire this to a provider that enumerates actual active pods) and consider guarding the release to avoid releasing IPs when the active set is known to be unreliable.