Skip to content

Commit 862100f

Browse files
authored
Improve SR-IOV device assignment to ensure it's idempotent (#7322) (#7327)
If the SR-IOV device configuration fails after the device has been assigned to the Pod, subsequent retries will also fail due to the device being marked as unavailable. Improve the device assignment logic to ensure the Pod can consistently get the correct device. Signed-off-by: Lan Luo <[email protected]>
1 parent 75214bd commit 862100f

File tree

2 files changed

+37
-15
lines changed

2 files changed

+37
-15
lines changed

pkg/agent/secondarynetwork/podwatch/controller_test.go

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -335,8 +335,7 @@ func TestPodControllerRun(t *testing.T) {
335335
containerNetNS(containerID),
336336
interfaceName,
337337
defaultMTU,
338-
// We haven't updated the vfDeviceIDUsageMap, so a different device will be allocated.
339-
sriovDeviceID12,
338+
sriovDeviceID11,
340339
&ipamResult.Result,
341340
).Do(func(string, string, string, string, string, int, string, *current.Result) {
342341
atomic.AddInt32(&interfaceConfigured, 1)
@@ -1144,15 +1143,26 @@ func TestPodControllerAddPod(t *testing.T) {
11441143
t.Run("updating deviceID cache per Pod", func(t *testing.T) {
11451144
ctrl := gomock.NewController(t)
11461145
podController, _, _, _ := testPodController(ctrl)
1147-
_, err := podController.assignUnusedSriovVFDeviceID(podName, testNamespace, sriovResourceName1, interfaceName)
1148-
_, exists := podController.vfDeviceIDUsageMap.Load(podKey)
1146+
tmpPodName := "poda"
1147+
tmpPodNamespace := "testns"
1148+
tmpPodKey := podKeyGet(tmpPodName, tmpPodNamespace)
1149+
_, err := podController.assignSriovVFDeviceID(tmpPodName, tmpPodNamespace, sriovResourceName1, interfaceName)
1150+
vfDeviceIDInfoCache, exists := podController.vfDeviceIDUsageMap.Load(tmpPodKey)
11491151
assert.True(t, exists)
1152+
vfDeviceIDsInfo := vfDeviceIDInfoCache.([]podSriovVFDeviceIDInfo)
1153+
assert.Equal(t, interfaceName, vfDeviceIDsInfo[0].ifName, "incorrect interface name")
11501154
require.NoError(t, err, "error while assigning unused VfDevice ID")
1151-
podController.releaseSriovVFDeviceID(podName, testNamespace, interfaceName)
1152-
_, exists = podController.vfDeviceIDUsageMap.Load(podKey)
1155+
// The second call would get the assigned SR-IOV device ID directly.
1156+
deviceID, err := podController.assignSriovVFDeviceID(tmpPodName, tmpPodNamespace, sriovResourceName1, interfaceName)
1157+
_, exists = podController.vfDeviceIDUsageMap.Load(tmpPodKey)
1158+
assert.True(t, exists)
1159+
assert.Equal(t, sriovDeviceID11, deviceID, "incorrect device ID")
1160+
require.NoError(t, err, "error while getting assigned VfDevice ID")
1161+
podController.releaseSriovVFDeviceID(tmpPodName, tmpPodNamespace, interfaceName)
1162+
_, exists = podController.vfDeviceIDUsageMap.Load(tmpPodKey)
11531163
assert.True(t, exists)
1154-
podController.deleteVFDeviceIDListPerPod(podName, testNamespace)
1155-
_, exists = podController.vfDeviceIDUsageMap.Load(podKey)
1164+
podController.deleteVFDeviceIDListPerPod(tmpPodName, tmpPodNamespace)
1165+
_, exists = podController.vfDeviceIDUsageMap.Load(tmpPodKey)
11561166
assert.False(t, exists)
11571167
})
11581168
}

pkg/agent/secondarynetwork/podwatch/sriov.go

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -147,19 +147,31 @@ func (pc *PodController) releaseSriovVFDeviceID(podName, podNamespace, interface
147147
}
148148
}
149149

150-
func (pc *PodController) assignUnusedSriovVFDeviceID(podName, podNamespace, resourceName, interfaceName string) (string, error) {
150+
func (pc *PodController) assignSriovVFDeviceID(podName, podNamespace, resourceName, interfaceName string) (string, error) {
151151
var cache []podSriovVFDeviceIDInfo
152152
cache, err := pc.buildVFDeviceIDListPerPod(podName, podNamespace)
153153
if err != nil {
154154
return "", err
155155
}
156-
for idx := range cache {
157-
if cache[idx].resourceName == resourceName && cache[idx].ifName == "" {
158-
// Unused PCI address found. Associate PCI address to the interface.
159-
cache[idx].ifName = interfaceName
160-
return cache[idx].vfDeviceID, nil
156+
157+
var unusedCacheEntry *podSriovVFDeviceIDInfo
158+
for i := range cache {
159+
entry := &cache[i]
160+
if entry.resourceName == resourceName {
161+
if entry.ifName == interfaceName {
162+
return entry.vfDeviceID, nil
163+
}
164+
if entry.ifName == "" && unusedCacheEntry == nil {
165+
unusedCacheEntry = entry // remember the first match of unused PCI address
166+
}
161167
}
162168
}
169+
170+
if unusedCacheEntry != nil {
171+
// Update the cache entry
172+
unusedCacheEntry.ifName = interfaceName
173+
return unusedCacheEntry.vfDeviceID, nil
174+
}
163175
return "", fmt.Errorf("no available device")
164176
}
165177

@@ -172,7 +184,7 @@ func (pc *PodController) configureSriovAsSecondaryInterface(
172184
mtu int,
173185
result *current.Result,
174186
) error {
175-
podSriovVFDeviceID, err := pc.assignUnusedSriovVFDeviceID(pod.Name, pod.Namespace, resourceName, network.InterfaceRequest)
187+
podSriovVFDeviceID, err := pc.assignSriovVFDeviceID(pod.Name, pod.Namespace, resourceName, network.InterfaceRequest)
176188
if err != nil {
177189
return err
178190
}

0 commit comments

Comments
 (0)