[EBPF] gpu: handle runtime changes of CUDA_VISIBLE_DEVICES #38312

Merged: 13 commits, Jul 10, 2025

1 change: 1 addition & 0 deletions pkg/ebpf/cgo/genpost.go
@@ -56,6 +56,7 @@ func processFile(rdr io.Reader, out io.Writer) error {
"Topic_name",
"Trigger_comm",
"Victim_comm",
"Devices",
}

// Convert []int8 to []byte in multiple generated fields from the kernel, to simplify
8 changes: 7 additions & 1 deletion pkg/ebpf/uprobes/attacher_test.go
@@ -1052,6 +1052,7 @@ func (s *SharedLibrarySuite) TestMultipleLibsets() {
// Create test files for different libsets
cryptoLibPath, _ := createTempTestFile(t, "foo-libssl.so")
gpuLibPath, _ := createTempTestFile(t, "foo-libcudart.so")
libcLibPath, _ := createTempTestFile(t, "foo-libc.so")

attachCfg := AttacherConfig{
Rules: []*AttachRule{
@@ -1063,9 +1064,13 @@
LibraryNameRegex: regexp.MustCompile(`foo-libcudart\.so`),
Targets: AttachToSharedLibraries,
},
{
LibraryNameRegex: regexp.MustCompile(`foo-libc\.so`),
Targets: AttachToSharedLibraries,
},
},
EbpfConfig: ebpfCfg,
SharedLibsLibsets: []sharedlibraries.Libset{sharedlibraries.LibsetCrypto, sharedlibraries.LibsetGPU},
SharedLibsLibsets: []sharedlibraries.Libset{sharedlibraries.LibsetCrypto, sharedlibraries.LibsetGPU, sharedlibraries.LibsetLibc},
EnablePeriodicScanNewProcesses: false,
}

@@ -1094,6 +1099,7 @@ func (s *SharedLibrarySuite) TestMultipleLibsets() {
testCases := []testCase{
{cryptoLibPath, "foo-libssl.so", "crypto library"},
{gpuLibPath, "foo-libcudart.so", "GPU library"},
{libcLibPath, "foo-libc.so", "libc library"},
}

var commands []*exec.Cmd
7 changes: 6 additions & 1 deletion pkg/ebpf/uprobes/testutil.go
@@ -250,7 +250,12 @@ func waitAndRetryIfFail(t *testing.T, setupFunc func(), testFunc func() bool, re
}
}

require.Fail(t, "condition not met after %d retries", maxRetries, msgAndArgs)
extraFmt := ""
if len(msgAndArgs) > 0 {
extraFmt = fmt.Sprintf(msgAndArgs[0].(string), msgAndArgs[1:]...) + ": "
}

require.Fail(t, "condition not met", "%scondition not met after %d retries", extraFmt, maxRetries)
}

// processMonitorProxy is a wrapper around a ProcessMonitor that stores the
12 changes: 11 additions & 1 deletion pkg/gpu/consumer.go
@@ -14,6 +14,8 @@ import (
"time"
"unsafe"

"golang.org/x/sys/unix"

"github.com/DataDog/datadog-agent/comp/core/telemetry"
ddebpf "github.com/DataDog/datadog-agent/pkg/ebpf"
"github.com/DataDog/datadog-agent/pkg/gpu/config"
@@ -163,7 +165,7 @@ func (c *cudaEventConsumer) Start() {
}

func isStreamSpecificEvent(eventType gpuebpf.CudaEventType) bool {
return eventType != gpuebpf.CudaEventTypeSetDevice
return eventType != gpuebpf.CudaEventTypeSetDevice && eventType != gpuebpf.CudaEventTypeVisibleDevicesSet
}

func (c *cudaEventConsumer) handleEvent(header *gpuebpf.CudaEventHeader, dataPtr unsafe.Pointer, dataLen int) error {
@@ -222,11 +224,19 @@ func (c *cudaEventConsumer) handleSetDevice(csde *gpuebpf.CudaSetDeviceEvent) {
c.sysCtx.setDeviceSelection(int(pid), int(tid), csde.Device)
}

func (c *cudaEventConsumer) handleVisibleDevicesSet(vds *gpuebpf.CudaVisibleDevicesSetEvent) {
pid, _ := getPidTidFromHeader(&vds.Header)

c.sysCtx.setUpdatedVisibleDevicesEnvVar(int(pid), unix.ByteSliceToString(vds.Devices[:]))
}

func (c *cudaEventConsumer) handleGlobalEvent(header *gpuebpf.CudaEventHeader, data unsafe.Pointer, dataLen int) error {
eventType := gpuebpf.CudaEventType(header.Type)
switch eventType {
case gpuebpf.CudaEventTypeSetDevice:
return handleTypedEvent(c, c.handleSetDevice, eventType, data, dataLen, gpuebpf.SizeofCudaSetDeviceEvent)
case gpuebpf.CudaEventTypeVisibleDevicesSet:
return handleTypedEvent(c, c.handleVisibleDevicesSet, eventType, data, dataLen, gpuebpf.SizeofCudaVisibleDevicesSetEvent)
default:
c.telemetry.eventErrors.Inc(telemetryEventTypeUnknown, telemetryEventErrorUnknownType)
return fmt.Errorf("unknown event type: %d", header.Type)
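A note on the new `handleVisibleDevicesSet` handler above: the eBPF program ships the env var value as a fixed-size, NUL-padded byte array (`Devices`), and `unix.ByteSliceToString` truncates it at the first NUL byte. A minimal sketch of that conversion (the 256-byte size mirrors `MAX_ENV_VAR_LEN` from the eBPF types; the sketch is illustrative, not agent code):

```go
package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// The kernel side copies the env var value into a fixed 256-byte
	// array; bytes after the terminating NUL stay zeroed.
	var devices [256]byte
	copy(devices[:], "0,1,3")

	// ByteSliceToString stops at the first NUL byte, so the zero
	// padding is dropped and only "0,1,3" remains.
	fmt.Println(unix.ByteSliceToString(devices[:])) // prints: 0,1,3
}
```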
34 changes: 29 additions & 5 deletions pkg/gpu/context.go
@@ -17,6 +17,7 @@ import (
"github.com/DataDog/datadog-agent/pkg/gpu/cuda"
ddnvml "github.com/DataDog/datadog-agent/pkg/gpu/safenvml"
gpuutil "github.com/DataDog/datadog-agent/pkg/util/gpu"
"github.com/DataDog/datadog-agent/pkg/util/kernel"
"github.com/DataDog/datadog-agent/pkg/util/ktime"
)

@@ -35,6 +36,12 @@ type systemContext struct {
// be modified by the CUDA_VISIBLE_DEVICES environment variable later
selectedDeviceByPIDAndTID map[int]map[int]int32

// cudaVisibleDevicesPerProcess maps each process ID to the latest visible
// devices environment variable that was set by the process. This is used to
// keep track of updates during process runtime, which aren't visible in
// /proc/pid/environ.
cudaVisibleDevicesPerProcess map[int]string

// deviceCache is a cache of GPU devices on the system
deviceCache ddnvml.DeviceCache

@@ -104,10 +111,11 @@ func getSystemContext(optList ...systemContextOption) (*systemContext, error) {
opts := newSystemContextOptions(optList...)

ctx := &systemContext{
procRoot: opts.procRoot,
selectedDeviceByPIDAndTID: make(map[int]map[int]int32),
visibleDevicesCache: make(map[int][]ddnvml.Device),
workloadmeta: opts.wmeta,
procRoot: opts.procRoot,
selectedDeviceByPIDAndTID: make(map[int]map[int]int32),
visibleDevicesCache: make(map[int][]ddnvml.Device),
cudaVisibleDevicesPerProcess: make(map[int]string),
workloadmeta: opts.wmeta,
}

var err error
@@ -135,6 +143,7 @@ func getSystemContext(optList ...systemContextOption) (*systemContext, error) {
func (ctx *systemContext) removeProcess(pid int) {
delete(ctx.selectedDeviceByPIDAndTID, pid)
delete(ctx.visibleDevicesCache, pid)
delete(ctx.cudaVisibleDevicesPerProcess, pid)

if ctx.cudaKernelCache != nil {
ctx.cudaKernelCache.CleanProcessData(pid)
@@ -251,7 +260,15 @@ func (ctx *systemContext) getCurrentActiveGpuDevice(pid int, tid int, containerI
return nil, fmt.Errorf("error filtering devices for container %s: %w", containerID, err)
}

visibleDevices, err = cuda.GetVisibleDevicesForProcess(visibleDevices, pid, ctx.procRoot)
envVar, ok := ctx.cudaVisibleDevicesPerProcess[pid]
if !ok {
envVar, err = kernel.GetProcessEnvVariable(pid, ctx.procRoot, cuda.CudaVisibleDevicesEnvVar)
if err != nil {
return nil, fmt.Errorf("error getting env var %s for process %d: %w", cuda.CudaVisibleDevicesEnvVar, pid, err)
}
}

visibleDevices, err = cuda.ParseVisibleDevices(visibleDevices, envVar)
if err != nil {
return nil, fmt.Errorf("error getting visible devices for process %d: %w", pid, err)
}
@@ -284,3 +301,10 @@ func (ctx *systemContext) setDeviceSelection(pid int, tid int, deviceIndex int32

ctx.selectedDeviceByPIDAndTID[pid][tid] = deviceIndex
}

func (ctx *systemContext) setUpdatedVisibleDevicesEnvVar(pid int, envVar string) {
ctx.cudaVisibleDevicesPerProcess[pid] = envVar

// Invalidate the visible devices cache to force a re-scan of the devices
delete(ctx.visibleDevicesCache, pid)
}
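The lookup order in `getCurrentActiveGpuDevice` above now prefers the value recorded from the `setenv` event and only falls back to reading the process environment via `kernel.GetProcessEnvVariable`. That fallback is inherently a snapshot: `/proc/<pid>/environ` holds the NUL-separated environment the process was exec'd with, so runtime `setenv()` calls never show up there, which is exactly the gap this PR closes. A rough sketch of such a procfs read, assuming a standard procfs layout (the helper `readEnvVarFromProc` is hypothetical, not the agent's API):

```go
package main

import (
	"bytes"
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"
)

// readEnvVarFromProc returns the value of name from /proc/<pid>/environ,
// or "" if the variable was not set in the exec-time environment.
func readEnvVarFromProc(procRoot string, pid int, name string) (string, error) {
	data, err := os.ReadFile(filepath.Join(procRoot, strconv.Itoa(pid), "environ"))
	if err != nil {
		return "", err
	}
	// The file is a sequence of NUL-separated KEY=VALUE entries.
	for _, kv := range bytes.Split(data, []byte{0}) {
		if v, ok := strings.CutPrefix(string(kv), name+"="); ok {
			return v, nil
		}
	}
	return "", nil
}

func main() {
	v, err := readEnvVarFromProc("/proc", os.Getpid(), "CUDA_VISIBLE_DEVICES")
	fmt.Println(v, err)
}
```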
24 changes: 24 additions & 0 deletions pkg/gpu/context_test.go
@@ -176,6 +176,7 @@ func TestGetCurrentActiveGpuDevice(t *testing.T) {
containerID string
configuredDeviceIdx []int32
expectedDeviceIdx []int32
updatedEnvVar string
}{
{
name: "NoContainer",
@@ -205,10 +206,29 @@
configuredDeviceIdx: []int32{1, 2},
expectedDeviceIdx: []int32{containerDeviceIndexes[envVisibleDevices[1]], containerDeviceIndexes[envVisibleDevices[2]]},
},
{
name: "NoContainerAndRuntimeEnvVar",
pid: pidNoContainer,
configuredDeviceIdx: []int32{0},
expectedDeviceIdx: []int32{1},
updatedEnvVar: "1",
},
{
name: "NoContainerAndRuntimeUpdatedEnvVar",
pid: pidNoContainerButEnv,
configuredDeviceIdx: []int32{0},
expectedDeviceIdx: []int32{1},
updatedEnvVar: "1",
},
}

for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
if c.updatedEnvVar != "" {
sysCtx.setUpdatedVisibleDevicesEnvVar(c.pid, c.updatedEnvVar)
require.NotContains(t, sysCtx.visibleDevicesCache, c.pid, "cache not invalidated for process %d", c.pid)
}

for i, idx := range c.configuredDeviceIdx {
sysCtx.setDeviceSelection(c.pid, c.pid+i, idx)
}
@@ -218,6 +238,10 @@
require.NoError(t, err)
nvmltestutil.RequireDevicesEqual(t, sysCtx.deviceCache.All()[idx], activeDevice, "invalid device at index %d (real index is %d, selected index is %d)", i, idx, c.configuredDeviceIdx[i])
}

// Note: we're explicitly not resetting the caches, as we want to test
// whether the functions correctly invalidate the caches when the
// environment variable is updated.
})
}
}
23 changes: 5 additions & 18 deletions pkg/gpu/cuda/env.go
@@ -12,16 +12,14 @@ import (
"strconv"
"strings"

"github.com/DataDog/datadog-agent/pkg/util/kernel"

ddnvml "github.com/DataDog/datadog-agent/pkg/gpu/safenvml"
)

const cudaVisibleDevicesEnvVar = "CUDA_VISIBLE_DEVICES"
// CudaVisibleDevicesEnvVar is the name of the environment variable that controls the visible GPUs for CUDA applications
const CudaVisibleDevicesEnvVar = "CUDA_VISIBLE_DEVICES"

// GetVisibleDevicesForProcess modifies the list of GPU devices according to the
// value of the CUDA_VISIBLE_DEVICES environment variable for the specified
// process. Reference:
// ParseVisibleDevices modifies the list of GPU devices according to the
// value of the CUDA_VISIBLE_DEVICES environment variable. Reference:
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars.
//
// As a summary, the CUDA_VISIBLE_DEVICES environment variable should be a comma
@@ -36,18 +34,7 @@ const cudaVisibleDevicesEnvVar = "CUDA_VISIBLE_DEVICES"
// devices whose index precedes the invalid index are visible to CUDA
// applications." If an invalid index is found, an error is returned together
// with the list of valid devices found up until that point.
func GetVisibleDevicesForProcess(devices []ddnvml.Device, pid int, procfs string) ([]ddnvml.Device, error) {
cudaVisibleDevicesForProcess, err := kernel.GetProcessEnvVariable(pid, procfs, cudaVisibleDevicesEnvVar)
if err != nil {
return nil, fmt.Errorf("cannot get env var %s for process %d: %w", cudaVisibleDevicesEnvVar, pid, err)
}

return getVisibleDevices(devices, cudaVisibleDevicesForProcess)
}

// getVisibleDevices processes the list of GPU devices according to the value of
// the CUDA_VISIBLE_DEVICES environment variable
func getVisibleDevices(devices []ddnvml.Device, cudaVisibleDevicesForProcess string) ([]ddnvml.Device, error) {
func ParseVisibleDevices(devices []ddnvml.Device, cudaVisibleDevicesForProcess string) ([]ddnvml.Device, error) {
// First, we adjust the list of devices to take into account how CUDA presents MIG devices in order. This
// list will not be used when searching by prefix because prefix matching is done against *all* devices,
// but index filtering is done against the adjusted list where devices with MIG children are replaced by
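The renamed `ParseVisibleDevices` keeps the semantics described in the doc comment above: a comma-separated list of indices (or UUID/MIG prefixes) selects devices in that order, and parsing stops at the first invalid entry, returning the devices collected so far together with an error. A simplified sketch of just the index-filtering rule, using plain strings instead of `ddnvml.Device` and omitting UUID/MIG handling (the `filterByIndex` helper is illustrative only):

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// filterByIndex keeps the devices whose indexes are listed in envVar, in
// that order. On the first invalid entry it stops and returns the devices
// collected so far plus an error, mirroring the documented CUDA behaviour
// ("devices whose index precedes the invalid index are visible").
func filterByIndex(devices []string, envVar string) ([]string, error) {
	if envVar == "" {
		return devices, nil
	}
	var visible []string
	for _, tok := range strings.Split(envVar, ",") {
		idx, err := strconv.Atoi(strings.TrimSpace(tok))
		if err != nil || idx < 0 || idx >= len(devices) {
			return visible, fmt.Errorf("invalid device index %q", tok)
		}
		visible = append(visible, devices[idx])
	}
	return visible, nil
}

func main() {
	devs := []string{"gpu-0", "gpu-1", "gpu-2", "gpu-3"}
	fmt.Println(filterByIndex(devs, "1,3")) // [gpu-1 gpu-3] <nil>
	fmt.Println(filterByIndex(devs, "0,7")) // [gpu-0] invalid device index "7"
}
```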
4 changes: 2 additions & 2 deletions pkg/gpu/cuda/env_test.go
@@ -97,7 +97,7 @@ func TestGetVisibleDevices(t *testing.T) {

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
devices, err := getVisibleDevices(devList, tc.visibleDevices)
devices, err := ParseVisibleDevices(devList, tc.visibleDevices)
if tc.expectsError {
require.Error(t, err)
} else {
@@ -319,7 +319,7 @@ func TestGetVisibleDevicesWithMIG(t *testing.T) {

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
devices, err := getVisibleDevices(tc.systemDevices, tc.visibleDevices)
devices, err := ParseVisibleDevices(tc.systemDevices, tc.visibleDevices)
if tc.expectsError {
require.Error(t, err)
} else {
3 changes: 2 additions & 1 deletion pkg/gpu/e2e_events_test.go
@@ -17,6 +17,7 @@ import (
"github.com/stretchr/testify/require"

"github.com/DataDog/datadog-agent/pkg/gpu/config"
"github.com/DataDog/datadog-agent/pkg/gpu/ebpf"
ddnvml "github.com/DataDog/datadog-agent/pkg/gpu/safenvml"
nvmltestutil "github.com/DataDog/datadog-agent/pkg/gpu/safenvml/testutil"
"github.com/DataDog/datadog-agent/pkg/gpu/testutil"
@@ -71,7 +72,7 @@ func TestPytorchBatchedKernels(t *testing.T) {

telemetryMetrics, err := telemetryMock.GetCountMetric("gpu__consumer", "events")
require.NoError(t, err)
require.Equal(t, 4, len(telemetryMetrics)) // one for each event type
require.Equal(t, int(ebpf.CudaEventTypeCount), len(telemetryMetrics)) // one for each event type
expectedEventsByType := testutil.DataSampleInfos[testutil.DataSamplePytorchBatchedKernels].EventByType
for _, metric := range telemetryMetrics {
eventTypeTag := metric.Tags()["event_type"]
39 changes: 39 additions & 0 deletions pkg/gpu/ebpf/c/runtime/gpu.c
@@ -366,4 +366,43 @@ int BPF_URETPROBE(uretprobe__cudaMemcpy) {
return 0;
}

SEC("uprobe/setenv")
int BPF_UPROBE(uprobe__setenv, const char *name, const char *value, int overwrite) {
// Check if the env var is CUDA_VISIBLE_DEVICES. This is BPF_UPROBE, so we can't use a string
// comparison.
const char cuda_visible_devices[] = "CUDA_VISIBLE_DEVICES";
char name_buf[sizeof(cuda_visible_devices)];

// bpf_probe_read_user_str is available from kernel 5.5, our minimum kernel version is 5.8.0
int res = bpf_probe_read_user_str_with_telemetry(name_buf, sizeof(name_buf), name);
if (res < 0) {
return 0;
}

// return value of bpf_probe_read_user_str_with_telemetry is the length of the string read,
// including the NULL byte. If the string is not the same length, it's not CUDA_VISIBLE_DEVICES.
if (res != sizeof(cuda_visible_devices)) {
return 0;
}

// bpf_strncmp is available in kernel 5.17, our minimum kernel version is 5.8.0
// so we need to do a manual comparison
for (int i = 0; i < sizeof(cuda_visible_devices); i++) {
if (name_buf[i] != cuda_visible_devices[i]) {
return 0;
}
}

cuda_visible_devices_set_t event = { 0 };

if (bpf_probe_read_user_str_with_telemetry(event.visible_devices, sizeof(event.visible_devices), value) < 0) {
return 0;
}

fill_header(&event.header, 0, cuda_visible_devices_set);

bpf_ringbuf_output_with_telemetry(&cuda_events, &event, sizeof(event), 0);
return 0;
}

char __license[] SEC("license") = "GPL";
7 changes: 7 additions & 0 deletions pkg/gpu/ebpf/c/types.h
@@ -12,10 +12,12 @@ typedef enum {
cuda_memory_event,
cuda_sync,
cuda_set_device,
cuda_visible_devices_set,
cuda_event_type_count,
} cuda_event_type_t;

#define MAX_CONTAINER_ID_LEN 129
#define MAX_ENV_VAR_LEN 256 // Not the actual max (which seems to be 32KB) but enough for the CUDA_VISIBLE_DEVICES env var use case

typedef struct {
__u64 pid_tgid;
@@ -70,4 +72,9 @@ typedef struct {
__u64 last_access_ktime_ns;
} cuda_event_value_t;

typedef struct {
cuda_event_header_t header;
char visible_devices[MAX_ENV_VAR_LEN];
} cuda_visible_devices_set_t;

#endif
4 changes: 4 additions & 0 deletions pkg/gpu/ebpf/kprobe_types.go
@@ -27,13 +27,16 @@ type CudaMemEventType C.cuda_memory_event_type_t

type CudaSetDeviceEvent C.cuda_set_device_event_t

type CudaVisibleDevicesSetEvent C.cuda_visible_devices_set_t

type CudaEventKey C.cuda_event_key_t
type CudaEventValue C.cuda_event_value_t

const CudaEventTypeKernelLaunch CudaEventType = C.cuda_kernel_launch
const CudaEventTypeMemory CudaEventType = C.cuda_memory_event
const CudaEventTypeSync CudaEventType = C.cuda_sync
const CudaEventTypeSetDevice CudaEventType = C.cuda_set_device
const CudaEventTypeVisibleDevicesSet CudaEventType = C.cuda_visible_devices_set
const CudaEventTypeCount CudaEventType = C.cuda_event_type_count

const CudaMemAlloc = C.cudaMalloc
@@ -44,3 +47,4 @@ const SizeofCudaMemEvent = C.sizeof_cuda_memory_event_t
const SizeofCudaEventHeader = C.sizeof_cuda_event_header_t
const SizeofCudaSync = C.sizeof_cuda_sync_t
const SizeofCudaSetDeviceEvent = C.sizeof_cuda_set_device_event_t
const SizeofCudaVisibleDevicesSetEvent = C.sizeof_cuda_visible_devices_set_t