@@ -12,6 +12,7 @@ import (
12
12
"io"
13
13
"net/http"
14
14
"os"
15
+ "os/exec"
15
16
"regexp"
16
17
"strings"
17
18
"time"
@@ -223,21 +224,67 @@ func setClearHostTag(s *common.Setup, tagKey, value string) {
223
224
}
224
225
225
226
func setupGPUIntegration (s * common.Setup ) {
226
- if os .Getenv ("GPU_MONITORING_ENABLED" ) != "" {
227
- s .Out .WriteString ("GPU monitoring enabled via GPU_MONITORING_ENABLED environment variable\n " )
227
+ gpuEnabled := os .Getenv ("GPU_MONITORING_ENABLED" )
228
228
229
- s .Config .DatadogYAML .CollectGPUTags = true
230
- s .Config .DatadogYAML .EnableNVMLDetection = true
229
+ // Check if GPU monitoring is explicitly enabled with "1" or "true"
230
+ if gpuEnabled != "1" && gpuEnabled != "true" {
231
+ return
232
+ }
231
233
232
- if s .Config .SystemProbeYAML == nil {
233
- s .Config .SystemProbeYAML = & config.SystemProbeConfig {}
234
- }
235
- s .Config .SystemProbeYAML .GPUMonitoringConfig = config.GPUMonitoringConfig {
236
- Enabled : true ,
237
- }
234
+ s .Out .WriteString ("GPU monitoring enabled via GPU_MONITORING_ENABLED environment variable\n " )
235
+
236
+ s .Config .DatadogYAML .CollectGPUTags = true
237
+ s .Config .DatadogYAML .EnableNVMLDetection = true
238
+
239
+ if s .Config .SystemProbeYAML == nil {
240
+ s .Config .SystemProbeYAML = & config.SystemProbeConfig {}
241
+ }
242
+ s .Config .SystemProbeYAML .GPUMonitoringConfig = config.GPUMonitoringConfig {
243
+ Enabled : true ,
244
+ }
245
+
246
+ s .Span .SetTag ("gpu_monitoring_enabled" , "true" )
238
247
239
- s .Span .SetTag ("gpu_monitoring_enabled" , "true" )
248
+ // NVML is not initialized when the databricks init script executes.
249
+ // This causes the GPU integration to fail initialisation, we must retry after
250
+ // a timeout so the NVML checks are passing and the GPU integration starts polling metrics.
251
+ scheduleDelayedAgentRestart (s , 30 * time .Second )
252
+ }
253
+
254
+ // scheduleDelayedAgentRestart schedules an agent restart after the specified delay
255
+ func scheduleDelayedAgentRestart (s * common.Setup , delay time.Duration ) {
256
+ s .Out .WriteString (fmt .Sprintf ("Scheduling agent restart in %v for GPU monitoring\n " , delay ))
257
+
258
+ // Create a shell script that will run in the background
259
+ script := fmt .Sprintf (`#!/bin/bash
260
+ echo "[$(date -u +%%Y-%%m-%%dT%%H:%%M:%%SZ)] GPU restart script started, waiting %v..." >> /tmp/datadog-gpu-restart.log
261
+ sleep %d
262
+ echo "[$(date -u +%%Y-%%m-%%dT%%H:%%M:%%SZ)] Restarting Datadog agent for GPU monitoring..." >> /tmp/datadog-gpu-restart.log
263
+ service datadog-agent restart >> /tmp/datadog-gpu-restart.log 2>&1
264
+ if [ $? -eq 0 ]; then
265
+ echo "[$(date -u +%%Y-%%m-%%dT%%H:%%M:%%SZ)] Successfully restarted Datadog agent for GPU monitoring" >> /tmp/datadog-gpu-restart.log
266
+ else
267
+ echo "[$(date -u +%%Y-%%m-%%dT%%H:%%M:%%SZ)] Failed to restart agent for GPU monitoring" >> /tmp/datadog-gpu-restart.log
268
+ fi
269
+ ` , delay , int (delay .Seconds ()))
270
+
271
+ // Write the script to a temporary file
272
+ scriptFile := "/tmp/datadog-gpu-restart.sh"
273
+ err := os .WriteFile (scriptFile , []byte (script ), 0755 )
274
+ if err != nil {
275
+ s .Out .WriteString (fmt .Sprintf ("Failed to write restart script: %v\n " , err ))
276
+ return
240
277
}
278
+
279
+ // Execute the script in the background using nohup and bash -c to properly detach it
280
+ cmd := exec .Command ("bash" , "-c" , fmt .Sprintf ("nohup %s > /dev/null 2>&1 &" , scriptFile ))
281
+ err = cmd .Run ()
282
+ if err != nil {
283
+ s .Out .WriteString (fmt .Sprintf ("Failed to start background restart script: %v\n " , err ))
284
+ return
285
+ }
286
+
287
+ s .Out .WriteString ("GPU restart script started in background (check /tmp/datadog-gpu-restart.log)\n " )
241
288
}
242
289
243
290
func setupDatabricksDriver (s * common.Setup ) {
0 commit comments