Skip to content

Commit 9510799

Browse files
committed
auto-restart agent when GPU monitoring is configured
1 parent ef4847c commit 9510799

File tree

1 file changed

+58
-11
lines changed

1 file changed

+58
-11
lines changed

pkg/fleet/installer/setup/djm/databricks.go

Lines changed: 58 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"io"
1313
"net/http"
1414
"os"
15+
"os/exec"
1516
"regexp"
1617
"strings"
1718
"time"
@@ -223,21 +224,67 @@ func setClearHostTag(s *common.Setup, tagKey, value string) {
223224
}
224225

225226
func setupGPUIntegration(s *common.Setup) {
226-
if os.Getenv("GPU_MONITORING_ENABLED") != "" {
227-
s.Out.WriteString("GPU monitoring enabled via GPU_MONITORING_ENABLED environment variable\n")
227+
gpuEnabled := os.Getenv("GPU_MONITORING_ENABLED")
228228

229-
s.Config.DatadogYAML.CollectGPUTags = true
230-
s.Config.DatadogYAML.EnableNVMLDetection = true
229+
// Check if GPU monitoring is explicitly enabled with "1" or "true"
230+
if gpuEnabled != "1" && gpuEnabled != "true" {
231+
return
232+
}
231233

232-
if s.Config.SystemProbeYAML == nil {
233-
s.Config.SystemProbeYAML = &config.SystemProbeConfig{}
234-
}
235-
s.Config.SystemProbeYAML.GPUMonitoringConfig = config.GPUMonitoringConfig{
236-
Enabled: true,
237-
}
234+
s.Out.WriteString("GPU monitoring enabled via GPU_MONITORING_ENABLED environment variable\n")
235+
236+
s.Config.DatadogYAML.CollectGPUTags = true
237+
s.Config.DatadogYAML.EnableNVMLDetection = true
238+
239+
if s.Config.SystemProbeYAML == nil {
240+
s.Config.SystemProbeYAML = &config.SystemProbeConfig{}
241+
}
242+
s.Config.SystemProbeYAML.GPUMonitoringConfig = config.GPUMonitoringConfig{
243+
Enabled: true,
244+
}
245+
246+
s.Span.SetTag("gpu_monitoring_enabled", "true")
238247

239-
s.Span.SetTag("gpu_monitoring_enabled", "true")
248+
// NVML is not initialized when the databricks init script executes.
249+
// This causes the GPU integration to fail initialisation, we must retry after
250+
// a timeout so the NVML checks are passing and the GPU integration starts polling metrics.
251+
scheduleDelayedAgentRestart(s, 30*time.Second)
252+
}
253+
254+
// scheduleDelayedAgentRestart schedules an agent restart after the specified delay
255+
func scheduleDelayedAgentRestart(s *common.Setup, delay time.Duration) {
256+
s.Out.WriteString(fmt.Sprintf("Scheduling agent restart in %v for GPU monitoring\n", delay))
257+
258+
// Create a shell script that will run in the background
259+
script := fmt.Sprintf(`#!/bin/bash
260+
echo "[$(date -u +%%Y-%%m-%%dT%%H:%%M:%%SZ)] GPU restart script started, waiting %v..." >> /tmp/datadog-gpu-restart.log
261+
sleep %d
262+
echo "[$(date -u +%%Y-%%m-%%dT%%H:%%M:%%SZ)] Restarting Datadog agent for GPU monitoring..." >> /tmp/datadog-gpu-restart.log
263+
service datadog-agent restart >> /tmp/datadog-gpu-restart.log 2>&1
264+
if [ $? -eq 0 ]; then
265+
echo "[$(date -u +%%Y-%%m-%%dT%%H:%%M:%%SZ)] Successfully restarted Datadog agent for GPU monitoring" >> /tmp/datadog-gpu-restart.log
266+
else
267+
echo "[$(date -u +%%Y-%%m-%%dT%%H:%%M:%%SZ)] Failed to restart agent for GPU monitoring" >> /tmp/datadog-gpu-restart.log
268+
fi
269+
`, delay, int(delay.Seconds()))
270+
271+
// Write the script to a temporary file
272+
scriptFile := "/tmp/datadog-gpu-restart.sh"
273+
err := os.WriteFile(scriptFile, []byte(script), 0755)
274+
if err != nil {
275+
s.Out.WriteString(fmt.Sprintf("Failed to write restart script: %v\n", err))
276+
return
240277
}
278+
279+
// Execute the script in the background using nohup and bash -c to properly detach it
280+
cmd := exec.Command("bash", "-c", fmt.Sprintf("nohup %s > /dev/null 2>&1 &", scriptFile))
281+
err = cmd.Run()
282+
if err != nil {
283+
s.Out.WriteString(fmt.Sprintf("Failed to start background restart script: %v\n", err))
284+
return
285+
}
286+
287+
s.Out.WriteString("GPU restart script started in background (check /tmp/datadog-gpu-restart.log)\n")
241288
}
242289

243290
func setupDatabricksDriver(s *common.Setup) {

0 commit comments

Comments
 (0)