Skip to content

Commit bde0979

Browse files
vijaysgpensan-tou
authored and committed
fix jobs dashboard missing gpu health in table (#449) (#450) (#451)
(cherry picked from commit 8f67ad10b875f2d2b73880b77c0642383398f8aa)
(cherry picked from commit a959f23a3cfa5319f07b34bb61e19ca934f9935f)
Co-authored-by: Titus Ou <[email protected]>
1 parent 2b2efa1 commit bde0979

File tree

3 files changed

+24
-8
lines changed

3 files changed

+24
-8
lines changed

grafana/dashboard_job.json

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2419,7 +2419,7 @@
24192419
"type": "prometheus",
24202420
"uid": "${DS_PROMETHEUS}"
24212421
},
2422-
"description": "List of all GPUs used by this job.",
2422+
"description": "List of all GPUs used by this job. Health is the last known status of the GPU during a running job in the selected time range.",
24232423
"fieldConfig": {
24242424
"defaults": {
24252425
"color": {
@@ -2569,7 +2569,7 @@
25692569
"showHeader": true,
25702570
"sortBy": [
25712571
{
2572-
"desc": true,
2572+
"desc": false,
25732573
"displayName": "HEALTH"
25742574
},
25752575
{
@@ -2589,8 +2589,8 @@
25892589
"type": "prometheus",
25902590
"uid": "${DS_PROMETHEUS}"
25912591
},
2592-
"editorMode": "builder",
2593-
"expr": "${g_metrics_prefix}gpu_health{job_id!=\"\", job_id=\"$g_job_id\"}",
2592+
"editorMode": "code",
2593+
"expr": "${g_metrics_prefix}gpu_health{job_id!=\"\", job_id=\"$g_job_id\"} or vector(0)",
25942594
"instant": false,
25952595
"legendFormat": "__auto",
25962596
"range": true,
@@ -2601,8 +2601,8 @@
26012601
"type": "prometheus",
26022602
"uid": "${DS_PROMETHEUS}"
26032603
},
2604-
"editorMode": "builder",
2605-
"expr": "${g_metrics_prefix}gpu_health{pod!=\"\", pod=\"$g_pod\"}",
2604+
"editorMode": "code",
2605+
"expr": "${g_metrics_prefix}gpu_health{pod!=\"\", pod=\"$g_pod\"} or vector(0)",
26062606
"hide": false,
26072607
"instant": false,
26082608
"legendFormat": "__auto",
@@ -2672,6 +2672,22 @@
26722672
"hostname (last)": "HOSTNAME"
26732673
}
26742674
}
2675+
},
2676+
{
2677+
"id": "filterByValue",
2678+
"options": {
2679+
"filters": [
2680+
{
2681+
"config": {
2682+
"id": "isNull",
2683+
"options": {}
2684+
},
2685+
"fieldName": "GPU UUID"
2686+
}
2687+
],
2688+
"match": "any",
2689+
"type": "exclude"
2690+
}
26752691
}
26762692
],
26772693
"type": "table"

grafana/dashboard_node.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2232,7 +2232,7 @@
22322232
"showHeader": true,
22332233
"sortBy": [
22342234
{
2235-
"desc": true,
2235+
"desc": false,
22362236
"displayName": "HEALTH"
22372237
},
22382238
{

grafana/dashboard_overview.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2120,7 +2120,7 @@
21202120
"showHeader": true,
21212121
"sortBy": [
21222122
{
2123-
"desc": true,
2123+
"desc": false,
21242124
"displayName": "HEALTH"
21252125
},
21262126
{

0 commit comments

Comments
 (0)