Skip to content

Commit c9213c2

Browse files
committed
Added NVIDIA DCGM
1 parent edb0a9d commit c9213c2

13 files changed

+409
-0
lines changed

platforms/gke/base/_shared_config/workloads_variables.tf

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,21 @@ variable "lws_version" {
3737
description = "Version of LeaderWorkerSet (LWS) (https://github.com/kubernetes-sigs/lws/) to install."
3838
type = string
3939
}
40+
41+
variable "nvidia_dcgm_exporter_image" {
42+
default = "nvcr.io/nvidia/k8s/dcgm-exporter:4.1.1-4.0.4-ubuntu22.04"
43+
description = "NVIDIA Data Center GPU Manager (DCGM) Exporter image (https://hub.docker.com/r/nvidia/dcgm-exporter/tags) to install."
44+
type = string
45+
}
46+
47+
variable "nvidia_dcgm_image" {
48+
default = "nvcr.io/nvidia/cloud-native/dcgm:4.1.1-1-ubuntu22.04"
49+
description = "NVIDIA Data Center GPU Manager (DCGM) image (https://hub.docker.com/r/nvidia/dcgm/tags) to install."
50+
type = string
51+
}
52+
53+
variable "nvidia_dcgm_version" {
54+
default = "4.1.1-1"
55+
description = "NVIDIA Data Center GPU Manager (DCGM) version to install. The corresponding image should be used for the 'nvidia_dcgm_image' and 'nvidia_dcgm_exporter_image' variables."
56+
type = string
57+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../_shared_config/cluster.auto.tfvars
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../_shared_config/cluster_variables.tf
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../_shared_config/platform.auto.tfvars
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../_shared_config/platform_variables.tf
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../_shared_config/workloads.auto.tfvars
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../_shared_config/workloads_variables.tf
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
locals {
16+
kubeconfig_directory = "${path.module}/../../../kubernetes/kubeconfig"
17+
kubeconfig_file = "${local.kubeconfig_directory}/${local.kubeconfig_file_name}"
18+
19+
manifests_directory = "${local.namespace_directory}/gmp-public"
20+
namespace_directory = "${local.manifests_directory_root}/namespace"
21+
version_manifests_directory = "${path.module}/manifests/dcgm-${var.nvidia_dcgm_version}"
22+
}
23+
24+
data "local_file" "kubeconfig" {
25+
filename = local.kubeconfig_file
26+
}
27+
28+
resource "terraform_data" "manifests" { # Stages the DCGM manifests on local disk via a local-exec provisioner.
29+
input = { # Values exposed to the provisioner through self.input.
30+
manifests_dir = local.manifests_directory
31+
version_manifests_dir = local.version_manifests_directory
32+
version = var.nvidia_dcgm_version # Track the DCGM version; this module does not install LWS.
33+
}
34+
35+
provisioner "local-exec" { # Copies templates/workload into the versioned directory, then that directory into manifests_dir.
36+
command = <<EOT
37+
mkdir -p "${self.input.version_manifests_dir}" && \
38+
mkdir -p "${self.input.manifests_dir}" && \
39+
cp -r templates/workload/* "${self.input.version_manifests_dir}/" && \
40+
cp -r "${self.input.version_manifests_dir}"/* "${self.input.manifests_dir}/"
41+
EOT
42+
interpreter = ["bash", "-c"]
43+
working_dir = path.module # Relative paths in the command resolve against this module.
44+
}
45+
46+
triggers_replace = { # Any change here replaces the resource and re-runs the provisioner.
47+
manifests_dir = local.manifests_directory
48+
version_manifests_dir = local.version_manifests_directory
49+
version = var.nvidia_dcgm_version # Must match input.version so only DCGM version changes retrigger staging.
50+
}
51+
}
52+
53+
module "kubectl_apply_manifests" {
54+
depends_on = [
55+
terraform_data.manifests,
56+
module.kubectl_apply_namespace,
57+
]
58+
59+
source = "../../../modules/kubectl_apply"
60+
61+
apply_server_side = true
62+
kubeconfig_file = data.local_file.kubeconfig.filename
63+
manifest = local.version_manifests_directory
64+
manifest_includes_namespace = true
65+
use_kustomize = true
66+
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Reference: https://cloud.google.com/stackdriver/docs/managed-prometheus/exporters/nvidia-dcgm
16+
apiVersion: apps/v1
17+
kind: DaemonSet
18+
metadata:
19+
name: nvidia-dcgm
20+
labels:
21+
app: nvidia-dcgm
22+
spec:
23+
selector:
24+
matchLabels:
25+
app: nvidia-dcgm
26+
updateStrategy:
27+
type: RollingUpdate
28+
template:
29+
metadata:
30+
labels:
31+
name: nvidia-dcgm
32+
app: nvidia-dcgm
33+
spec:
34+
affinity:
35+
nodeAffinity:
36+
requiredDuringSchedulingIgnoredDuringExecution:
37+
nodeSelectorTerms:
38+
- matchExpressions:
39+
- key: cloud.google.com/gke-accelerator
40+
operator: Exists
41+
tolerations:
42+
- operator: "Exists"
43+
volumes:
44+
- name: nvidia-install-dir-host
45+
hostPath:
46+
path: /home/kubernetes/bin/nvidia
47+
type: Directory
48+
containers:
49+
- name: nvidia-dcgm
50+
# https://hub.docker.com/r/nvidia/dcgm/tags
51+
image: ${dcgm_image}
52+
command: ["nv-hostengine", "-n", "-b", "ALL"]
53+
ports:
54+
- containerPort: 5555
55+
hostPort: 5555
56+
securityContext:
57+
privileged: true
58+
volumeMounts:
59+
- name: nvidia-install-dir-host
60+
mountPath: /usr/local/nvidia
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Reference: https://cloud.google.com/stackdriver/docs/managed-prometheus/exporters/nvidia-dcgm
16+
apiVersion: monitoring.googleapis.com/v1
17+
kind: ClusterPodMonitoring
18+
metadata:
19+
name: nvidia-dcgm-exporter
20+
labels:
21+
app.kubernetes.io/name: nvidia-dcgm-exporter
22+
app.kubernetes.io/part-of: google-cloud-managed-prometheus
23+
spec:
24+
selector:
25+
matchLabels:
26+
app.kubernetes.io/name: nvidia-dcgm-exporter
27+
endpoints:
28+
- port: metrics
29+
interval: 30s
30+
targetLabels:
31+
metadata: []
32+

0 commit comments

Comments
 (0)