Skip to content

Commit ee94bdc

Browse files
Slurm on GKE - Guide (#864)
* Slurm on GKE - Guide and code * Update README.md * remove empty line * add new lines at the end of each file * remove embedded image * modules moved to the shared modules directory * module references updated * Update README.md * Update README.md * Update README.md * Revert "Update README.md" This reverts commit 76703f0. * pinned version * Update providers.tf
1 parent d72c274 commit ee94bdc

33 files changed

+2575
-0
lines changed

modules/slurm-cluster/main.tf

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/**
2+
* Copyright 2024 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
18+
locals {
  # Directory that holds the manifest templates: the caller-supplied
  # var.templates_path (with "~" expanded) when it is set, otherwise the
  # "manifest-templates" directory inside this module.
  wl_templates_path = (
    var.templates_path != null
    ? pathexpand(var.templates_path)
    : "${path.module}/manifest-templates"
  )

  # Full paths of every file in that directory whose name matches the
  # "[0-9]*yml" pattern (i.e. starts with a digit and ends in "yml").
  wl_templates = [
    for template_name in fileset(local.wl_templates_path, "[0-9]*yml") :
    join("/", [local.wl_templates_path, template_name])
  ]
}
29+
30+
# Namespace for the Slurm workloads. It is managed here only when
# var.namespace_create is true; otherwise zero instances are created.
resource "kubernetes_namespace" "default" {
  count = var.namespace_create ? 1 : 0

  metadata {
    name = var.namespace
  }
}
36+
37+
# Renders every template listed in local.wl_templates and applies the result
# as a Kubernetes manifest. One resource instance is created per template
# path.
resource "kubernetes_manifest" "default" {
  for_each = toset(local.wl_templates)

  # Each template receives the target namespace and the whole cluster_config
  # object, and must render to a single YAML document.
  manifest = yamldecode(templatefile(each.value, {
    namespace      = var.namespace
    cluster_config = var.cluster_config
  }))

  timeouts {
    create = "30m"
  }

  # Take ownership of fields even when another field manager already
  # claims them.
  field_manager {
    force_conflicts = true
  }

  # The rendered manifests are namespaced. When this module also creates the
  # namespace, it has to exist before any manifest is applied; with
  # namespace_create = false the referenced resource has zero instances and
  # this dependency is a no-op.
  depends_on = [kubernetes_namespace.default]
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#
2+
# Copyright 2024 Google LLC
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
# ConfigMap carrying the Slurm configuration files: slurm.conf (which
# includes cloud.conf), cloud_gres.conf, gres.conf and cgroup.conf.
#
# SlurmctldParameters is defined exactly once, in slurm.conf, with both
# options comma separated. It was previously set in slurm.conf and again in
# the included cloud.conf; a repeated key is not accumulated by Slurm, so one
# of the two options was silently dropped.
apiVersion: v1
kind: ConfigMap
metadata:
  name: slurm-conf-configmap
  namespace: ${namespace}
data:
  slurm.conf: |
    # slurm.conf
    #
    # See the slurm.conf man page for more information.
    #
    ClusterName=linux
    SlurmctldHost=slurmctld-0
    #
    SlurmUser=slurm
    SlurmctldPort=6820-6830
    SlurmdPort=6818
    AuthType=auth/munge
    StateSaveLocation=/var/spool/slurmctld
    SlurmdSpoolDir=/var/spool/slurmd
    SwitchType=switch/none
    MpiDefault=pmix
    SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
    SlurmdPidFile=/var/run/slurmd/slurmd.pid
    ProctrackType=proctrack/linuxproc
    ReturnToService=2
    #
    # TIMERS
    SlurmctldTimeout=300
    SlurmdTimeout=30
    InactiveLimit=0
    MinJobAge=300
    KillWait=30
    Waittime=0
    #
    # SCHEDULING
    SchedulerType=sched/backfill
    SelectType=select/cons_tres
    SelectTypeParameters=CR_CPU_Memory
    #
    # LOGGING
    SlurmctldDebug=3
    SlurmctldLogFile=/var/log/slurm/slurmctld.log
    SlurmdDebug=3
    SlurmdLogFile=/var/log/slurm/slurmd.log
    JobCompType=jobcomp/filetxt
    JobCompLoc=/var/log/slurm/jobcomp.log
    #
    # ACCOUNTING
    JobAcctGatherType=jobacct_gather/linux
    JobAcctGatherFrequency=30
    #
    AccountingStorageType=accounting_storage/slurmdbd
    AccountingStorageHost=slurmdbd
    AccountingStoragePort=6819
    #
    SlurmctldParameters=cloud_reg_addrs,enable_configless

    # CLOUD CONFIGURATIONS
    MaxNodeCount=64000
    include cloud.conf
  cloud.conf: |
    PrivateData=cloud
    # SlurmctldParameters (including enable_configless) is set once in slurm.conf.
    ## GRES
    GresTypes=gpu
    AccountingStorageTRES=gres/gpu
    DebugFlags=Gres
    TreeWidth=128

    # NODES
    NodeName=DEFAULT State=UNKNOWN RealMemory=15000 CPUs=4 CoresPerSocket=2 ThreadsPerCore=2 Gres=gpu:1
    NodeName=slurmd-[0-39] State=CLOUD Gres=gpu:1
    NodeSet=slurmdnodeset Nodes=slurmd-[0-39]

    NodeName=DEFAULT State=UNKNOWN RealMemory=30000 CPUs=8 CoresPerSocket=2 ThreadsPerCore=2 Gres=gpu:2
    NodeName=slurmd1-[0-39] State=CLOUD Gres=gpu:2
    NodeSet=slurmd1nodeset Nodes=slurmd1-[0-39]

    # PARTITIONS
    PartitionName=all Default=yes Nodes=ALL MaxTime=INFINITE State=UP

    PropagateResourceLimitsExcept=MEMLOCK

    PartitionName=1gpunodes Nodes=slurmdnodeset State=UP DefMemPerCPU=7007 SuspendTime=300 Oversubscribe=Exclusive PowerDownOnIdle=YES ResumeTimeout=300 SuspendTimeout=120
    PartitionName=2gpunodes Nodes=slurmd1nodeset State=UP DefMemPerCPU=7007 SuspendTime=300 Oversubscribe=Exclusive PowerDownOnIdle=YES ResumeTimeout=300 SuspendTimeout=120

  cloud_gres.conf: |
    NodeName=slurmd-[0-39] Name=gpu File=/dev/nvidia0
    NodeName=slurmd1-[0-39] Name=gpu File=/dev/nvidia[0-1]
  gres.conf: |
    NodeName=slurmd-[0-39] Name=gpu File=/dev/nvidia0
    NodeName=slurmd1-[0-39] Name=gpu File=/dev/nvidia[0-1]
  cgroup.conf: |
    ###
    #
    # Slurm cgroup support configuration file
    #
    # See man slurm.conf and man cgroup.conf for further
    # information on cgroup configuration parameters
    #--
    ConstrainCores=yes
    ConstrainDevices=yes
    ConstrainRAMSpace=yes
    ConstrainSwapSpace=yes
    IgnoreSystemd=yes
123+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# MIT License
2+
3+
# Copyright (c) 2019 Giovanni Torres
4+
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
12+
# The above copyright notice and this permission notice shall be included in all
13+
# copies or substantial portions of the Software.
14+
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
# ConfigMap carrying slurmdbd.conf. The storage host and user are filled in
# from the database settings of cluster_config when the template is rendered.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: ${namespace}
  name: slurmdbd-conf-configmap
data:
  slurmdbd.conf: |
    #
    # Example slurmdbd.conf file.
    #
    # See the slurmdbd.conf man page for more information.
    #
    # Authentication info
    AuthType=auth/munge
    #
    # slurmDBD info
    DbdAddr=slurmdbd
    DbdHost=slurmdbd
    SlurmUser=slurm
    DebugLevel=4
    LogFile=/var/log/slurm/slurmdbd.log
    PidFile=/var/run/slurmdbd/slurmdbd.pid
    #
    # Database info
    StorageType=accounting_storage/mysql
    StorageHost=${cluster_config.database.host}
    StorageUser=${cluster_config.database.user}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# MIT License
2+
3+
# Copyright (c) 2019 Giovanni Torres
4+
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
12+
# The above copyright notice and this permission notice shall be included in all
13+
# copies or substantial portions of the Software.
14+
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
# Secret holding the database password taken from cluster_config.
#
# NOTE(review): the value is placed under "data" exactly as supplied, whereas
# the munge key secret base64-encodes its value inside the template. This
# presumably expects the caller to pass an already base64-encoded password;
# confirm against the module's variable documentation.
apiVersion: v1
kind: Secret
metadata:
  namespace: ${namespace}
  name: database-auth-secret
data:
  password: ${cluster_config.database.password}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# MIT License
2+
3+
# Copyright (c) 2019 Giovanni Torres
4+
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
12+
# The above copyright notice and this permission notice shall be included in all
13+
# copies or substantial portions of the Software.
14+
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
# Secret holding the munge key. The key from cluster_config is base64-encoded
# during template rendering before it is placed under "data".
apiVersion: v1
kind: Secret
metadata:
  namespace: ${namespace}
  name: munge-key-secret
data:
  munge.key: ${base64encode(cluster_config.munge.key)}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# MIT License
2+
3+
# Copyright (c) 2019 Giovanni Torres
4+
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
12+
# The above copyright notice and this permission notice shall be included in all
13+
# copies or substantial portions of the Software.
14+
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
# Claim for the shared Slurm volume: ReadWriteMany access on the
# "standard-rwx" storage class, sized in Gi from the storage settings of
# cluster_config.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: ${namespace}
  name: slurm-shared-storage
spec:
  accessModes: [ReadWriteMany]
  storageClassName: standard-rwx
  resources:
    requests:
      storage: ${cluster_config.storage.size_gb}Gi
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# MIT License
2+
3+
# Copyright (c) 2019 Giovanni Torres
4+
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
12+
# The above copyright notice and this permission notice shall be included in all
13+
# copies or substantial portions of the Software.
14+
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
# Claim for the MySQL component's volume: ReadWriteOnce access, sized in Gi
# from the database settings of cluster_config. No storage class is named.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: ${namespace}
  name: var-lib-mysql
  labels:
    app.kubernetes.io/component: mysql
    app.kubernetes.io/name: slurm
spec:
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: ${cluster_config.database.storage_size_gb}Gi
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# MIT License
2+
3+
# Copyright (c) 2019 Giovanni Torres
4+
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
12+
# The above copyright notice and this permission notice shall be included in all
13+
# copies or substantial portions of the Software.
14+
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
# Claim for the slurmctld component's volume: ReadWriteOnce access with a
# fixed 100Mi request. No storage class is named.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: ${namespace}
  name: var-spool-slurmctld
  labels:
    app.kubernetes.io/component: slurmctld
    app.kubernetes.io/name: slurm
spec:
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 100Mi

0 commit comments

Comments
 (0)