
Commit 19b81f0

Merge pull request #4971 from dcwangmit01/update_gpu_hook
[GPU] Updated Kops GPU Setup Hook
2 parents c680e05 + ef958a7 commit 19b81f0

2 files changed: +209 -36 lines changed

hooks/nvidia-bootstrap/image/Dockerfile (+6 -3)

@@ -12,9 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-FROM alpine:3.6
+FROM debian:jessie
+# ^ Cannot be Alpine since it does not support systemctl
+# ^ Systemctl is used to restart kubelet upon successful run of run.sh
+
+RUN apt-get update && apt-get -yq install curl jq

-RUN apk --no-cache add ca-certificates wget && update-ca-certificates
 ADD run.sh /run.sh

-CMD /run.sh
+CMD [ "/bin/bash", "/run.sh" ]

hooks/nvidia-bootstrap/image/run.sh (+203 -33)

@@ -12,50 +12,220 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-#!/bin/sh
+#!/bin/bash
+set -euo pipefail

-# Simple early detection of nvidia card
-grep 10de102d /proc/bus/pci/devices || exit 0
+#################################################
+# Settings

-# p2.xlarge
-# 00f0 10de102d 4b 84000000 100000000c 0 8200000c 0 0 0 1000000 400000000 0 2000000 0 0 0 nvidia
+# A place on the host machine to cache these huge 1.6GB+ downloads in between reboots.
+ROOTFS_DIR=/rootfs
+CACHE_DIR_HOST=/nvidia-bootstrap-cache
+CACHE_DIR_CONTAINER="${ROOTFS_DIR}${CACHE_DIR_HOST}"

+# AWS Instance Types to Nvidia Card Mapping (cut and pasted from AWS docs)
+# Load the correct driver for the correct instance type
+#   Instances  Product Type  Product Series  Product
+#   G2         GRID          GRID Series     GRID K520   <-- I think they meant G3
+#   P2         Tesla         K-Series        K-80
+#   P3         Tesla         V-Series        V100
+# Both P2 and P3 are set for Cuda Toolkit 9.1
+# http://www.nvidia.com/Download/index.aspx
+declare -A class_to_driver_file
+class_to_driver_file=( \
+  ["g3"]="http://us.download.nvidia.com/XFree86/Linux-x86_64/367.124/NVIDIA-Linux-x86_64-367.124.run" \
+  ["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
+  ["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
+)
+declare -A class_to_driver_checksum
+class_to_driver_checksum=( \
+  ["g3"]="77f37939efeea4b6505842bed50445971992e303" \
+  ["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
+  ["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
+)

-# This is pretty annoying.... note this is installed onto the host
-chroot /rootfs apt-get update
-chroot /rootfs apt-get install --yes gcc
+# CUDA Files that need to be installed ~1.4GB
+#   First one is main installation
+#   Subsequent files are patches which need to be applied in order
+#   Order in the arrays below matters
+# https://developer.nvidia.com/cuda-downloads
+cuda_files=( \
+  "https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/cuda_9.1.85_387.26_linux" \
+  "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/1/cuda_9.1.85.1_linux" \
+  "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/2/cuda_9.1.85.2_linux" \
+  "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/3/cuda_9.1.85.3_linux" \
+)
+cuda_files_checksums=( \
+  "1540658f4fe657dddd8b0899555b7468727d4aa8" \
+  "7ec6970ecd81163b0d02ef30d35599e7fd6e97d8" \
+  "cfa3b029b58fc117d8ce510a70efc848924dd565" \
+  "6269a2c5784b08997edb97ea0020fb4e6c8769ed" \
+)

-mkdir -p /rootfs/tmp
-cd /rootfs/tmp
-# TODO: We can't download over SSL - presents an akamai cert
-wget http://us.download.nvidia.com/XFree86/Linux-x86_64/375.39/NVIDIA-Linux-x86_64-375.39.run
-echo '5e5b9fbf12f4f926ed70c1fe39f71d9d9f154abea0268b1cf035982b34bd7c94baef7667e4f647cc19a62702b46f63b3c3df9f1589261f7138ed2ff151af63cc NVIDIA-Linux-x86_64-375.39.run' | sha3sum -c - || exit 1
-chmod +x NVIDIA-Linux-x86_64-375.39.run
-chroot /rootfs /tmp/NVIDIA-Linux-x86_64-375.39.run --accept-license --ui=none
+containsElement () { for e in "${@:2}"; do [[ "$e" = "$1" ]] && return 0; done; return 1; }
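The settings above reduce to an associative-array lookup keyed by instance class plus the containsElement membership helper. A standalone sketch of the same pattern, with placeholder URLs rather than the real driver links:

#!/bin/bash
# Placeholder mapping (not the real driver URLs used by the hook)
declare -A class_to_driver_file=( ["g3"]="https://example.com/g3.run" ["p2"]="https://example.com/p2.run" )
containsElement () { for e in "${@:2}"; do [[ "$e" = "$1" ]] && return 0; done; return 1; }

AWS_INSTANCE_CLASS="p2"
if containsElement "$AWS_INSTANCE_CLASS" ${!class_to_driver_file[@]}; then
  echo "driver: ${class_to_driver_file[$AWS_INSTANCE_CLASS]}"
else
  echo "no driver mapping for $AWS_INSTANCE_CLASS"
fi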

-cd /rootfs/tmp
-wget https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda_8.0.61_375.26_linux-run
-chmod +x cuda_8.0.61_375.26_linux-run
-# If we want to install samples as well, add: --samples
-chroot /rootfs /tmp/cuda_8.0.61_375.26_linux-run --toolkit --silent
+#################################################
+# Ensure that we are on a proper AWS GPU Instance

-chroot /rootfs nvidia-smi -pm 1
-chroot /rootfs nvidia-smi -acp 0
-chroot /rootfs nvidia-smi --auto-boost-default=0
-chroot /rootfs nvidia-smi --auto-boost-permission=0
-chroot /rootfs nvidia-smi -ac 2505,875
+AWS_INSTANCE_TYPE=$(curl -m 2 -fsSL http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r ".instanceType" || true) # eg: p2.micro
+AWS_INSTANCE_CLASS=$(echo $AWS_INSTANCE_TYPE | cut -d . -f 1 || true) # eg: p2

+if [[ -z $AWS_INSTANCE_TYPE ]] || [[ -z $AWS_INSTANCE_CLASS ]]; then
+  echo "This machine is not an AWS instance"
+  echo "  Exiting without installing GPU drivers"
+  exit 0
+fi

-# TODO: Problem ... why is this needed - why didn't this happen when we installed nvidia-uvm?
-# TODO: Problem ... we need to restart kubelet
+classnames=${!class_to_driver_file[@]} # e.g. [ "g3", "p2", "p3" ]
+if ! containsElement $AWS_INSTANCE_CLASS $classnames; then
+  echo "This machine is an AWS instance, but not a GPU instance"
+  echo "  Exiting without installing GPU drivers"
+  exit 0
+fi

-chroot /rootfs /sbin/modprobe nvidia-uvm
+echo "Identified machine as AWS_INSTANCE_TYPE[$AWS_INSTANCE_TYPE] AWS_INSTANCE_CLASS[$AWS_INSTANCE_CLASS]"
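The detection above relies on the EC2 instance-identity document, a small JSON blob served by the instance metadata endpoint. A sketch of inspecting it by hand on a node (the region field shown here is part of the standard document, not something the hook reads):

# Dump the whole identity document, then pull out a couple of fields with jq
curl -m 2 -fsSL http://169.254.169.254/latest/dynamic/instance-identity/document
curl -m 2 -fsSL http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r '.instanceType, .region'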

-if [ "$?" -eq 0 ]; then
-  # Find out the major device number used by the nvidia-uvm driver
-  D=`grep nvidia-uvm /proc/devices | awk '{print $1}'`
+#################################################
+# Install dependencies
+
+# Install GCC and linux headers on the host machine
+# This is unfortunate but necessary. The NVIDIA driver build must be
+# compiled with the same version of GCC as the kernel. In addition,
+# linux-headers are machine image specific.
+
+if [[ ! -f ${ROOTFS_DIR}/usr/bin/gcc ]]; then
+  # Cuda requires regular stock gcc and host headers
+  chroot ${ROOTFS_DIR} apt-get update
+  # use --no-upgrade so that the c-libs are not upgraded, possibly breaking programs and requiring restart
+  chroot ${ROOTFS_DIR} /bin/bash -c 'apt-get --no-upgrade -y install gcc linux-headers-$(uname -r)'
+fi
+
+if [[ ! -f ${ROOTFS_DIR}/usr/bin/gcc-7 ]]; then
+  echo "Installing gcc-7 on host machine"
+
+  # Temporarily add the debian "buster" repo where gcc-7 lives,
+  # but first clear it out if it already exists
+  sed -n '/buster/q;p' -i ${ROOTFS_DIR}/etc/apt/sources.list
+  echo "deb http://deb.debian.org/debian buster main" >> ${ROOTFS_DIR}/etc/apt/sources.list
+
+  # Install gcc-7
+  chroot ${ROOTFS_DIR} apt-get update
+  chroot ${ROOTFS_DIR} /bin/bash -c 'apt-get -y install linux-headers-$(uname -r)'
+  chroot ${ROOTFS_DIR} /bin/bash -c 'DEBIAN_FRONTEND=noninteractive apt-get -t buster --no-upgrade -y install gcc-7'
+
+  # Remove the debian "buster" repo line that was added above
+  sed -n '/buster/q;p' -i ${ROOTFS_DIR}/etc/apt/sources.list
+  chroot ${ROOTFS_DIR} apt-get update
+fi
+
+# Unload the open-source nouveau driver if it exists
+# The nvidia drivers won't install otherwise
+# "g3" instances in particular have this module auto-loaded
+chroot ${ROOTFS_DIR} modprobe -r nouveau || true
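The comment above about GCC versions can be checked directly on a node: the running kernel reports the compiler it was built with in /proc/version. A quick sketch for comparing it against the gcc that will build the NVIDIA kernel module:

# Compiler version the running kernel was built with
cat /proc/version
# Compiler that will be used for the driver build
gcc --version | head -n 1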
+#################################################
+# Download and install the Nvidia drivers and cuda libraries

-  chroot /rootfs mknod -m 666 /dev/nvidia-uvm c $D 0
-else
+# Create list of URLs and Checksums by merging driver item with array of cuda files
+downloads=(${class_to_driver_file[$AWS_INSTANCE_CLASS]} ${cuda_files[@]})
+checksums=(${class_to_driver_checksum[$AWS_INSTANCE_CLASS]} ${cuda_files_checksums[@]})
+
+# Ensure that the cache directory exists
+mkdir -p $CACHE_DIR_CONTAINER
+
+# Download, verify, and execute each file
+length=${#downloads[@]}
+for (( i=0; i<${length}; i++ )); do
+  download=${downloads[$i]}
+  checksum=${checksums[$i]}
+  filename=$(basename $download)
+  filepath_host="${CACHE_DIR_HOST}/${filename}"
+  filepath_container="${CACHE_DIR_CONTAINER}/${filename}"
+  filepath_installed="${CACHE_DIR_CONTAINER}/${filename}.installed"
+
+  echo "Checking for file at $filepath_container"
+  if [[ ! -f $filepath_container ]] || ! (echo "$checksum  $filepath_container" | sha1sum -c - 2>&1 >/dev/null); then
+    echo "Downloading $download"
+    curl -L $download > $filepath_container
+    chmod a+x $filepath_container
+  fi
+
+  echo "Verifying sha1sum of file at $filepath_container"
+  if ! (echo "$checksum  $filepath_container" | sha1sum -c -); then
+    echo "Failed to verify sha1sum for file at $filepath_container"
+    exit 1
+  fi
+
+  # Install the Nvidia driver and cuda libs
+  if [[ -f $filepath_installed ]]; then
+    echo "Detected prior install of file $filename on host"
+  else
+    echo "Installing file $filename on host"
+    if [[ $download =~ .*NVIDIA.* ]]; then
+      # Install the nvidia package (using gcc-7)
+      chroot ${ROOTFS_DIR} /bin/bash -c "CC=/usr/bin/gcc-7 $filepath_host --accept-license --silent"
+      touch $filepath_installed # Mark successful installation
+    elif [[ $download =~ .*local_installers.*cuda.* ]]; then
+      # Install the primary cuda library (using gcc)
+      chroot ${ROOTFS_DIR} $filepath_host --toolkit --silent --verbose
+      touch $filepath_installed # Mark successful installation
+    elif [[ $download =~ .*patches.*cuda.* ]]; then
+      # Install an update to the primary cuda library (using gcc)
+      chroot ${ROOTFS_DIR} $filepath_host --accept-eula --silent
+      touch $filepath_installed # Mark successful installation
+    else
+      echo "Unable to handle file $filepath_host"
+      exit 1
+    fi
+  fi
+done
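The loop above caches each installer on the host and verifies it with sha1sum in check mode, which reads lines of the form "<sha1>  <path>" from stdin. A standalone sketch of that check (the checksum shown is the well-known SHA-1 of an empty file, used only as a placeholder):

#!/bin/bash
# Placeholder file and checksum (SHA-1 of an empty file)
file=$(mktemp)
checksum="da39a3ee5e6b4b0d3255bfef95601890afd80709"
if echo "${checksum}  ${file}" | sha1sum -c -; then
  echo "checksum OK, safe to execute"
else
  echo "checksum mismatch, re-download" >&2
fi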
+#################################################
+# Now that things are installed, let's output GPU info for debugging
+chroot ${ROOTFS_DIR} nvidia-smi --list-gpus
+
+# Configure and Optimize Nvidia cards now that things are installed
+# AWS Optimization Doc
+#   https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/optimize_gpu.html
+# Nvidia Doc
+#   http://developer.download.nvidia.com/compute/DCGM/docs/nvidia-smi-367.38.pdf
+
+# Common configurations
+chroot ${ROOTFS_DIR} nvidia-smi -pm 1
+chroot ${ROOTFS_DIR} nvidia-smi --auto-boost-default=0
+chroot ${ROOTFS_DIR} nvidia-smi --auto-boost-permission=0
+
+# Custom configurations per class of nvidia video card
+case "$AWS_INSTANCE_CLASS" in
+"g2" | "g3")
+  chroot ${ROOTFS_DIR} nvidia-smi -ac 2505,1177
+  ;;
+"p2")
+  chroot ${ROOTFS_DIR} nvidia-smi -ac 2505,875
+  chroot ${ROOTFS_DIR} nvidia-smi -acp 0
+  ;;
+"p3")
+  chroot ${ROOTFS_DIR} nvidia-smi -ac 877,1530
+  chroot ${ROOTFS_DIR} nvidia-smi -acp 0
+  ;;
+*)
+  ;;
+esac
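The -ac pairs above set application clocks as <memory,graphics> in MHz. If those values ever need revisiting, the card can report what it supports once the driver is installed; a sketch, run on the host:

# List the memory,graphics clock pairs the GPU supports
nvidia-smi -q -d SUPPORTED_CLOCKS
# Show the application clocks currently applied
nvidia-smi --query-gpu=clocks.applications.memory,clocks.applications.graphics --format=csv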
+# Load the Kernel Module
+if ! chroot ${ROOTFS_DIR} /sbin/modprobe nvidia-uvm; then
   echo "Unable to modprobe nvidia-uvm"
+  exit 1
 fi
+
+# Ensure that the device node exists
+if ! chroot ${ROOTFS_DIR} test -e /dev/nvidia-uvm; then
+  # Find out the major device number used by the nvidia-uvm driver
+  D=`grep nvidia-uvm /proc/devices | awk '{print $1}'`
+  chroot ${ROOTFS_DIR} mknod -m 666 /dev/nvidia-uvm c $D 0
+fi
+
+# Restart Kubelet
+echo "Restarting Kubelet"
+systemctl restart kubelet.service
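Once the hook has completed, the end state can be spot-checked on the node itself. A sketch of what that might look like (these commands are not part of the hook):

# Driver is loaded and GPUs are visible
nvidia-smi --list-gpus
# The uvm device node created (or confirmed) by the hook exists
ls -l /dev/nvidia-uvm
# Kubelet came back after the restart at the end of run.sh
systemctl is-active kubelet.service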
