12 | 12 | # See the License for the specific language governing permissions and
13 | 13 | # limitations under the License.
14 | 14 |
15 | | -#!/bin/sh
| 15 | +#!/bin/bash
| 16 | +set -euo pipefail
16 | 17 |
17 | | -# Simple early detection of nvidia card
18 | | -grep 10de102d /proc/bus/pci/devices || exit 0
| 18 | +#################################################
| 19 | +# Settings
19 | 20 |
20 | | -# p2.xlarge
21 | | -# 00f0 10de102d 4b 84000000 100000000c 0 8200000c 0 0 0 1000000 400000000 0 2000000 0 0 0 nvidia
| 21 | +# A place on the host machine to cache these huge 1.6GB+ downloads in between reboots.
| 22 | +ROOTFS_DIR=/rootfs
| 23 | +CACHE_DIR_HOST=/nvidia-bootstrap-cache
| 24 | +CACHE_DIR_CONTAINER="${ROOTFS_DIR}${CACHE_DIR_HOST}"
22 | 25 |
| 26 | +# AWS Instance Types to Nvidia Card Mapping (cut and pasted from AWS docs)
| 27 | +# Load the correct driver for the correct instance type
| 28 | +#   Instances    Product Type    Product Series    Product
| 29 | +#   G2           GRID            GRID Series       GRID K520   <-- I think they meant G3
| 30 | +#   P2           Tesla           K-Series          K-80
| 31 | +#   P3           Tesla           V-Series          V100
| 32 | +# Both P2 and P3 are set for Cuda Toolkit 9.1
| 33 | +# http://www.nvidia.com/Download/index.aspx
| 34 | +declare -A class_to_driver_file
| 35 | +class_to_driver_file=( \
| 36 | +  ["g3"]="http://us.download.nvidia.com/XFree86/Linux-x86_64/367.124/NVIDIA-Linux-x86_64-367.124.run" \
| 37 | +  ["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
| 38 | +  ["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
| 39 | +)
| 40 | +declare -A class_to_driver_checksum
| 41 | +class_to_driver_checksum=( \
| 42 | +  ["g3"]="77f37939efeea4b6505842bed50445971992e303" \
| 43 | +  ["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
| 44 | +  ["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
| 45 | +)
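The two associative arrays above are keyed by the same instance class string, so a driver URL and its sha1 checksum can always be looked up as a pair. A minimal lookup sketch (requires bash 4+ for declare -A; the "p2" key here is only an illustration):

    class="p2"
    driver_url=${class_to_driver_file[$class]}        # the .run installer for that class
    driver_sha1=${class_to_driver_checksum[$class]}   # the matching checksum
    echo "Would fetch $driver_url and verify it against $driver_sha1"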
23 | 46 |
24 | | -# This is pretty annoying.... note this is installed onto the host
25 | | -chroot /rootfs apt-get update
26 | | -chroot /rootfs apt-get install --yes gcc
| 47 | +# CUDA Files that need to be installed ~1.4GB
| 48 | +# First one is main installation
| 49 | +# Subsequent files are patches which need to be applied in order
| 50 | +# Order in the arrays below matters
| 51 | +# https://developer.nvidia.com/cuda-downloads
| 52 | +cuda_files=( \
| 53 | +  "https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/cuda_9.1.85_387.26_linux" \
| 54 | +  "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/1/cuda_9.1.85.1_linux" \
| 55 | +  "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/2/cuda_9.1.85.2_linux" \
| 56 | +  "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/3/cuda_9.1.85.3_linux" \
| 57 | +)
| 58 | +cuda_files_checksums=( \
| 59 | +  "1540658f4fe657dddd8b0899555b7468727d4aa8" \
| 60 | +  "7ec6970ecd81163b0d02ef30d35599e7fd6e97d8" \
| 61 | +  "cfa3b029b58fc117d8ce510a70efc848924dd565" \
| 62 | +  "6269a2c5784b08997edb97ea0020fb4e6c8769ed" \
| 63 | +)
27 | 64 |
28 | | -mkdir -p /rootfs/tmp
29 | | -cd /rootfs/tmp
30 | | -# TODO: We can't download over SSL - presents an akamai cert
31 | | -wget http://us.download.nvidia.com/XFree86/Linux-x86_64/375.39/NVIDIA-Linux-x86_64-375.39.run
32 | | -echo '5e5b9fbf12f4f926ed70c1fe39f71d9d9f154abea0268b1cf035982b34bd7c94baef7667e4f647cc19a62702b46f63b3c3df9f1589261f7138ed2ff151af63cc NVIDIA-Linux-x86_64-375.39.run' | sha3sum -c - || exit 1
33 | | -chmod +x NVIDIA-Linux-x86_64-375.39.run
34 | | -chroot /rootfs /tmp/NVIDIA-Linux-x86_64-375.39.run --accept-license --ui=none
| 65 | +containsElement () { for e in "${@:2}"; do [[ "$e" = "$1" ]] && return 0; done; return 1; }
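containsElement succeeds (returns 0) when its first argument equals any of the remaining arguments; the script uses it below to test whether the detected instance class is one of the mapped GPU classes. A small usage sketch (values illustrative; note that associative-array key order is not guaranteed):

    classnames=${!class_to_driver_file[@]}              # e.g. "g3 p2 p3"
    containsElement "p2" $classnames && echo "known GPU class"
    containsElement "m4" $classnames || echo "not a GPU class"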
35 | 66 |
36 | | -cd /rootfs/tmp
37 | | -wget https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda_8.0.61_375.26_linux-run
38 | | -chmod +x cuda_8.0.61_375.26_linux-run
39 | | -# If we want to install samples as well, add: --samples
40 | | -chroot /rootfs /tmp/cuda_8.0.61_375.26_linux-run --toolkit --silent
| 67 | +#################################################
| 68 | +# Ensure that we are on a proper AWS GPU Instance
41 | 69 |
42 | | -chroot /rootfs nvidia-smi -pm 1
43 | | -chroot /rootfs nvidia-smi -acp 0
44 | | -chroot /rootfs nvidia-smi --auto-boost-default=0
45 | | -chroot /rootfs nvidia-smi --auto-boost-permission=0
46 | | -chroot /rootfs nvidia-smi -ac 2505,875
| 70 | +AWS_INSTANCE_TYPE=$(curl -m 2 -fsSL http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r ".instanceType" || true) # e.g. p2.xlarge
| 71 | +AWS_INSTANCE_CLASS=$(echo $AWS_INSTANCE_TYPE | cut -d . -f 1 || true) # e.g. p2
47 | 72 |
| 73 | +if [[ -z $AWS_INSTANCE_TYPE ]] || [[ -z $AWS_INSTANCE_CLASS ]]; then
| 74 | +  echo "This machine is not an AWS instance"
| 75 | +  echo " Exiting without installing GPU drivers"
| 76 | +  exit 0
| 77 | +fi
48 | 78 |
49 | | -# TODO: Problem ... why is this needed - why didn't this happen when we installed nvidia-uvm?
50 | | -# TODO: Problem ... we need to restart kubelet
| 79 | +classnames=${!class_to_driver_file[@]} # e.g. [ "g3", "p2", "p3" ]
| 80 | +if ! containsElement $AWS_INSTANCE_CLASS $classnames; then
| 81 | +  echo "This machine is an AWS instance, but not a GPU instance"
| 82 | +  echo " Exiting without installing GPU drivers"
| 83 | +  exit 0
| 84 | +fi
51 | 85 |
52 | | -chroot /rootfs /sbin/modprobe nvidia-uvm
| 86 | +echo "Identified machine as AWS_INSTANCE_TYPE[$AWS_INSTANCE_TYPE] AWS_INSTANCE_CLASS[$AWS_INSTANCE_CLASS]"
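The instance-identity document served by the metadata endpoint is a JSON blob, so jq -r ".instanceType" extracts the type and cut keeps the part before the first dot. Roughly (output trimmed; values illustrative):

    curl -fsSL http://169.254.169.254/latest/dynamic/instance-identity/document
    # { "instanceType" : "p2.xlarge", "region" : "us-east-1", ... }
    echo "p2.xlarge" | cut -d . -f 1    # prints: p2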
53 | 87 |
54 | | -if [ "$?" -eq 0 ]; then
55 | | -  # Find out the major device number used by the nvidia-uvm driver
56 | | -  D=`grep nvidia-uvm /proc/devices | awk '{print $1}'`
| 88 | +#################################################
| 89 | +# Install dependencies
| 90 | +
| 91 | +# Install GCC and linux headers on the host machine
| 92 | +# This is unfortunate but necessary. The NVIDIA driver must be
| 93 | +# compiled with the same version of GCC as the kernel. In addition,
| 94 | +# the linux-headers package is machine-image specific.
| 95 | +
| 96 | +if [[ ! -f ${ROOTFS_DIR}/usr/bin/gcc ]]; then
| 97 | +  # Cuda requires regular stock gcc and host headers
| 98 | +  chroot ${ROOTFS_DIR} apt-get update
| 99 | +  # use --no-upgrade so that the c-libs are not upgraded, possibly breaking programs and requiring a restart
| 100 | +  chroot ${ROOTFS_DIR} /bin/bash -c 'apt-get --no-upgrade -y install gcc linux-headers-$(uname -r)'
| 101 | +fi
| 102 | +
| 103 | +if [[ ! -f ${ROOTFS_DIR}/usr/bin/gcc-7 ]]; then
| 104 | +  echo "Installing gcc-7 on host machine"
| 105 | +
| 106 | +  # Temporarily add the debian "buster" repo where gcc-7 lives,
| 107 | +  # but clear it out first if it already exists
| 108 | +  sed -n '/buster/q;p' -i ${ROOTFS_DIR}/etc/apt/sources.list
| 109 | +  echo "deb http://deb.debian.org/debian buster main" >> ${ROOTFS_DIR}/etc/apt/sources.list
| 110 | +
| 111 | +  # Install gcc-7
| 112 | +  chroot ${ROOTFS_DIR} apt-get update
| 113 | +  chroot ${ROOTFS_DIR} /bin/bash -c 'apt-get -y install linux-headers-$(uname -r)'
| 114 | +  chroot ${ROOTFS_DIR} /bin/bash -c 'DEBIAN_FRONTEND=noninteractive apt-get -t buster --no-upgrade -y install gcc-7'
| 115 | +
| 116 | +  # Remove the debian "buster" repo line that was added above
| 117 | +  sed -n '/buster/q;p' -i ${ROOTFS_DIR}/etc/apt/sources.list
| 118 | +  chroot ${ROOTFS_DIR} apt-get update
| 119 | +fi
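The sed idiom used twice above, sed -n '/buster/q;p' -i FILE, prints each line (p) and quits (q) at the first line matching "buster" before printing it, so with -i the file is truncated at that point and everything from the first "buster" entry onward is dropped. A scratch-file demo of the effect (the /tmp path and sample contents are only for illustration):

    printf 'deb http://deb.debian.org/debian stretch main\ndeb http://deb.debian.org/debian buster main\n' > /tmp/sources.test
    sed -n '/buster/q;p' -i /tmp/sources.test
    cat /tmp/sources.test    # only the stretch line remains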
| 120 | +
| 121 | +# Unload open-source nouveau driver if it exists
| 122 | +# The nvidia drivers won't install otherwise
| 123 | +# "g3" instances in particular have this module auto-loaded
| 124 | +chroot ${ROOTFS_DIR} modprobe -r nouveau || true
| 125 | +
| 126 | +
| 127 | +#################################################
| 128 | +# Download and install the Nvidia drivers and cuda libraries
57 | 129 |
58 | | -  chroot /rootfs mknod -m 666 /dev/nvidia-uvm c $D 0
59 | | -else
| 130 | +# Create list of URLs and Checksums by merging driver item with array of cuda files
| 131 | +downloads=(${class_to_driver_file[$AWS_INSTANCE_CLASS]} ${cuda_files[@]})
| 132 | +checksums=(${class_to_driver_checksum[$AWS_INSTANCE_CLASS]} ${cuda_files_checksums[@]})
| 133 | +
| 134 | +# Ensure that the cache directory exists
| 135 | +mkdir -p $CACHE_DIR_CONTAINER
| 136 | +
| 137 | +# Download, verify, and execute each file
| 138 | +length=${#downloads[@]}
| 139 | +for (( i=0; i<${length}; i++ )); do
| 140 | +  download=${downloads[$i]}
| 141 | +  checksum=${checksums[$i]}
| 142 | +  filename=$(basename $download)
| 143 | +  filepath_host="${CACHE_DIR_HOST}/${filename}"
| 144 | +  filepath_container="${CACHE_DIR_CONTAINER}/${filename}"
| 145 | +  filepath_installed="${CACHE_DIR_CONTAINER}/${filename}.installed"
| 146 | +
| 147 | +  echo "Checking for file at $filepath_container"
| 148 | +  if [[ ! -f $filepath_container ]] || ! (echo "$checksum $filepath_container" | sha1sum -c - 2>&1 >/dev/null); then
| 149 | +    echo "Downloading $download"
| 150 | +    curl -L $download > $filepath_container
| 151 | +    chmod a+x $filepath_container
| 152 | +  fi
| 153 | +
| 154 | +  echo "Verifying sha1sum of file at $filepath_container"
| 155 | +  if ! (echo "$checksum $filepath_container" | sha1sum -c -); then
| 156 | +    echo "Failed to verify sha1sum for file at $filepath_container"
| 157 | +    exit 1
| 158 | +  fi
| 159 | +
| 160 | +  # Install the Nvidia driver and cuda libs
| 161 | +  if [[ -f $filepath_installed ]]; then
| 162 | +    echo "Detected prior install of file $filename on host"
| 163 | +  else
| 164 | +    echo "Installing file $filename on host"
| 165 | +    if [[ $download =~ .*NVIDIA.* ]]; then
| 166 | +      # Install the nvidia package (using gcc-7)
| 167 | +      chroot ${ROOTFS_DIR} /bin/bash -c "CC=/usr/bin/gcc-7 $filepath_host --accept-license --silent"
| 168 | +      touch $filepath_installed # Mark successful installation
| 169 | +    elif [[ $download =~ .*local_installers.*cuda.* ]]; then
| 170 | +      # Install the primary cuda library (using gcc)
| 171 | +      chroot ${ROOTFS_DIR} $filepath_host --toolkit --silent --verbose
| 172 | +      touch $filepath_installed # Mark successful installation
| 173 | +    elif [[ $download =~ .*patches.*cuda.* ]]; then
| 174 | +      # Install an update to the primary cuda library (using gcc)
| 175 | +      chroot ${ROOTFS_DIR} $filepath_host --accept-eula --silent
| 176 | +      touch $filepath_installed # Mark successful installation
| 177 | +    else
| 178 | +      echo "Unable to handle file $filepath_host"
| 179 | +      exit 1
| 180 | +    fi
| 181 | +  fi
| 182 | +done
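Because each successful install drops a FILENAME.installed marker next to the cached download, the loop is idempotent across reboots: the download is skipped while the cached copy still matches its sha1, and the install is skipped while the marker exists. A rough look at the host cache after one full run (listing illustrative; filenames follow the URLs above):

    ls /nvidia-bootstrap-cache
    # NVIDIA-Linux-x86_64-390.46.run    NVIDIA-Linux-x86_64-390.46.run.installed
    # cuda_9.1.85_387.26_linux          cuda_9.1.85_387.26_linux.installed    ...
    rm /nvidia-bootstrap-cache/cuda_9.1.85_387.26_linux.installed    # force that installer to re-run on the next boot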
| 183 | +
| 184 | +#################################################
| 185 | +# Now that things are installed, let's output GPU info for debugging
| 186 | +chroot ${ROOTFS_DIR} nvidia-smi --list-gpus
| 187 | +
| 188 | +# Configure and Optimize Nvidia cards now that things are installed
| 189 | +# AWS Optimization Doc
| 190 | +# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/optimize_gpu.html
| 191 | +# Nvidia Doc
| 192 | +# http://developer.download.nvidia.com/compute/DCGM/docs/nvidia-smi-367.38.pdf
| 193 | +
| 194 | +# Common configurations
| 195 | +chroot ${ROOTFS_DIR} nvidia-smi -pm 1
| 196 | +chroot ${ROOTFS_DIR} nvidia-smi --auto-boost-default=0
| 197 | +chroot ${ROOTFS_DIR} nvidia-smi --auto-boost-permission=0
| 198 | +
| 199 | +# Custom configurations per class of nvidia video card
| 200 | +case "$AWS_INSTANCE_CLASS" in
| 201 | +"g2" | "g3")
| 202 | +  chroot ${ROOTFS_DIR} nvidia-smi -ac 2505,1177
| 203 | +  ;;
| 204 | +"p2")
| 205 | +  chroot ${ROOTFS_DIR} nvidia-smi -ac 2505,875
| 206 | +  chroot ${ROOTFS_DIR} nvidia-smi -acp 0
| 207 | +  ;;
| 208 | +"p3")
| 209 | +  chroot ${ROOTFS_DIR} nvidia-smi -ac 877,1530
| 210 | +  chroot ${ROOTFS_DIR} nvidia-smi -acp 0
| 211 | +  ;;
| 212 | +*)
| 213 | +  ;;
| 214 | +esac
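The -ac values are an application-clock pair, "<memory clock>,<graphics clock>" in MHz, and have to be a combination the card actually supports. A sketch of how the numbers above can be sanity-checked on the host (assumes the driver is already installed and loaded):

    chroot /rootfs nvidia-smi -q -d SUPPORTED_CLOCKS | head -n 40
    chroot /rootfs nvidia-smi -ac 877,1530    # e.g. the V100 pair used for p3 above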
| 215 | +
| 216 | +# Load the Kernel Module
| 217 | +if ! chroot ${ROOTFS_DIR} /sbin/modprobe nvidia-uvm; then
60 | 218 |   echo "Unable to modprobe nvidia-uvm"
| 219 | +  exit 1
61 | 220 | fi
| 221 | +
| 222 | +# Ensure that the device node exists
| 223 | +if ! chroot ${ROOTFS_DIR} test -e /dev/nvidia-uvm; then
| 224 | +  # Find out the major device number used by the nvidia-uvm driver
| 225 | +  D=`grep nvidia-uvm /proc/devices | awk '{print $1}'`
| 226 | +  chroot ${ROOTFS_DIR} mknod -m 666 /dev/nvidia-uvm c $D 0
| 227 | +fi
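The nvidia-uvm major device number is assigned dynamically when the module loads, which is why it is read from /proc/devices rather than hard-coded. Roughly what that looks like (the major number 236 is illustrative):

    grep nvidia-uvm /proc/devices
    # 236 nvidia-uvm
    mknod -m 666 /dev/nvidia-uvm c 236 0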
| 228 | +
| 229 | +# Restart Kubelet
| 230 | +echo "Restarting Kubelet"
| 231 | +systemctl restart kubelet.service