From c9acf8aede2a2bd44bd2eaa17d4a8e95b7bc57ed Mon Sep 17 00:00:00 2001 From: Syed Tousif Ahmed Date: Mon, 31 Oct 2022 14:34:24 -0700 Subject: [PATCH] Installs NCCL from redist, uses system NCCL, and adds pypi RPATH --- common/install_cuda.sh | 12 +++++++++++- manywheel/build_cuda.sh | 5 ++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/common/install_cuda.sh b/common/install_cuda.sh index 77d190011..d3b754afe 100644 --- a/common/install_cuda.sh +++ b/common/install_cuda.sh @@ -98,7 +98,7 @@ function install_116 { } function install_117 { - echo "Installing CUDA 11.7 and CuDNN 8.3" + echo "Installing CUDA 11.7 and CuDNN 8.3 and NCCL 2.14" rm -rf /usr/local/cuda-11.7 /usr/local/cuda # install CUDA 11.7.0 in the same container wget -q https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda_11.7.0_515.43.04_linux.run @@ -116,6 +116,16 @@ function install_117 { cd .. rm -rf tmp_cudnn ldconfig + + # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses + mkdir tmp_nccl && cd tmp_nccl + wget -q https://developer.download.nvidia.com/compute/redist/nccl/v2.14/nccl_2.14.3-1+cuda11.7_x86_64.txz + tar xf nccl_2.14.3-1+cuda11.7_x86_64.txz + cp -a nccl_2.14.3-1+cuda11.7_x86_64/include/* /usr/local/cuda/include/ + cp -a nccl_2.14.3-1+cuda11.7_x86_64/lib/* /usr/local/cuda/lib64/ + cd .. + rm -rf tmp_nccl + ldconfig } function prune_102 { diff --git a/manywheel/build_cuda.sh b/manywheel/build_cuda.sh index ed8ab4cb5..f8cbed8a4 100644 --- a/manywheel/build_cuda.sh +++ b/manywheel/build_cuda.sh @@ -275,15 +275,18 @@ elif [[ $CUDA_VERSION == "11.7" ]]; then "libcublasLt.so.11" ) else - echo "Using cudnn and cublas from pypi." + echo "Using cudnn, cublas, and nccl from pypi." CUDA_RPATHS=( '$ORIGIN/../../nvidia/cublas/lib' '$ORIGIN/../../nvidia/cudnn/lib' + '$ORIGIN/../../nvidia/nccl/lib' ) CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}") export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib' export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN' export FORCE_RPATH="--force-rpath" + export USE_STATIC_NCCL=0 + export USE_SYSTEM_NCCL=1 fi else echo "Unknown cuda version $CUDA_VERSION"