Skip to content

Commit ee59264

Browse files
authored
Adds infra to use nvidia dependencies from pypi and cleans up patches (#1196)
* Installs NCCL from redist, uses system NCCL, and adds pypi RPATH
* Cleans up nvrtc patches and adds it using main script
* Fixes typo
* Adds more dependencies and builds torch with dynamic linking
* NCCL dirs have to be specified; otherwise a different version is picked up
* Handles 11.8
* Adds echo message for NCCL 2.15
1 parent ef93e89 commit ee59264

File tree

3 files changed

+44
-80
lines changed

3 files changed

+44
-80
lines changed

common/install_cuda.sh

+22-2
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ function install_116 {
2424
}
2525

2626
function install_117 {
27-
echo "Installing CUDA 11.7 and CuDNN 8.5"
27+
echo "Installing CUDA 11.7 and CuDNN 8.5 and NCCL 2.14"
2828
rm -rf /usr/local/cuda-11.7 /usr/local/cuda
2929
# install CUDA 11.7.0 in the same container
3030
wget -q https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda_11.7.0_515.43.04_linux.run
@@ -42,10 +42,20 @@ function install_117 {
4242
cd ..
4343
rm -rf tmp_cudnn
4444
ldconfig
45+
46+
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
47+
mkdir tmp_nccl && cd tmp_nccl
48+
wget -q https://developer.download.nvidia.com/compute/redist/nccl/v2.14/nccl_2.14.3-1+cuda11.7_x86_64.txz
49+
tar xf nccl_2.14.3-1+cuda11.7_x86_64.txz
50+
cp -a nccl_2.14.3-1+cuda11.7_x86_64/include/* /usr/local/cuda/include/
51+
cp -a nccl_2.14.3-1+cuda11.7_x86_64/lib/* /usr/local/cuda/lib64/
52+
cd ..
53+
rm -rf tmp_nccl
54+
ldconfig
4555
}
4656

4757
function install_118 {
48-
echo "Installing CUDA 11.8 and cuDNN 8.5"
58+
echo "Installing CUDA 11.8 and cuDNN 8.5 and NCCL 2.15"
4959
rm -rf /usr/local/cuda-11.8 /usr/local/cuda
5060
# install CUDA 11.8.0 in the same container
5161
wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
@@ -63,6 +73,16 @@ function install_118 {
6373
cd ..
6474
rm -rf tmp_cudnn
6575
ldconfig
76+
77+
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
78+
mkdir tmp_nccl && cd tmp_nccl
79+
wget -q https://developer.download.nvidia.com/compute/redist/nccl/v2.15.5/nccl_2.15.5-1+cuda11.8_x86_64.txz
80+
tar xf nccl_2.15.5-1+cuda11.8_x86_64.txz
81+
cp -a nccl_2.15.5-1+cuda11.8_x86_64/include/* /usr/local/cuda/include/
82+
cp -a nccl_2.15.5-1+cuda11.8_x86_64/lib/* /usr/local/cuda/lib64/
83+
cd ..
84+
rm -rf tmp_nccl
85+
ldconfig
6686
}
6787

6888
function prune_116 {

manywheel/build_cuda.sh

+22-64
Original file line numberDiff line numberDiff line change
@@ -142,22 +142,14 @@ DEPS_SONAME=(
142142
"libcublasLt.so.11"
143143
"libgomp.so.1"
144144
)
145-
elif [[ $CUDA_VERSION == "11.7" ]]; then
145+
elif [[ $CUDA_VERSION == "11.7" || $CUDA_VERSION == "11.8" ]]; then
146146
export USE_STATIC_CUDNN=0
147147
# Try parallelizing nvcc as well
148148
export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
149149
DEPS_LIST=(
150-
"/usr/local/cuda/lib64/libcudart.so.11.0"
151-
"/usr/local/cuda/lib64/libnvToolsExt.so.1"
152-
"/usr/local/cuda/lib64/libnvrtc.so.11.2" # this is not a mistake for 11.7, it links to 11.7.50
153-
"/usr/local/cuda/lib64/libnvrtc-builtins.so.11.7"
154150
"$LIBGOMP_PATH"
155151
)
156152
DEPS_SONAME=(
157-
"libcudart.so.11.0"
158-
"libnvToolsExt.so.1"
159-
"libnvrtc.so.11.2"
160-
"libnvrtc-builtins.so.11.7"
161153
"libgomp.so.1"
162154
)
163155

@@ -173,6 +165,10 @@ elif [[ $CUDA_VERSION == "11.7" ]]; then
173165
"/usr/local/cuda/lib64/libcudnn.so.8"
174166
"/usr/local/cuda/lib64/libcublas.so.11"
175167
"/usr/local/cuda/lib64/libcublasLt.so.11"
168+
"/usr/local/cuda/lib64/libnvrtc.so.11.2" # this is not a mistake for 11.7, it links to 11.7.50
169+
"/usr/local/cuda/lib64/libnvrtc-builtins.so.11.7"
170+
"/usr/local/cuda/lib64/libcudart.so.11.0"
171+
"/usr/local/cuda/lib64/libnvToolsExt.so.1"
176172
)
177173
DEPS_SONAME+=(
178174
"libcudnn_adv_infer.so.8"
@@ -186,69 +182,31 @@ elif [[ $CUDA_VERSION == "11.7" ]]; then
186182
"libcublasLt.so.11"
187183
)
188184
else
189-
echo "Using cudnn and cublas from pypi."
190-
CUDA_RPATHS=(
191-
'$ORIGIN/../../nvidia/cublas/lib'
192-
'$ORIGIN/../../nvidia/cudnn/lib'
193-
)
194-
CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
195-
export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
196-
export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
197-
export FORCE_RPATH="--force-rpath"
198-
fi
199-
elif [[ $CUDA_VERSION == "11.8" ]]; then
200-
export USE_STATIC_CUDNN=0
201-
# Try parallelizing nvcc as well
202-
export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
203-
DEPS_LIST=(
204-
"/usr/local/cuda/lib64/libcudart.so.11.0"
205-
"/usr/local/cuda/lib64/libnvToolsExt.so.1"
206-
"/usr/local/cuda/lib64/libnvrtc.so.11.2" # this is not a mistake for 11.8, it links to 11.8.89
207-
"/usr/local/cuda/lib64/libnvrtc-builtins.so.11.8"
208-
"$LIBGOMP_PATH"
209-
)
210-
DEPS_SONAME=(
211-
"libcudart.so.11.0"
212-
"libnvToolsExt.so.1"
213-
"libnvrtc.so.11.2"
214-
"libnvrtc-builtins.so.11.8"
215-
"libgomp.so.1"
216-
)
217-
218-
if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
219-
echo "Bundling with cudnn and cublas."
220-
DEPS_LIST+=(
221-
"/usr/local/cuda/lib64/libcudnn_adv_infer.so.8"
222-
"/usr/local/cuda/lib64/libcudnn_adv_train.so.8"
223-
"/usr/local/cuda/lib64/libcudnn_cnn_infer.so.8"
224-
"/usr/local/cuda/lib64/libcudnn_cnn_train.so.8"
225-
"/usr/local/cuda/lib64/libcudnn_ops_infer.so.8"
226-
"/usr/local/cuda/lib64/libcudnn_ops_train.so.8"
227-
"/usr/local/cuda/lib64/libcudnn.so.8"
228-
"/usr/local/cuda/lib64/libcublas.so.11"
229-
"/usr/local/cuda/lib64/libcublasLt.so.11"
230-
)
231-
DEPS_SONAME+=(
232-
"libcudnn_adv_infer.so.8"
233-
"libcudnn_adv_train.so.8"
234-
"libcudnn_cnn_infer.so.8"
235-
"libcudnn_cnn_train.so.8"
236-
"libcudnn_ops_infer.so.8"
237-
"libcudnn_ops_train.so.8"
238-
"libcudnn.so.8"
239-
"libcublas.so.11"
240-
"libcublasLt.so.11"
241-
)
242-
else
243-
echo "Using cudnn and cublas from pypi."
185+
echo "Using nvidia libs from pypi."
244186
CUDA_RPATHS=(
245187
'$ORIGIN/../../nvidia/cublas/lib'
188+
'$ORIGIN/../../nvidia/cuda_cupti/lib'
189+
'$ORIGIN/../../nvidia/cuda_nvrtc/lib'
190+
'$ORIGIN/../../nvidia/cuda_runtime/lib'
246191
'$ORIGIN/../../nvidia/cudnn/lib'
192+
'$ORIGIN/../../nvidia/cufft/lib'
193+
'$ORIGIN/../../nvidia/curand/lib'
194+
'$ORIGIN/../../nvidia/cusolver/lib'
195+
'$ORIGIN/../../nvidia/cusparse/lib'
196+
'$ORIGIN/../../nvidia/nccl/lib'
197+
'$ORIGIN/../../nvidia/nvtx/lib'
247198
)
248199
CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
249200
export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
250201
export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
251202
export FORCE_RPATH="--force-rpath"
203+
export USE_STATIC_NCCL=0
204+
export USE_SYSTEM_NCCL=1
205+
export ATEN_STATIC_CUDA=0
206+
export USE_CUDA_STATIC_LINK=0
207+
export USE_CUPTI_SO=1
208+
export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
209+
export NCCL_LIB_DIR="/usr/local/cuda/lib64/"
252210
fi
253211
else
254212
echo "Unknown cuda version $CUDA_VERSION"

release/pypi/prep_binary_for_pypi.sh

-14
Original file line numberDiff line numberDiff line change
@@ -56,22 +56,8 @@ for whl_file in "$@"; do
5656
if [[ $whl_file == *"with.pypi.cudnn"* ]]; then
5757
rm -rf "${whl_dir}/caffe2"
5858
rm -rf "${whl_dir}"/torch/lib/libnvrtc*
59-
sed -i -e "s/Requires-Dist: nvidia-cuda-runtime-cu11/Requires-Dist: nvidia-cuda-runtime-cu11 (==11.7.99)/" "${whl_dir}"/*/METADATA
60-
sed -i -e "/^Requires-Dist: nvidia-cublas-cu11 (==11.10.3.66).*/a Requires-Dist: nvidia-cuda-nvrtc-cu11 (==11.7.99) ; platform_system == \"Linux\"" "${whl_dir}"/*/METADATA
6159

6260
sed -i -e "s/-with-pypi-cudnn//g" "${whl_dir}/torch/version.py"
63-
find "${whl_dir}/torch/" -maxdepth 1 -type f -name "*.so*" | while read sofile; do
64-
patchelf --set-rpath '$ORIGIN/../../nvidia/cublas/lib:$ORIGIN/../../nvidia/cudnn/lib:$ORIGIN/../../nvidia/cuda_nvrtc/lib:$ORIGIN:$ORIGIN/lib' \
65-
--force-rpath $sofile
66-
patchelf --print-rpath $sofile
67-
done
68-
69-
find "${whl_dir}/torch/lib" -maxdepth 1 -type f -name "*.so*" | while read sofile; do
70-
patchelf --set-rpath '$ORIGIN/../../nvidia/cublas/lib:$ORIGIN/../../nvidia/cudnn/lib:$ORIGIN/../../nvidia/cuda_nvrtc/lib:$ORIGIN' \
71-
--force-rpath $sofile
72-
patchelf --print-rpath $sofile
73-
done
74-
patchelf --replace-needed libnvrtc-d833c4f3.so.11.2 libnvrtc.so.11.2 "${whl_dir}/torch/lib/libcaffe2_nvrtc.so"
7561
fi
7662

7763
find "${dist_info_folder}" -type f -exec sed -i "s!${version_with_suffix}!${version_no_suffix}!" {} \;

0 commit comments

Comments
 (0)