This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Use our own base images to build allennlp Docker images #5366

Merged 3 commits on Aug 19, 2021
51 changes: 22 additions & 29 deletions .github/workflows/ci.yml
@@ -17,9 +17,10 @@ on:
- cron: '37 11 * * 1,2,3,4,5' # early morning (11:37 UTC / 4:37 AM PDT) Monday - Friday

env:
# NOTE: Need to update `TORCH_VERSION` and `TORCH_VISION_VERSION` for new torch releases.
# NOTE: Need to update `TORCH_VERSION`, `TORCH_CPU_INSTALL` and `TORCH_GPU_INSTALL` for new torch releases.
TORCH_VERSION: 1.9.0
TORCH_VISION_VERSION: 0.10.0
TORCH_CPU_INSTALL: pip install torch==1.9.0+cpu torchvision==0.10.0+cpu torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
TORCH_GPU_INSTALL: pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html # Our self-hosted GPU runners currently support CUDA 11.*
# Change this to invalidate existing cache.
CACHE_PREFIX: v0
# Disable tokenizers parallelism because this doesn't help, and can cause issues in distributed tests.
@@ -94,9 +95,9 @@ jobs:
run: make test

- name: GPU Tests
runs_on: [self-hosted, GPU]
runs_on: [self-hosted, GPU, Multi GPU]
coverage_report: true
torch_platform: cu111 # Our self-hosted GPU runners currently support CUDA 11.*
torch_platform: gpu
run: make gpu-tests

- name: Model Tests
@@ -132,6 +133,16 @@ jobs:
# Use week number in cache key so we can refresh the cache weekly.
echo "WEEK_NUMBER=$(date +%V)" >> $GITHUB_ENV

- name: Set build variables (CPU only)
if: matrix.task.torch_platform == 'cpu'
run: |
echo "TORCH_INSTALL=$TORCH_CPU_INSTALL" >> $GITHUB_ENV

- name: Set build variables (GPU only)
if: matrix.task.torch_platform == 'gpu'
run: |
echo "TORCH_INSTALL=$TORCH_GPU_INSTALL" >> $GITHUB_ENV

- uses: actions/cache@v2
id: virtualenv-cache
with:
@@ -145,7 +156,7 @@ jobs:
run: |
test -d .venv || virtualenv -p $(which python) --copies --reset-app-data .venv
. .venv/bin/activate
make install TORCH_VERSION="torch==${TORCH_VERSION}+${{ matrix.task.torch_platform }} torchvision==${TORCH_VISION_VERSION}+${{ matrix.task.torch_platform }} -f https://download.pytorch.org/whl/torch_stable.html"
make install TORCH_INSTALL="$TORCH_INSTALL"

- name: Setup virtual environment (cache hit)
if: steps.virtualenv-cache.outputs.cache-hit == 'true'
@@ -280,7 +291,7 @@ jobs:
run: |
test -d .venv || virtualenv -p $(which python) --copies --reset-app-data .venv
. .venv/bin/activate
make install TORCH_VERSION="torch==${TORCH_VERSION}+cpu torchvision==${TORCH_VISION_VERSION}+cpu -f https://download.pytorch.org/whl/torch_stable.html"
make install TORCH_INSTALL="$TORCH_CPU_INSTALL"

- name: Setup virtual environment (cache hit)
if: steps.virtualenv-cache.outputs.cache-hit == 'true'
@@ -385,37 +396,19 @@ jobs:
name: Docker (CUDA ${{ matrix.cuda }})
timeout-minutes: 18
if: github.repository == 'allenai/allennlp'
# Run on self-hosted to utilize layer caching.
runs-on: [self-hosted, Docker-enabled]
runs-on: ubuntu-latest
strategy:
matrix:
cuda: ['10.1', '10.2', '11.1']
cuda: ['10.2', '11.1']

steps:
- uses: actions/checkout@v2

- name: Set torch version
env:
CUDA: ${{ matrix.cuda }}
run: |
# Check the install instructions on https://pytorch.org/ to keep these up-to-date.
if [[ $CUDA == '10.1' ]]; then
# NOTE: We need to use an older version of torch to support CUDA 10.1
echo "DOCKER_TORCH_VERSION='torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html'" >> $GITHUB_ENV;
elif [[ $CUDA == '10.2' ]]; then
# NOTE: 10.2 is still the default but that could change in the next release.
echo "DOCKER_TORCH_VERSION='torch==${{ env.TORCH_VERSION }} torchvision==${{ env.TORCH_VISION_VERSION }}'" >> $GITHUB_ENV;
elif [[ $CUDA == '11.1' ]]; then
echo "DOCKER_TORCH_VERSION='torch==${{ env.TORCH_VERSION }}+cu111 torchvision==${{ env.TORCH_VISION_VERSION }}+cu111 -f https://download.pytorch.org/whl/torch_stable.html'" >> $GITHUB_ENV;
else
echo "Unhandled CUDA version $CUDA";
exit 1;
fi

- name: Set image name
- name: Set image name and torch version
env:
CUDA: ${{ matrix.cuda }}
run: |
echo "DOCKER_TORCH_VERSION=${TORCH_VERSION}-cuda${CUDA}" >> $GITHUB_ENV;
if [[ $GITHUB_EVENT_NAME == 'release' ]]; then
echo "DOCKER_IMAGE_NAME=allennlp/allennlp:${GITHUB_REF#refs/tags/}-cuda${CUDA}" >> $GITHUB_ENV;
else
@@ -503,7 +496,7 @@ jobs:
run: |
test -d .venv || virtualenv -p $(which python) --copies --reset-app-data .venv
. .venv/bin/activate
make install TORCH_VERSION="torch==${TORCH_VERSION}+cpu torchvision==${TORCH_VISION_VERSION}+cpu -f https://download.pytorch.org/whl/torch_stable.html"
make install TORCH_INSTALL="$TORCH_CPU_INSTALL"

- name: Setup virtual environment (cache hit)
if: steps.virtualenv-cache.outputs.cache-hit == 'true'
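Taken together, the workflow changes above swap a per-platform torch version string for a full install command that gets selected per runner type and passed straight through to `make install`. A condensed sketch of what the selected command looks like on each runner type (the pip commands are copied from the env block in this diff; the condensed form itself is illustrative, not part of the workflow):

```bash
# CPU runners (matrix.task.torch_platform == 'cpu')
TORCH_INSTALL="pip install torch==1.9.0+cpu torchvision==0.10.0+cpu torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html"
make install TORCH_INSTALL="$TORCH_INSTALL"

# GPU runners (matrix.task.torch_platform == 'gpu'), which currently support CUDA 11.*
TORCH_INSTALL="pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html"
make install TORCH_INSTALL="$TORCH_INSTALL"
```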
25 changes: 5 additions & 20 deletions Dockerfile
@@ -1,28 +1,13 @@
# This Dockerfile creates an environment suitable for downstream usage of AllenNLP.
# It's built from a wheel installation of allennlp.
# It's built from a wheel installation of allennlp using the base images from
# https://github.com/allenai/docker-images/pkgs/container/pytorch

FROM python:3.8

ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8

ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64

# Tell nvidia-docker the driver spec that we need as well as to
# use all available devices, which are mounted at /usr/local/nvidia.
# The LABEL supports an older version of nvidia-docker, the env
# variables a newer one.
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
LABEL com.nvidia.volumes.needed="nvidia_driver"
ARG TORCH=1.9.0-cuda10.2
ARG PYTHON=3.9
FROM ghcr.io/allenai/pytorch:${TORCH}-python${PYTHON}-v0.0.1

WORKDIR /stage/allennlp

# Install torch ecosystem first. This build arg should be in the form of a version requirement,
# like 'torch==1.7' or 'torch==1.7+cu102 -f https://download.pytorch.org/whl/torch_stable.html'.
ARG TORCH
RUN pip install --no-cache-dir ${TORCH}

# Installing AllenNLP's dependencies is the most time-consuming part of building
# this Docker image, so we make use of layer caching here by adding the minimal files
# necessary to install the dependencies.
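Because the `TORCH` and `PYTHON` build args are declared before `FROM`, they only parameterize the base-image tag; they are not visible to later instructions unless re-declared after `FROM`. A minimal sketch of building a non-default variant (the tag values are examples; valid combinations come from the allenai/docker-images registry linked above):

```bash
# Build against the CUDA 11.1 / Python 3.9 base image instead of the defaults
docker build \
    -f Dockerfile \
    --build-arg TORCH=1.9.0-cuda11.1 \
    --build-arg PYTHON=3.9 \
    -t allennlp/allennlp:cuda11.1 .
```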
22 changes: 3 additions & 19 deletions Dockerfile.test
@@ -1,19 +1,8 @@
# Used to build an image for running tests.

FROM python:3.8

ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8

ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64

# Tell nvidia-docker the driver spec that we need as well as to
# use all available devices, which are mounted at /usr/local/nvidia.
# The LABEL supports an older version of nvidia-docker, the env
# variables a newer one.
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
LABEL com.nvidia.volumes.needed="nvidia_driver"
ARG TORCH=1.9.0-cuda10.2
ARG PYTHON=3.9
FROM ghcr.io/allenai/pytorch:${TORCH}-python${PYTHON}-v0.0.1

# These environment variables are helpful for debugging.
# See https://pytorch.org/docs/stable/distributed.html#common-environment-variables for more info.
@@ -22,11 +11,6 @@ ENV NCCL_DEBUG_SUBSYS ALL

WORKDIR /stage/allennlp

# Install torch ecosystem first. This build arg should be in the form of a version requirement,
# like 'torch==1.7' or 'torch==1.7+cu102 -f https://download.pytorch.org/whl/torch_stable.html'.
ARG TORCH
RUN pip install --no-cache-dir ${TORCH}

# Installing AllenNLP's dependencies is the most time-consuming part of building
# this Docker image, so we make use of layer caching here by adding the minimal files
# necessary to install the dependencies.
10 changes: 6 additions & 4 deletions Makefile
@@ -10,13 +10,14 @@ MD_DOCS_CONF_SRC = mkdocs-skeleton.yml
MD_DOCS_TGT = site/
MD_DOCS_EXTRAS = $(addprefix $(MD_DOCS_ROOT),README.md CHANGELOG.md CONTRIBUTING.md)

TORCH_VERSION = torch==1.9.0 torchvision==0.10.0
TORCH_INSTALL = pip install torch torchvision
DOCKER_TORCH_VERSION = 1.9.0-cuda10.2
DOCKER_TEST_TORCH_VERSION = 1.9.0-cuda10.2
DOCKER_PYTHON_VERSION = 3.9

DOCKER_TAG = latest
DOCKER_IMAGE_NAME = allennlp/allennlp:$(DOCKER_TAG)
DOCKER_TEST_IMAGE_NAME = allennlp/test:$(DOCKER_TAG)
DOCKER_TORCH_VERSION = $(TORCH_VERSION)
DOCKER_TEST_TORCH_VERSION = $(TORCH_VERSION)
DOCKER_RUN_CMD = docker run --rm \
-v $$HOME/.allennlp:/root/.allennlp \
-v $$HOME/.cache/huggingface:/root/.cache/huggingface \
@@ -96,7 +97,7 @@ install :
# See https://github.com/pypa/pip/issues/4537.
# python setup.py install_egg_info
# Install torch ecosystem first.
pip install $(TORCH_VERSION)
$(TORCH_INSTALL)
pip install --upgrade --upgrade-strategy eager -e . -r dev-requirements.txt
# These nltk packages are used by the 'checklist' module.
$(NLTK_DOWNLOAD_CMD)
@@ -158,6 +159,7 @@ docker-image :
--pull \
-f Dockerfile \
--build-arg TORCH=$(DOCKER_TORCH_VERSION) \
--build-arg PYTHON=$(DOCKER_PYTHON_VERSION) \
-t $(DOCKER_IMAGE_NAME) .

DOCKER_GPUS = --gpus all
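With this change, `make install` no longer builds a pip requirement string from `TORCH_VERSION`; it simply runs whatever command `TORCH_INSTALL` holds. A hedged example of overriding it from the command line (the exact wheel spec is an assumption; any valid pip command works):

```bash
# Install CPU-only torch wheels before installing allennlp itself
make install TORCH_INSTALL="pip install torch==1.9.0+cpu torchvision==0.10.0+cpu -f https://download.pytorch.org/whl/torch_stable.html"
```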
10 changes: 5 additions & 5 deletions README.md
@@ -259,12 +259,12 @@ For various reasons you may need to create your own AllenNLP Docker image, such
of PyTorch. To do so, just run `make docker-image` from the root of your local clone of AllenNLP.

By default this builds an image with the tag `allennlp/allennlp`, but you can change this to anything you want
by setting the `DOCKER_TAG` flag when you call `make`. For example,
`make docker-image DOCKER_TAG=my-allennlp`.
by setting the `DOCKER_IMAGE_NAME` flag when you call `make`. For example,
`make docker-image DOCKER_IMAGE_NAME=my-allennlp`.

If you want to use a different version of PyTorch, set the flag `DOCKER_TORCH_VERSION` to something like
`torch==1.7.0` or `torch==1.7.0+cu110 -f https://download.pytorch.org/whl/torch_stable.html`.
The value of this flag will passed directly to `pip install`.
If you want to use a different version of Python or PyTorch, set the flags `DOCKER_PYTHON_VERSION` and `DOCKER_TORCH_VERSION` to something like
`3.9` and `1.9.0-cuda10.2`, respectively. These flags together determine the base image that is used. You can see the list of valid
combinations in this GitHub Container Registry: [github.com/allenai/docker-images/pkgs/container/pytorch](https://github.com/allenai/docker-images/pkgs/container/pytorch).
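For example, a hypothetical invocation (check the registry for tags that actually exist):

```bash
make docker-image DOCKER_TORCH_VERSION=1.9.0-cuda11.1 DOCKER_PYTHON_VERSION=3.9
```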

After building the image you should be able to see it listed by running `docker images allennlp`.
