
Update torchvision requirement from <0.10.0,>=0.8.1 to >=0.8.1,<0.11.0 #5266

Merged: 16 commits, Jun 17, 2021
.github/workflows/ci.yml (28 changes: 25 additions & 3 deletions)
@@ -98,7 +98,11 @@ jobs:
# Required to use the setup-python action.
AGENT_TOOLSDIRECTORY: '/opt/hostedtoolcache'
# Our self-hosted runner is currently compatible with CUDA 11.*.
-TORCH_VERSION: 'torch==1.8.1+cu111 torchvision==0.9.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html'
+TORCH_VERSION: 'torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html'
+# Disable tokenizers parallelism because this doesn't help.
+TOKENIZERS_PARALLELISM: 'false'
+# Disable multithreading with OMP because this can lead to dead-locks in distributed training.
+OMP_NUM_THREADS: '1'

steps:
- uses: actions/checkout@v2
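
Side note on the two env vars added above: they apply equally when reproducing these jobs outside CI. A minimal Python sketch of the same configuration (illustrative only, not part of this PR):

    import os

    # Set these before the libraries spin up their thread pools.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"  # tokenizers' parallelism doesn't help here
    os.environ["OMP_NUM_THREADS"] = "1"  # single-threaded OMP avoids distributed dead-locks

    import torch  # noqa: E402  -- imported after the env vars so they take effect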
@@ -149,9 +153,15 @@ jobs:
cpu_tests:
name: CPU Tests
runs-on: ubuntu-latest
+timeout-minutes: 20
strategy:
matrix:
python: ['3.7', '3.8']
+env:
+  # Disable tokenizers parallelism because this doesn't help.
+  TOKENIZERS_PARALLELISM: 'false'
+  # Disable multithreading with OMP because this can lead to dead-locks in distributed training.
+  OMP_NUM_THREADS: '1'

steps:
- uses: actions/checkout@v2
@@ -197,7 +207,13 @@ jobs:

model_tests:
name: Model Tests
+timeout-minutes: 18
runs-on: ubuntu-latest
+env:
+  # Disable tokenizers parallelism because this doesn't help.
+  TOKENIZERS_PARALLELISM: 'false'
+  # Disable multithreading with OMP because this can lead to dead-locks in distributed training.
+  OMP_NUM_THREADS: '1'

steps:
- uses: actions/checkout@v2
@@ -249,6 +265,7 @@ jobs:

upload_coverage:
name: Upload Coverage Report
+timeout-minutes: 18
if: github.repository == 'allenai/allennlp' && (github.event_name == 'push' || github.event_name == 'pull_request')
runs-on: ubuntu-latest
needs: [cpu_tests, gpu_tests, model_tests]
@@ -288,6 +305,7 @@ jobs:
# Builds package distribution files for PyPI.
build_package:
name: Build Package
+timeout-minutes: 18
runs-on: ubuntu-latest

steps:
@@ -354,6 +372,7 @@ jobs:
# Tests installing from the distribution files.
test_package:
name: Test Package
+timeout-minutes: 18
needs: [build_package] # needs the package artifact created from 'build_package' job.
runs-on: ubuntu-latest
strategy:
@@ -391,6 +410,7 @@ jobs:
# Builds Docker image from the core distribution files and uploads to Docker Hub.
docker:
name: Docker (CUDA ${{ matrix.cuda }})
+timeout-minutes: 18
if: github.repository == 'allenai/allennlp'
# Run on self-hosted to utilize layer caching.
runs-on: [self-hosted]
@@ -409,9 +429,9 @@
if [[ $CUDA == '10.1' ]]; then
echo "DOCKER_TORCH_VERSION='torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html'" >> $GITHUB_ENV;
elif [[ $CUDA == '10.2' ]]; then
echo "DOCKER_TORCH_VERSION='torch==1.8.0'" >> $GITHUB_ENV;
echo "DOCKER_TORCH_VERSION='torch==1.9.0 torchvision==0.10.0'" >> $GITHUB_ENV;
elif [[ $CUDA == '11.1' ]]; then
echo "DOCKER_TORCH_VERSION='torch==1.8.0+cu111 torchvision==0.9.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html'" >> $GITHUB_ENV;
echo "DOCKER_TORCH_VERSION='torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html'" >> $GITHUB_ENV;
else
echo "Unhandled CUDA version $CUDA";
exit 1;
@@ -463,6 +483,7 @@ jobs:
# allennlp-docs repo.
docs:
name: Docs
+timeout-minutes: 18
# Don't run for forks.
if: github.repository == 'allenai/allennlp'
runs-on: ubuntu-latest
@@ -599,6 +620,7 @@ jobs:
# Publish the core distribution files to PyPI.
publish:
name: PyPI
+timeout-minutes: 18
needs: [style, lint, cpu_tests, gpu_tests, model_tests, build_package, test_package, docker, docs]
# Only publish to PyPI on releases and nightly builds to "allenai/allennlp" (not forks).
if: github.repository == 'allenai/allennlp' && (github.event_name == 'release' || github.event_name == 'schedule')
Makefile (4 changes: 2 additions & 2 deletions)
@@ -10,12 +10,12 @@ MD_DOCS_CONF_SRC = mkdocs-skeleton.yml
MD_DOCS_TGT = site/
MD_DOCS_EXTRAS = $(addprefix $(MD_DOCS_ROOT),README.md CHANGELOG.md CONTRIBUTING.md)

-TORCH_VERSION = torch==1.8.1 torchvision==0.9.1
+TORCH_VERSION = torch==1.9.0 torchvision==0.10.0

DOCKER_TAG = latest
DOCKER_IMAGE_NAME = allennlp/allennlp:$(DOCKER_TAG)
DOCKER_TEST_IMAGE_NAME = allennlp/test:$(DOCKER_TAG)
-DOCKER_TORCH_VERSION = 'torch==1.7.1 torchvision==0.8.2'
+DOCKER_TORCH_VERSION = 'torch==1.9.0 torchvision==0.10.0'
# Our self-hosted runner currently has CUDA 11.0.
DOCKER_TEST_TORCH_VERSION = 'torch==1.7.1+cu110 torchvision==0.8.2+cu110 -f https://download.pytorch.org/whl/torch_stable.html'
DOCKER_RUN_CMD = docker run --rm \
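
Note on the paired bump: each torchvision release is built against a specific torch release (0.10.x against 1.9.x), so the two pins must move together; mismatched pairs typically fail at import time or when torchvision registers its C++ ops. A quick sanity check, as a sketch:

    import torch
    import torchvision

    # If these import together cleanly, the installed pair is compatible;
    # torch 1.9.0 is expected alongside torchvision 0.10.0.
    print(torch.__version__, torchvision.__version__)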
allennlp/common/testing/distributed_test.py (6 changes: 3 additions & 3 deletions)
@@ -42,7 +42,8 @@ def init_process(

func(global_rank, world_size, gpu_id, *(func_args or []), **(func_kwargs or {}))

-dist.barrier()
+# dist.barrier()
+dist.destroy_process_group()


def run_distributed_test(
@@ -63,8 +64,7 @@ def run_distributed_test(
`func` needs to be global for spawning the processes, so that it can be pickled.

start_method: `Optional[str]`, optional (default = `None`)
-The start method to use for starting the workers. Defaults to "spawn" for GPU
-processes and fork otherwise.
+The start method to use for starting the workers. Defaults to "spawn".
"""
device_ids = device_ids or [-1, -1]
check_for_gpu(device_ids)
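
For context on the barrier change above: each spawned worker now tears down its process group explicitly instead of blocking on a final dist.barrier(), which can hang if a peer exits early. A standalone sketch of that lifecycle (illustrative, CPU-only gloo backend; not the PR's code):

    import torch
    import torch.distributed as dist
    import torch.multiprocessing as mp


    def _worker(rank: int, world_size: int) -> None:
        dist.init_process_group(
            backend="gloo",  # CPU-friendly backend, fine for a sketch
            init_method="tcp://127.0.0.1:29500",
            world_size=world_size,
            rank=rank,
        )
        t = torch.tensor([float(rank)])
        dist.all_reduce(t)  # stand-in for the test body's collective work
        # Explicit teardown releases this worker's resources even if a peer
        # has already gone away; a trailing barrier would block instead.
        dist.destroy_process_group()


    if __name__ == "__main__":
        mp.spawn(_worker, args=(2,), nprocs=2)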
setup.py (2 changes: 1 addition & 1 deletion)
@@ -51,7 +51,7 @@
),
install_requires=[
"torch>=1.6.0,<1.10.0",
"torchvision>=0.8.1,<0.10.0",
"torchvision>=0.8.1,<0.11.0",
"jsonnet>=0.10.0 ; sys.platform != 'win32'",
"overrides==3.1.0",
"nltk",
@@ -2,7 +2,7 @@
import pytest
import torch

-from allennlp.common import Params
+from allennlp.common import Params, cached_transformers
from allennlp.common.testing import AllenNlpTestCase, requires_gpu
from allennlp.data import Vocabulary
from allennlp.data.batch import Batch
@@ -15,6 +15,10 @@


class TestPretrainedTransformerEmbedder(AllenNlpTestCase):
+    def teardown_method(self):
+        super().teardown_method()
+        cached_transformers._clear_caches()
+
@requires_gpu
def test_forward_runs_when_initialized_from_params(self):
# This code just passes things off to `transformers`, so we only have a very simple
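
The teardown_method added above (and the similar hooks in the next two test files) clears the module-level cache in allennlp.common.cached_transformers after each test, so loaded HuggingFace models don't accumulate in memory across the suite. An equivalent pytest spelling would be an autouse fixture (a sketch assuming the same private helper; the PR itself uses teardown_method/teardown_function):

    import pytest

    from allennlp.common import cached_transformers


    @pytest.fixture(autouse=True)
    def clear_transformer_caches():
        yield  # run the test first
        # Drop cached models/tokenizers so memory stays bounded across tests.
        cached_transformers._clear_caches()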
@@ -1,7 +1,7 @@
import pytest
import torch

-from allennlp.common import Params
+from allennlp.common import Params, cached_transformers
from allennlp.common.checks import ConfigurationError
from allennlp.data import Token, Vocabulary
from allennlp.data.batch import Batch
@@ -14,6 +14,10 @@


class TestPretrainedTransformerMismatchedEmbedder(AllenNlpTestCase):
+    def teardown_method(self):
+        super().teardown_method()
+        cached_transformers._clear_caches()
+
@pytest.mark.parametrize("train_parameters", [True, False])
def test_end_to_end(self, train_parameters: bool):
token_indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased")
tests/modules/transformer/transformer_layer_test.py (4 changes: 4 additions & 0 deletions)
@@ -17,6 +17,10 @@
)


+def teardown_function(function):
+    cached_transformers._clear_caches()
+
+
ATTENTION_PARAMS_DICT = {
"hidden_size": 6,
"num_attention_heads": 2,