diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 31c47a49445..bbae1abb226 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -98,7 +98,11 @@ jobs:
       # Required to use the setup-python action.
       AGENT_TOOLSDIRECTORY: '/opt/hostedtoolcache'
       # Our self-hosted runner currently is currently compatible with CUDA 11.*.
-      TORCH_VERSION: 'torch==1.8.1+cu111 torchvision==0.9.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html'
+      TORCH_VERSION: 'torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html'
+      # Disable tokenizers parallelism because it doesn't help in CI.
+      TOKENIZERS_PARALLELISM: 'false'
+      # Disable OpenMP multithreading because it can lead to deadlocks in distributed training.
+      OMP_NUM_THREADS: '1'
 
     steps:
     - uses: actions/checkout@v2
@@ -149,9 +153,15 @@ jobs:
   cpu_tests:
     name: CPU Tests
     runs-on: ubuntu-latest
+    timeout-minutes: 20
     strategy:
       matrix:
         python: ['3.7', '3.8']
+    env:
+      # Disable tokenizers parallelism because it doesn't help in CI.
+      TOKENIZERS_PARALLELISM: 'false'
+      # Disable OpenMP multithreading because it can lead to deadlocks in distributed training.
+      OMP_NUM_THREADS: '1'
 
     steps:
     - uses: actions/checkout@v2
@@ -197,7 +207,13 @@ jobs:
 
   model_tests:
     name: Model Tests
+    timeout-minutes: 18
     runs-on: ubuntu-latest
+    env:
+      # Disable tokenizers parallelism because it doesn't help in CI.
+      TOKENIZERS_PARALLELISM: 'false'
+      # Disable OpenMP multithreading because it can lead to deadlocks in distributed training.
+      OMP_NUM_THREADS: '1'
 
     steps:
     - uses: actions/checkout@v2
@@ -249,6 +265,7 @@ jobs:
 
   upload_coverage:
     name: Upload Coverage Report
+    timeout-minutes: 18
     if: github.repository == 'allenai/allennlp' && (github.event_name == 'push' || github.event_name == 'pull_request')
     runs-on: ubuntu-latest
     needs: [cpu_tests, gpu_tests, model_tests]
@@ -288,6 +305,7 @@ jobs:
   # Builds package distribution files for PyPI.
   build_package:
     name: Build Package
+    timeout-minutes: 18
     runs-on: ubuntu-latest
 
     steps:
@@ -354,6 +372,7 @@ jobs:
   # Tests installing from the distribution files.
   test_package:
     name: Test Package
+    timeout-minutes: 18
     needs: [build_package]  # needs the package artifact created from 'build_package' job.
     runs-on: ubuntu-latest
     strategy:
@@ -391,6 +410,7 @@ jobs:
   # Builds Docker image from the core distribution files and uploads to Docker Hub.
   docker:
     name: Docker (CUDA ${{ matrix.cuda }})
+    timeout-minutes: 18
     if: github.repository == 'allenai/allennlp'
     # Run on self-hosted to utilize layer caching.
     runs-on: [self-hosted]
@@ -409,9 +429,9 @@ jobs:
         if [[ $CUDA == '10.1' ]]; then
             echo "DOCKER_TORCH_VERSION='torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html'" >> $GITHUB_ENV;
         elif [[ $CUDA == '10.2' ]]; then
-            echo "DOCKER_TORCH_VERSION='torch==1.8.0'" >> $GITHUB_ENV;
+            echo "DOCKER_TORCH_VERSION='torch==1.9.0 torchvision==0.10.0'" >> $GITHUB_ENV;
         elif [[ $CUDA == '11.1' ]]; then
-            echo "DOCKER_TORCH_VERSION='torch==1.8.0+cu111 torchvision==0.9.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html'" >> $GITHUB_ENV;
+            echo "DOCKER_TORCH_VERSION='torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html'" >> $GITHUB_ENV;
         else
             echo "Unhandled CUDA version $CUDA";
             exit 1;
@@ -463,6 +483,7 @@ jobs:
   # allennlp-docs repo.
   docs:
     name: Docs
+    timeout-minutes: 18
     # Don't run for forks.
     if: github.repository == 'allenai/allennlp'
     runs-on: ubuntu-latest
@@ -599,6 +620,7 @@ jobs:
   # Publish the core distribution files to PyPI.
   publish:
     name: PyPI
+    timeout-minutes: 18
     needs: [style, lint, cpu_tests, gpu_tests, model_tests, build_package, test_package, docker, docs]
     # Only publish to PyPI on releases and nightly builds to "allenai/allennlp" (not forks).
     if: github.repository == 'allenai/allennlp' && (github.event_name == 'release' || github.event_name == 'schedule')
diff --git a/Makefile b/Makefile
index 4fea30f33fb..22f455e21b3 100644
--- a/Makefile
+++ b/Makefile
@@ -10,14 +10,14 @@ MD_DOCS_CONF_SRC = mkdocs-skeleton.yml
 MD_DOCS_TGT = site/
 MD_DOCS_EXTRAS = $(addprefix $(MD_DOCS_ROOT),README.md CHANGELOG.md CONTRIBUTING.md)
 
-TORCH_VERSION = torch==1.8.1 torchvision==0.9.1
+TORCH_VERSION = torch==1.9.0 torchvision==0.10.0
 
 DOCKER_TAG = latest
 DOCKER_IMAGE_NAME = allennlp/allennlp:$(DOCKER_TAG)
 DOCKER_TEST_IMAGE_NAME = allennlp/test:$(DOCKER_TAG)
-DOCKER_TORCH_VERSION = 'torch==1.7.1 torchvision==0.8.2'
-# Our self-hosted runner currently has CUDA 11.0.
-DOCKER_TEST_TORCH_VERSION = 'torch==1.7.1+cu110 torchvision==0.8.2+cu110 -f https://download.pytorch.org/whl/torch_stable.html'
+# Single-quoted like the values they replace: TORCH_VERSION contains a space and must stay one shell word.
+DOCKER_TORCH_VERSION = '$(TORCH_VERSION)'
+DOCKER_TEST_TORCH_VERSION = '$(TORCH_VERSION)'
 DOCKER_RUN_CMD = docker run --rm \
 		-v $$HOME/.allennlp:/root/.allennlp \
 		-v $$HOME/.cache/huggingface:/root/.cache/huggingface \
diff --git a/allennlp/common/testing/distributed_test.py b/allennlp/common/testing/distributed_test.py
index 2fae00ff635..72b2ea66f1d 100644
--- a/allennlp/common/testing/distributed_test.py
+++ b/allennlp/common/testing/distributed_test.py
@@ -42,7 +42,8 @@ def init_process(
 
     func(global_rank, world_size, gpu_id, *(func_args or []), **(func_kwargs or {}))
 
-    dist.barrier()
+    # Replaces the former dist.barrier(): tear the process group down once func returns.
+    dist.destroy_process_group()
 
 
 def run_distributed_test(
diff --git a/setup.py b/setup.py
index 3d5f6918c82..fc5b2a2a776 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@
     ),
     install_requires=[
         "torch>=1.6.0,<1.10.0",
-        "torchvision>=0.8.1,<0.10.0",
+        "torchvision>=0.8.1,<0.11.0",
         "jsonnet>=0.10.0 ; sys.platform != 'win32'",
         "overrides==3.1.0",
         "nltk",
diff --git a/tests/modules/token_embedders/pretrained_transformer_embedder_test.py b/tests/modules/token_embedders/pretrained_transformer_embedder_test.py
index a3343c07700..6cd14a1cf9e 100644
--- a/tests/modules/token_embedders/pretrained_transformer_embedder_test.py
+++ b/tests/modules/token_embedders/pretrained_transformer_embedder_test.py
@@ -2,7 +2,7 @@
 import pytest
 import torch
 
-from allennlp.common import Params
+from allennlp.common import Params, cached_transformers
 from allennlp.common.testing import AllenNlpTestCase, requires_gpu
 from allennlp.data import Vocabulary
 from allennlp.data.batch import Batch
@@ -15,6 +15,10 @@
 
 
 class TestPretrainedTransformerEmbedder(AllenNlpTestCase):
+    @classmethod
+    def teardown_class(cls):  # pytest hook: runs once, after the class's last test
+        cached_transformers._clear_caches()  # presumably frees cached models - confirm
+
     @requires_gpu
     def test_forward_runs_when_initialized_from_params(self):
         # This code just passes things off to `transformers`, so we only have a very simple
diff --git a/tests/modules/token_embedders/pretrained_transformer_mismatched_embedder_test.py b/tests/modules/token_embedders/pretrained_transformer_mismatched_embedder_test.py
index b5a4aae472a..b2af3ae4b3b 100644
--- a/tests/modules/token_embedders/pretrained_transformer_mismatched_embedder_test.py
+++ b/tests/modules/token_embedders/pretrained_transformer_mismatched_embedder_test.py
@@ -1,7 +1,7 @@
 import pytest
 import torch
 
-from allennlp.common import Params
+from allennlp.common import Params, cached_transformers
 from allennlp.common.checks import ConfigurationError
 from allennlp.data import Token, Vocabulary
 from allennlp.data.batch import Batch
@@ -14,6 +14,10 @@
 
 
 class TestPretrainedTransformerMismatchedEmbedder(AllenNlpTestCase):
+    @classmethod
+    def teardown_class(cls):  # pytest hook: runs once, after the class's last test
+        cached_transformers._clear_caches()  # presumably frees cached models - confirm
+
     @pytest.mark.parametrize("train_parameters", [True, False])
     def test_end_to_end(self, train_parameters: bool):
         token_indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased")
diff --git a/tests/modules/transformer/transformer_layer_test.py b/tests/modules/transformer/transformer_layer_test.py
index 538df89c9e0..8cb624617ba 100644
--- a/tests/modules/transformer/transformer_layer_test.py
+++ b/tests/modules/transformer/transformer_layer_test.py
@@ -17,6 +17,10 @@
 )
 
 
+def teardown_module(function):
+    cached_transformers._clear_caches()
+
+
 ATTENTION_PARAMS_DICT = {
     "hidden_size": 6,
     "num_attention_heads": 2,