diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 31c47a49445..bbae1abb226 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -98,7 +98,11 @@ jobs: # Required to use the setup-python action. AGENT_TOOLSDIRECTORY: '/opt/hostedtoolcache' # Our self-hosted runner currently is currently compatible with CUDA 11.*. - TORCH_VERSION: 'torch==1.8.1+cu111 torchvision==0.9.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html' + TORCH_VERSION: 'torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html' + # Disable tokenizers parallelism because this doesn't help. + TOKENIZERS_PARALLELISM: 'false' + # Disable multithreading with OMP because this can lead to dead-locks in distributed training. + OMP_NUM_THREADS: '1' steps: - uses: actions/checkout@v2 @@ -149,9 +153,15 @@ jobs: cpu_tests: name: CPU Tests runs-on: ubuntu-latest + timeout-minutes: 20 strategy: matrix: python: ['3.7', '3.8'] + env: + # Disable tokenizers parallelism because this doesn't help. + TOKENIZERS_PARALLELISM: 'false' + # Disable multithreading with OMP because this can lead to dead-locks in distributed training. + OMP_NUM_THREADS: '1' steps: - uses: actions/checkout@v2 @@ -197,7 +207,13 @@ jobs: model_tests: name: Model Tests + timeout-minutes: 18 runs-on: ubuntu-latest + env: + # Disable tokenizers parallelism because this doesn't help. + TOKENIZERS_PARALLELISM: 'false' + # Disable multithreading with OMP because this can lead to dead-locks in distributed training. + OMP_NUM_THREADS: '1' steps: - uses: actions/checkout@v2 @@ -249,6 +265,7 @@ jobs: upload_coverage: name: Upload Coverage Report + timeout-minutes: 18 if: github.repository == 'allenai/allennlp' && (github.event_name == 'push' || github.event_name == 'pull_request') runs-on: ubuntu-latest needs: [cpu_tests, gpu_tests, model_tests] @@ -288,6 +305,7 @@ jobs: # Builds package distribution files for PyPI. build_package: name: Build Package + timeout-minutes: 18 runs-on: ubuntu-latest steps: @@ -354,6 +372,7 @@ jobs: # Tests installing from the distribution files. test_package: name: Test Package + timeout-minutes: 18 needs: [build_package] # needs the package artifact created from 'build_package' job. runs-on: ubuntu-latest strategy: @@ -391,6 +410,7 @@ jobs: # Builds Docker image from the core distribution files and uploads to Docker Hub. docker: name: Docker (CUDA ${{ matrix.cuda }}) + timeout-minutes: 18 if: github.repository == 'allenai/allennlp' # Run on self-hosted to utilize layer caching. runs-on: [self-hosted] @@ -409,9 +429,9 @@ jobs: if [[ $CUDA == '10.1' ]]; then echo "DOCKER_TORCH_VERSION='torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html'" >> $GITHUB_ENV; elif [[ $CUDA == '10.2' ]]; then - echo "DOCKER_TORCH_VERSION='torch==1.8.0'" >> $GITHUB_ENV; + echo "DOCKER_TORCH_VERSION='torch==1.9.0 torchvision==0.10.0'" >> $GITHUB_ENV; elif [[ $CUDA == '11.1' ]]; then - echo "DOCKER_TORCH_VERSION='torch==1.8.0+cu111 torchvision==0.9.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html'" >> $GITHUB_ENV; + echo "DOCKER_TORCH_VERSION='torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html'" >> $GITHUB_ENV; else echo "Unhandled CUDA version $CUDA"; exit 1; @@ -463,6 +483,7 @@ jobs: # allennlp-docs repo. docs: name: Docs + timeout-minutes: 18 # Don't run for forks. if: github.repository == 'allenai/allennlp' runs-on: ubuntu-latest @@ -599,6 +620,7 @@ jobs: # Publish the core distribution files to PyPI. publish: name: PyPI + timeout-minutes: 18 needs: [style, lint, cpu_tests, gpu_tests, model_tests, build_package, test_package, docker, docs] # Only publish to PyPI on releases and nightly builds to "allenai/allennlp" (not forks). if: github.repository == 'allenai/allennlp' && (github.event_name == 'release' || github.event_name == 'schedule') diff --git a/Makefile b/Makefile index 4fea30f33fb..22f455e21b3 100644 --- a/Makefile +++ b/Makefile @@ -10,14 +10,13 @@ MD_DOCS_CONF_SRC = mkdocs-skeleton.yml MD_DOCS_TGT = site/ MD_DOCS_EXTRAS = $(addprefix $(MD_DOCS_ROOT),README.md CHANGELOG.md CONTRIBUTING.md) -TORCH_VERSION = torch==1.8.1 torchvision==0.9.1 +TORCH_VERSION = torch==1.9.0 torchvision==0.10.0 DOCKER_TAG = latest DOCKER_IMAGE_NAME = allennlp/allennlp:$(DOCKER_TAG) DOCKER_TEST_IMAGE_NAME = allennlp/test:$(DOCKER_TAG) -DOCKER_TORCH_VERSION = 'torch==1.7.1 torchvision==0.8.2' -# Our self-hosted runner currently has CUDA 11.0. -DOCKER_TEST_TORCH_VERSION = 'torch==1.7.1+cu110 torchvision==0.8.2+cu110 -f https://download.pytorch.org/whl/torch_stable.html' +DOCKER_TORCH_VERSION = $(TORCH_VERSION) +DOCKER_TEST_TORCH_VERSION = $(TORCH_VERSION) DOCKER_RUN_CMD = docker run --rm \ -v $$HOME/.allennlp:/root/.allennlp \ -v $$HOME/.cache/huggingface:/root/.cache/huggingface \ diff --git a/allennlp/common/testing/distributed_test.py b/allennlp/common/testing/distributed_test.py index 2fae00ff635..72b2ea66f1d 100644 --- a/allennlp/common/testing/distributed_test.py +++ b/allennlp/common/testing/distributed_test.py @@ -42,7 +42,8 @@ def init_process( func(global_rank, world_size, gpu_id, *(func_args or []), **(func_kwargs or {})) - dist.barrier() + # dist.barrier() + dist.destroy_process_group() def run_distributed_test( diff --git a/setup.py b/setup.py index 3d5f6918c82..fc5b2a2a776 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ ), install_requires=[ "torch>=1.6.0,<1.10.0", - "torchvision>=0.8.1,<0.10.0", + "torchvision>=0.8.1,<0.11.0", "jsonnet>=0.10.0 ; sys.platform != 'win32'", "overrides==3.1.0", "nltk", diff --git a/tests/modules/token_embedders/pretrained_transformer_embedder_test.py b/tests/modules/token_embedders/pretrained_transformer_embedder_test.py index a3343c07700..6cd14a1cf9e 100644 --- a/tests/modules/token_embedders/pretrained_transformer_embedder_test.py +++ b/tests/modules/token_embedders/pretrained_transformer_embedder_test.py @@ -2,7 +2,7 @@ import pytest import torch -from allennlp.common import Params +from allennlp.common import Params, cached_transformers from allennlp.common.testing import AllenNlpTestCase, requires_gpu from allennlp.data import Vocabulary from allennlp.data.batch import Batch @@ -15,6 +15,10 @@ class TestPretrainedTransformerEmbedder(AllenNlpTestCase): + @classmethod + def teardown_class(cls): + cached_transformers._clear_caches() + @requires_gpu def test_forward_runs_when_initialized_from_params(self): # This code just passes things off to `transformers`, so we only have a very simple diff --git a/tests/modules/token_embedders/pretrained_transformer_mismatched_embedder_test.py b/tests/modules/token_embedders/pretrained_transformer_mismatched_embedder_test.py index b5a4aae472a..b2af3ae4b3b 100644 --- a/tests/modules/token_embedders/pretrained_transformer_mismatched_embedder_test.py +++ b/tests/modules/token_embedders/pretrained_transformer_mismatched_embedder_test.py @@ -1,7 +1,7 @@ import pytest import torch -from allennlp.common import Params +from allennlp.common import Params, cached_transformers from allennlp.common.checks import ConfigurationError from allennlp.data import Token, Vocabulary from allennlp.data.batch import Batch @@ -14,6 +14,10 @@ class TestPretrainedTransformerMismatchedEmbedder(AllenNlpTestCase): + @classmethod + def teardown_class(cls): + cached_transformers._clear_caches() + @pytest.mark.parametrize("train_parameters", [True, False]) def test_end_to_end(self, train_parameters: bool): token_indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased") diff --git a/tests/modules/transformer/transformer_layer_test.py b/tests/modules/transformer/transformer_layer_test.py index 538df89c9e0..8cb624617ba 100644 --- a/tests/modules/transformer/transformer_layer_test.py +++ b/tests/modules/transformer/transformer_layer_test.py @@ -17,6 +17,10 @@ ) +def teardown_module(function): + cached_transformers._clear_caches() + + ATTENTION_PARAMS_DICT = { "hidden_size": 6, "num_attention_heads": 2,