
Commit d7c06fe

mulhod and epwalsh authored
Expose from_pretrained keyword arguments (#4651)
* Add ability to pass through transformers cache-related kwargs such as cache_dir and local_files_only
* Add a couple tests for cached_transformers
* Update CHANGELOG
* Fix formatting
* Apply suggestions
* Add/fix tokenizers_kwargs/transformer_kwargs in a few places; add documentation wherever it occurs
* Update CHANGELOG.md
  Co-authored-by: Evan Pete Walsh <[email protected]>
* Update bert_pooler.py transformer_kwargs documentation
* Apply suggestions from code review
  Co-authored-by: Evan Pete Walsh <[email protected]>
* Remove test_from_pretrained_kwargs_local_files_only_missing_from_cache test
* Use AllenNlpTestCase in cached_transformers_test.py

Co-authored-by: Evan Pete Walsh <[email protected]>
1 parent 175c76b commit d7c06fe

File tree

9 files changed: +112 −23 lines changed

CHANGELOG.md  (+2)

@@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   by adding a class-level variable called `authorized_missing_keys` to any PyTorch module that a `Model` uses.
   If defined, `authorized_missing_keys` should be a list of regex string patterns.
 - Added `FBetaMultiLabelMeasure`, a multi-label Fbeta metric. This is a subclass of the existing `FBetaMeasure`.
+- Added ability to pass additional key word arguments to `cached_transformers.get()`, which will be passed on to `AutoModel.from_pretrained()`.
+- Added an `overrides` argument to `Predictor.from_path()`.

 ### Changed


allennlp/common/cached_transformers.py  (+14 −3)

@@ -21,6 +21,7 @@ def get(
     make_copy: bool,
     override_weights_file: Optional[str] = None,
     override_weights_strip_prefix: Optional[str] = None,
+    **kwargs,
 ) -> transformers.PreTrainedModel:
     """
     Returns a transformer model from the cache.
@@ -74,9 +75,16 @@ def strip_prefix(s):
                     )
                 override_weights = {strip_prefix(k): override_weights[k] for k in valid_keys}

-            transformer = AutoModel.from_pretrained(model_name, state_dict=override_weights)
+            transformer = AutoModel.from_pretrained(
+                model_name,
+                state_dict=override_weights,
+                **kwargs,
+            )
         else:
-            transformer = AutoModel.from_pretrained(model_name)
+            transformer = AutoModel.from_pretrained(
+                model_name,
+                **kwargs,
+            )
         _model_cache[spec] = transformer
     if make_copy:
         import copy
@@ -95,6 +103,9 @@ def get_tokenizer(model_name: str, **kwargs) -> transformers.PreTrainedTokenizer
     global _tokenizer_cache
     tokenizer = _tokenizer_cache.get(cache_key, None)
     if tokenizer is None:
-        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, **kwargs)
+        tokenizer = transformers.AutoTokenizer.from_pretrained(
+            model_name,
+            **kwargs,
+        )
         _tokenizer_cache[cache_key] = tokenizer
     return tokenizer
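
For reference, a minimal usage sketch of the forwarded kwargs after this change (not part of the commit); the model name and cache directory are placeholder values:

```python
from allennlp.common import cached_transformers

# Extra keyword arguments are now forwarded to AutoModel.from_pretrained(...).
model = cached_transformers.get(
    "bert-base-uncased",        # placeholder model name
    make_copy=False,
    cache_dir="/tmp/hf-cache",  # placeholder cache directory
    local_files_only=False,
)

# get_tokenizer() already accepted **kwargs; they are forwarded to
# AutoTokenizer.from_pretrained(...).
tokenizer = cached_transformers.get_tokenizer(
    "bert-base-uncased",
    cache_dir="/tmp/hf-cache",
)
```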

allennlp/data/token_indexers/pretrained_transformer_indexer.py  (+15 −4)

@@ -1,4 +1,4 @@
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Any
 import logging
 import torch
 from allennlp.common.util import pad_sequence_to_length
@@ -38,14 +38,25 @@ class PretrainedTransformerIndexer(TokenIndexer):
         before feeding into the embedder. The embedder embeds these segments independently and
         concatenate the results to get the original document representation. Should be set to
         the same value as the `max_length` option on the `PretrainedTransformerEmbedder`.
-    """
+    tokenizer_kwargs : `Dict[str, Any]`, optional (default = `None`)
+        Dictionary with
+        [additional arguments](https://github.com/huggingface/transformers/blob/155c782a2ccd103cf63ad48a2becd7c76a7d2115/transformers/tokenization_utils.py#L691)
+        for `AutoTokenizer.from_pretrained`.
+    """  # noqa: E501

     def __init__(
-        self, model_name: str, namespace: str = "tags", max_length: int = None, **kwargs
+        self,
+        model_name: str,
+        namespace: str = "tags",
+        max_length: int = None,
+        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs,
     ) -> None:
         super().__init__(**kwargs)
         self._namespace = namespace
-        self._allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
+        self._allennlp_tokenizer = PretrainedTransformerTokenizer(
+            model_name, tokenizer_kwargs=tokenizer_kwargs
+        )
         self._tokenizer = self._allennlp_tokenizer.tokenizer
         self._added_to_vocabulary = False
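
A hedged sketch of the new `tokenizer_kwargs` parameter on the indexer; the model name and the kwargs values are placeholders:

```python
from allennlp.data.token_indexers import PretrainedTransformerIndexer

# tokenizer_kwargs is handed to the underlying PretrainedTransformerTokenizer,
# which passes it on to AutoTokenizer.from_pretrained.
indexer = PretrainedTransformerIndexer(
    model_name="bert-base-uncased",
    tokenizer_kwargs={"cache_dir": "/tmp/hf-cache", "local_files_only": False},
)
```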

allennlp/data/tokenizers/pretrained_transformer_tokenizer.py  (+1 −2)

@@ -51,11 +51,10 @@ class PretrainedTransformerTokenizer(Tokenizer):
         - 'only_first': Only truncate the first sequence
         - 'only_second': Only truncate the second sequence
         - 'do_not_truncate': Do not truncate (raise an error if the input sequence is longer than max_length)
-    tokenizer_kwargs: `Dict[str, Any]`
+    tokenizer_kwargs: `Dict[str, Any]`, optional (default = `None`)
         Dictionary with
         [additional arguments](https://github.com/huggingface/transformers/blob/155c782a2ccd103cf63ad48a2becd7c76a7d2115/transformers/tokenization_utils.py#L691)
         for `AutoTokenizer.from_pretrained`.
-
     """  # noqa: E501

     def __init__(
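
Correspondingly, a small sketch of passing `tokenizer_kwargs` directly to the tokenizer wrapper; the values are illustrative only:

```python
from allennlp.data.tokenizers import PretrainedTransformerTokenizer

# Per the docstring above, tokenizer_kwargs becomes extra arguments to
# AutoTokenizer.from_pretrained.
tokenizer = PretrainedTransformerTokenizer(
    "bert-base-uncased",
    tokenizer_kwargs={"cache_dir": "/tmp/hf-cache"},
)
tokens = tokenizer.tokenize("AllenNLP caches transformers.")
```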

allennlp/models/archival.py  (+3 −2)

@@ -8,6 +8,7 @@
 import tempfile
 import tarfile
 import shutil
+from pathlib import Path

 from torch.nn import Module

@@ -129,7 +130,7 @@ def archive_model(


 def load_archive(
-    archive_file: str,
+    archive_file: Union[str, Path],
     cuda_device: int = -1,
     overrides: str = "",
     weights_file: str = None,
@@ -139,7 +140,7 @@ def load_archive(

     # Parameters

-    archive_file : `str`
+    archive_file : `Union[str, Path]`
         The archive file to load the model from.
     cuda_device : `int`, optional (default = `-1`)
         If `cuda_device` is >= 0, the model will be loaded onto the
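
A quick sketch of the widened `archive_file` type; the archive path below is a placeholder:

```python
from pathlib import Path

from allennlp.models.archival import load_archive

# load_archive now accepts a pathlib.Path as well as a str.
archive = load_archive(Path("/path/to/model.tar.gz"), cuda_device=-1)
model = archive.model
```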

allennlp/modules/seq2vec_encoders/bert_pooler.py  (+13 −4)

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Dict, Any

 from overrides import overrides

@@ -31,7 +31,11 @@ class BertPooler(Seq2VecEncoder):
         Otherwise they will not.
     dropout : `float`, optional, (default = `0.0`)
         Amount of dropout to apply after pooling
-    """
+    transformer_kwargs: `Dict[str, Any]`, optional (default = `None`)
+        Dictionary with
+        [additional arguments](https://github.com/huggingface/transformers/blob/155c782a2ccd103cf63ad48a2becd7c76a7d2115/transformers/modeling_utils.py#L253)
+        for `AutoModel.from_pretrained`.
+    """  # noqa: E501

     def __init__(
         self,
@@ -40,14 +44,19 @@ def __init__(
         override_weights_file: Optional[str] = None,
         override_weights_strip_prefix: Optional[str] = None,
         requires_grad: bool = True,
-        dropout: float = 0.0
+        dropout: float = 0.0,
+        transformer_kwargs: Optional[Dict[str, Any]] = None,
     ) -> None:
         super().__init__()

         from allennlp.common import cached_transformers

         model = cached_transformers.get(
-            pretrained_model, False, override_weights_file, override_weights_strip_prefix
+            pretrained_model,
+            False,
+            override_weights_file,
+            override_weights_strip_prefix,
+            **(transformer_kwargs or {}),
         )

         self._dropout = torch.nn.Dropout(p=dropout)
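
An illustrative sketch of `BertPooler` with the new `transformer_kwargs` parameter; the model name and kwargs are placeholders:

```python
from allennlp.modules.seq2vec_encoders import BertPooler

# transformer_kwargs is unpacked into cached_transformers.get(), which forwards
# it to AutoModel.from_pretrained.
pooler = BertPooler(
    "bert-base-uncased",
    dropout=0.1,
    transformer_kwargs={"cache_dir": "/tmp/hf-cache"},
)
```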

allennlp/modules/token_embedders/pretrained_transformer_embedder.py  (+21 −4)

@@ -1,5 +1,5 @@
 import math
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Dict, Any

 from overrides import overrides

@@ -42,7 +42,15 @@ class PretrainedTransformerEmbedder(TokenEmbedder):
         is used.
     gradient_checkpointing: `bool`, optional (default = `None`)
         Enable or disable gradient checkpointing.
-    """
+    tokenizer_kwargs: `Dict[str, Any]`, optional (default = `None`)
+        Dictionary with
+        [additional arguments](https://github.com/huggingface/transformers/blob/155c782a2ccd103cf63ad48a2becd7c76a7d2115/transformers/tokenization_utils.py#L691)
+        for `AutoTokenizer.from_pretrained`.
+    transformer_kwargs: `Dict[str, Any]`, optional (default = `None`)
+        Dictionary with
+        [additional arguments](https://github.com/huggingface/transformers/blob/155c782a2ccd103cf63ad48a2becd7c76a7d2115/transformers/modeling_utils.py#L253)
+        for `AutoModel.from_pretrained`.
+    """  # noqa: E501

     authorized_missing_keys = [r"position_ids$"]

@@ -57,12 +65,18 @@ def __init__(
         override_weights_file: Optional[str] = None,
         override_weights_strip_prefix: Optional[str] = None,
         gradient_checkpointing: Optional[bool] = None,
+        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
+        transformer_kwargs: Optional[Dict[str, Any]] = None,
     ) -> None:
         super().__init__()
         from allennlp.common import cached_transformers

         self.transformer_model = cached_transformers.get(
-            model_name, True, override_weights_file, override_weights_strip_prefix
+            model_name,
+            True,
+            override_weights_file=override_weights_file,
+            override_weights_strip_prefix=override_weights_strip_prefix,
+            **(transformer_kwargs or {}),
         )

         if gradient_checkpointing is not None:
@@ -83,7 +97,10 @@ def __init__(
             self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
             self.config.output_hidden_states = True

-        tokenizer = PretrainedTransformerTokenizer(model_name)
+        tokenizer = PretrainedTransformerTokenizer(
+            model_name,
+            tokenizer_kwargs=tokenizer_kwargs,
+        )
         self._num_added_start_tokens = len(tokenizer.single_sequence_start_tokens)
         self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
         self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens
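
A sketch showing both new dictionaries on the embedder; the keys shown are just examples of arguments the respective `from_pretrained` calls accept:

```python
from allennlp.modules.token_embedders import PretrainedTransformerEmbedder

embedder = PretrainedTransformerEmbedder(
    model_name="bert-base-uncased",
    # Forwarded to AutoTokenizer.from_pretrained via PretrainedTransformerTokenizer.
    tokenizer_kwargs={"cache_dir": "/tmp/hf-cache"},
    # Forwarded to AutoModel.from_pretrained via cached_transformers.get.
    transformer_kwargs={"cache_dir": "/tmp/hf-cache", "local_files_only": False},
)
```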

allennlp/predictors/predictor.py  (+8 −4)

@@ -1,7 +1,8 @@
-from typing import List, Iterator, Dict, Tuple, Any, Type
+from typing import List, Iterator, Dict, Tuple, Any, Type, Union
 import json
 import re
 from contextlib import contextmanager
+from pathlib import Path

 import numpy
 from torch.utils.hooks import RemovableHandle
@@ -232,12 +233,13 @@ def _batch_json_to_instances(self, json_dicts: List[JsonDict]) -> List[Instance]
     @classmethod
     def from_path(
         cls,
-        archive_path: str,
+        archive_path: Union[str, Path],
         predictor_name: str = None,
         cuda_device: int = -1,
         dataset_reader_to_load: str = "validation",
         frozen: bool = True,
         import_plugins: bool = True,
+        overrides: str = "",
     ) -> "Predictor":
         """
         Instantiate a `Predictor` from an archive path.
@@ -247,7 +249,7 @@ def from_path(

         # Parameters

-        archive_path : `str`
+        archive_path : `Union[str, Path]`
             The path to the archive.
         predictor_name : `str`, optional (default=`None`)
             Name that the predictor is registered as, or None to use the
@@ -265,6 +267,8 @@ def from_path(
             This comes with additional overhead, but means you don't need to explicitly
             import the modules that your predictor depends on as long as those modules
             can be found by `allennlp.common.plugins.import_plugins()`.
+        overrides : `str`, optional (default = `""`)
+            JSON overrides to apply to the unarchived `Params` object.

         # Returns
@@ -274,7 +278,7 @@ def from_path(
         if import_plugins:
             plugins.import_plugins()
         return Predictor.from_archive(
-            load_archive(archive_path, cuda_device=cuda_device),
+            load_archive(archive_path, cuda_device=cuda_device, overrides=overrides),
             predictor_name,
             dataset_reader_to_load=dataset_reader_to_load,
             frozen=frozen,
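
A short sketch of the new `overrides` argument; the archive path and the override key are placeholders:

```python
from allennlp.predictors import Predictor

# overrides is a JSON string applied to the archived Params before the model
# is loaded (the same semantics as load_archive's overrides argument).
predictor = Predictor.from_path(
    "/path/to/model.tar.gz",
    overrides='{"dataset_reader.lazy": true}',
)
```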

cached_transformers_test.py  (new file, +35)

@@ -0,0 +1,35 @@
+import pytest
+
+from allennlp.common import cached_transformers
+from allennlp.common.testing import AllenNlpTestCase
+
+
+class TestCachedTransformers(AllenNlpTestCase):
+    def test_get_missing_from_cache_local_files_only(self):
+        with pytest.raises(ValueError) as execinfo:
+            cached_transformers.get(
+                "bert-base-uncased",
+                True,
+                cache_dir=self.TEST_DIR,
+                local_files_only=True,
+            )
+        assert str(execinfo.value) == (
+            "Cannot find the requested files in the cached path and "
+            "outgoing traffic has been disabled. To enable model "
+            "look-ups and downloads online, set 'local_files_only' "
+            "to False."
+        )
+
+    def test_get_tokenizer_missing_from_cache_local_files_only(self):
+        with pytest.raises(ValueError) as execinfo:
+            cached_transformers.get_tokenizer(
+                "bert-base-uncased",
+                cache_dir=self.TEST_DIR,
+                local_files_only=True,
+            )
+        assert str(execinfo.value) == (
+            "Cannot find the requested files in the cached path and "
+            "outgoing traffic has been disabled. To enable model "
+            "look-ups and downloads online, set 'local_files_only' "
+            "to False."
+        )
