Adding tokenizer_kwargs argument to PretrainedTransformerBackbone constructor. (#4944)

pvcastro · web-flow · commit f0ae9f3c8c27 · 2021-01-29T09:31:19.000-08:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
-- Added `transformer_kwargs` argument to `PretrainedTransformerBackbone`
+- Added `tokenizer_kwargs` and `transformer_kwargs` arguments to `PretrainedTransformerBackbone`
 
 ## [v2.0.0](https://github.com/allenai/allennlp/releases/tag/v2.0.0) - 2021-01-27
 
diff --git a/allennlp/modules/backbones/pretrained_transformer_backbone.py b/allennlp/modules/backbones/pretrained_transformer_backbone.py
@@ -47,6 +47,10 @@ class PretrainedTransformerBackbone(Backbone):
         When `True` (the default), only the final layer of the pretrained transformer is taken
         for the embeddings. But if set to `False`, a scalar mix of all of the layers
         is used.
+    tokenizer_kwargs: `Dict[str, Any]`, optional (default = `None`)
+        Dictionary with
+        [additional arguments](https://github.com/huggingface/transformers/blob/155c782a2ccd103cf63ad48a2becd7c76a7d2115/transformers/tokenization_utils.py#L691)
+        for `AutoTokenizer.from_pretrained`.
     transformer_kwargs: `Dict[str, Any]`, optional (default = `None`)
         Dictionary with
         [additional arguments](https://github.com/huggingface/transformers/blob/155c782a2ccd103cf63ad48a2becd7c76a7d2115/transformers/modeling_utils.py#L253)
@@ -72,6 +76,7 @@ def __init__(
         last_layer_only: bool = True,
         override_weights_file: Optional[str] = None,
         override_weights_strip_prefix: Optional[str] = None,
+        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
         transformer_kwargs: Optional[Dict[str, Any]] = None,
         output_token_strings: bool = True,
         vocab_namespace: str = "tags",
@@ -87,6 +92,7 @@ def __init__(
             last_layer_only=last_layer_only,
             override_weights_file=override_weights_file,
             override_weights_strip_prefix=override_weights_strip_prefix,
+            tokenizer_kwargs=tokenizer_kwargs,
             transformer_kwargs=transformer_kwargs,
         )
         self._output_token_strings = output_token_strings