This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Commit 55cfb47 (1 parent: 990c9c1)

The truncation setting doesn't do anything anymore (#4672)

* The truncation setting doesn't do anything anymore
* Changelog

2 files changed: +7 −11 lines

CHANGELOG.md (+2 lines)

```diff
@@ -30,6 +30,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - `transformers` dependency updated to version 3.1.0.
 - When `cached_path` is called on a local archive with `extract_archive=True`, the archive is now extracted into a unique subdirectory of the cache root instead of a subdirectory of the archive's directory. The extraction directory is also unique to the modification time of the archive, so if the file changes, subsequent calls to `cached_path` will know to re-extract the archive.
+- Removed the `truncation_strategy` parameter to `PretrainedTransformerTokenizer`. The way we're calling the tokenizer, the truncation strategy takes no effect anyways.
 
 ### Fixed
 
@@ -46,6 +47,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fixed a bug in our doc building script where markdown links did not render properly
 if the "href" part of the link (the part inside the `()`) was on a new line.
 
+
 ## [v1.1.0](https://github.com/allenai/allennlp/releases/tag/v1.1.0) - 2020-09-08
 
 ### Fixed
```
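The rationale behind that changelog entry is that `PretrainedTransformerTokenizer.tokenize()` only ever encodes a single sequence (its docstring, visible in the diff below, says as much), so the strategies that distinguish a first from a second sequence all collapse to the same behavior and the `truncation_strategy` choice is moot. A minimal sketch of that claim, not part of the commit, assuming a recent `transformers` release whose `encode_plus` accepts a `truncation` argument:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
text = "a long sentence " * 50  # long enough to force truncation at max_length=32

# With only one input sequence, 'longest_first' and 'only_first' truncate the
# same way, and 'only_second' has nothing to act on, so the strategy is irrelevant.
longest_first = tokenizer.encode_plus(
    text, max_length=32, truncation="longest_first", add_special_tokens=False
)["input_ids"]
only_first = tokenizer.encode_plus(
    text, max_length=32, truncation="only_first", add_special_tokens=False
)["input_ids"]

assert longest_first == only_first
```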

allennlp/data/tokenizers/pretrained_transformer_tokenizer.py (+5 −11 lines)

```diff
@@ -44,13 +44,6 @@ class PretrainedTransformerTokenizer(Tokenizer):
     stride : `int`, optional (default=`0`)
         If set to a number along with max_length, the overflowing tokens returned will contain some tokens
         from the main sequence returned. The value of this argument defines the number of additional tokens.
-    truncation_strategy : `str`, optional (default=`'longest_first'`)
-        String selected in the following options:
-        - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
-          starting from the longest one at each token (when there is a pair of input sequences)
-        - 'only_first': Only truncate the first sequence
-        - 'only_second': Only truncate the second sequence
-        - 'do_not_truncate': Do not truncate (raise an error if the input sequence is longer than max_length)
     tokenizer_kwargs: `Dict[str, Any]`, optional (default = `None`)
         Dictionary with
         [additional arguments](https://github.com/huggingface/transformers/blob/155c782a2ccd103cf63ad48a2becd7c76a7d2115/transformers/tokenization_utils.py#L691)
@@ -63,7 +56,6 @@ def __init__(
         add_special_tokens: bool = True,
         max_length: Optional[int] = None,
         stride: int = 0,
-        truncation_strategy: str = "longest_first",
         tokenizer_kwargs: Optional[Dict[str, Any]] = None,
     ) -> None:
         if tokenizer_kwargs is None:
@@ -82,7 +74,6 @@ def __init__(
         self._add_special_tokens = add_special_tokens
         self._max_length = max_length
         self._stride = stride
-        self._truncation_strategy = truncation_strategy
 
         self._tokenizer_lowercases = self.tokenizer_lowercases(self.tokenizer)
 
@@ -230,12 +221,15 @@ def tokenize(self, text: str) -> List[Token]:
         """
         This method only handles a single sentence (or sequence) of text.
         """
+        max_length = self._max_length
+        if max_length is not None and self._add_special_tokens:
+            max_length -= self.num_special_tokens_for_sequence()
+
         encoded_tokens = self.tokenizer.encode_plus(
             text=text,
             add_special_tokens=False,
-            max_length=self._max_length,
+            max_length=max_length,
             stride=self._stride,
-            truncation=self._truncation_strategy if self._max_length is not None else False,
             return_tensors=None,
             return_offsets_mapping=self.tokenizer.is_fast,
             return_attention_mask=False,
```
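The net effect of the last hunk: `encode_plus` is still called with `add_special_tokens=False` (AllenNLP inserts the special tokens itself afterwards), so the length budget passed to HuggingFace now has the count of those future special tokens subtracted from it instead of relying on a truncation strategy. Below is a self-contained sketch of that call pattern, not the AllenNLP implementation itself; it swaps AllenNLP's `num_special_tokens_for_sequence()` helper for the HuggingFace equivalent `num_special_tokens_to_add()` and passes `truncation` explicitly, which the commit does not:

```python
from typing import Any, Dict, Optional

from transformers import AutoTokenizer, PreTrainedTokenizerBase


def encode_reserving_special_tokens(
    tokenizer: PreTrainedTokenizerBase,
    text: str,
    max_length: Optional[int] = None,
    stride: int = 0,
    will_add_special_tokens: bool = True,
) -> Dict[str, Any]:
    """Encode `text` without special tokens, leaving room for them in max_length."""
    if max_length is not None and will_add_special_tokens:
        # Reserve space for the [CLS]/[SEP] (or <s>/</s>) tokens the caller adds later,
        # mirroring the commit's max_length adjustment.
        max_length -= tokenizer.num_special_tokens_to_add(pair=False)

    return tokenizer.encode_plus(
        text=text,
        add_special_tokens=False,           # the caller handles special tokens itself
        max_length=max_length,
        stride=stride,
        truncation=max_length is not None,  # explicit here so the sketch truncates; the commit omits this argument
        return_tensors=None,
        return_offsets_mapping=tokenizer.is_fast,
        return_attention_mask=False,
    )


if __name__ == "__main__":
    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    encoded = encode_reserving_special_tokens(
        tok, "AllenNLP wraps HuggingFace tokenizers.", max_length=10
    )
    print(encoded["input_ids"])  # at most 10 - 2 = 8 ids for a BERT-style tokenizer
```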
