This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Commit 5a07009 (1 parent: 351941f)

Fix RoBERTa SST (#4548)

* Fix RobertaSST
* Fix unrelated formatting issue
* Changelog
* Be slightly more flexible about tokens

File tree

4 files changed: +12 −5 lines changed


CHANGELOG.md

+3
@@ -17,6 +17,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   Also, when `max_length` was set to a non-`None` value, several warnings would appear
   for certain transformer models around the use of the `truncation` parameter.
 - Fixed evaluation of all metrics when using distributed training.
+- Fixed problem with automatically detecting whether tokenization is necessary.
+  This affected primarily the Roberta SST model.
+
 
 ## [v1.1.0rc2](https://github.com/allenai/allennlp/releases/tag/v1.1.0rc2) - 2020-07-31
 

allennlp/common/tqdm.py

+1 −2

@@ -3,6 +3,7 @@
 global defaults for certain tqdm parameters.
 """
 import logging
+from allennlp.common import logging as common_logging
 import sys
 from time import time
 from typing import Optional
@@ -17,8 +18,6 @@
 else:
     from tqdm import tqdm as _tqdm
 
-from allennlp.common import logging as common_logging
-
 
 # This is necessary to stop tqdm from hanging
 # when exceptions are raised inside iterators.

allennlp/data/tokenizers/token.py

+3
@@ -81,6 +81,9 @@ def __init__(
         text_id: int = None,
         type_id: int = None,
     ) -> None:
+        assert text is None or isinstance(
+            text, str
+        )  # Some very hard to debug errors happen when this is not true.
         self.text = text
         self.idx = idx
         self.idx_end = idx_end

allennlp/predictors/text_classifier.py

+5 −3

@@ -30,9 +30,11 @@ def _json_to_instance(self, json_dict: JsonDict) -> Instance:
         Runs the underlying model, and adds the `"label"` to the output.
         """
         sentence = json_dict["sentence"]
-        if not hasattr(self._dataset_reader, "tokenizer") and not hasattr(
-            self._dataset_reader, "_tokenizer"
-        ):
+        reader_has_tokenizer = (
+            getattr(self._dataset_reader, "tokenizer", None) is not None
+            or getattr(self._dataset_reader, "_tokenizer", None) is not None
+        )
+        if not reader_has_tokenizer:
             tokenizer = SpacyTokenizer()
             sentence = tokenizer.tokenize(sentence)
         return self._dataset_reader.text_to_instance(sentence)
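The predictor fix above replaces `hasattr` with `getattr(..., None) is not None`. The distinction matters because a dataset reader may define a tokenizer attribute and leave it set to `None`. A minimal sketch, using a hypothetical `Reader` stand-in rather than any real allennlp dataset reader:

```python
class Reader:
    """Hypothetical stand-in for a dataset reader with an unset tokenizer."""

    def __init__(self) -> None:
        self._tokenizer = None  # attribute exists, but no tokenizer is configured


reader = Reader()

# hasattr sees the attribute and reports True, so the old check would skip
# creating a fallback tokenizer even though none is actually configured:
print(hasattr(reader, "_tokenizer"))

# getattr with a None default distinguishes "set to a real tokenizer" from
# "missing or None", which is what the fixed code tests:
print(getattr(reader, "_tokenizer", None) is not None)
```

With the old `hasattr` check, a reader like this would never get the `SpacyTokenizer` fallback, and the raw string would be passed to `text_to_instance` untokenized, which is the Roberta SST failure this commit addresses.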
