
Commit edf2681

Add Tokenizer Test, complete Documentation
Signed-off-by: Abhishek P (VMware) <[email protected]>
1 parent a84c9a8 commit edf2681

File tree: 2 files changed, +62 −39 lines changed


allennlp/data/dataset_readers/huggingface_datasets_reader.py

+27-12
@@ -8,13 +8,12 @@
 from datasets.features import Value
 
 
-# TODO pab-vmware complete the documentation comments
+@DatasetReader.register("huggingface-datasets")
 class HuggingfaceDatasetReader(DatasetReader):
     """
+    Reads instances from the given huggingface supported dataset
+
     This reader implementation wraps the huggingface datasets package
-    to utilize it's dataset management functionality and load the information in AllenNLP friendly formats
-    Note: Reader works w.r.t to only one split of the dataset,
-    i.e. you would need to create separate reader for separate splits
 
     Following dataset and configurations have been verified and work with this reader
 
@@ -39,10 +38,20 @@ class HuggingfaceDatasetReader(DatasetReader):
     `trec` `NA`
     `emotion` `NA`
 
-    # Parameters
-    dataset_name : `str`
-    config_name : `str`, optional (default=`None`)
-    pre_load : `bool`, optional (default='False`)
+    Registered as a `DatasetReader` with name `huggingface-datasets`
+
+    # Parameters
+
+    dataset_name : `str`
+        Name of the dataset from huggingface datasets the reader will be used for
+    config_name : `str`, optional (default=`None`)
+        Configuration(mandatory for some datasets) of the dataset
+    pre_load : `bool`, optional (default='False`)
+        If `True` all splits for the dataset is loaded(includes download etc) as part of the initialization,
+        otherwise each split is loaded on when `read()` is used for the same for the first time
+    tokenizer : `Tokenizer`, optional (default=`None`)
+        If specified is used for tokenization of string and text fields from the dataset
+        This is useful since Text in allennlp is dealt with as a series of tokens.
     """
 
     SUPPORTED_SPLITS = [Split.TRAIN, Split.TEST, Split.VALIDATION]
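
The documented parameters map one-to-one onto the reader's constructor. A minimal usage sketch, assuming the glue/cola combination listed among the verified datasets and the WhitespaceTokenizer used by the new test (network access to huggingface datasets is needed for the actual download):

    from allennlp.data.dataset_readers.huggingface_datasets_reader import HuggingfaceDatasetReader
    from allennlp.data.tokenizers import WhitespaceTokenizer

    # One reader handles one huggingface dataset/config pair.
    reader = HuggingfaceDatasetReader(
        dataset_name="glue",              # name of the huggingface dataset
        config_name="cola",               # configuration, mandatory for some datasets
        tokenizer=WhitespaceTokenizer(),  # optional, splits text features into tokens
    )
    # read() takes the split name instead of a file path.
    instances = list(reader.read("train"))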
@@ -62,6 +71,10 @@ def __init__(
         )
 
         # It would be cleaner to create a separate reader object for diferent dataset
+        if dataset_name not in load_dataset():
+            raise NotImplementedError(
+                f"Dataset {dataset_name} does not seem to available in huggingface datasets"
+            )
         self.dataset: DatasetDict = DatasetDict()
         self.dataset_name = dataset_name
         self.config_name = config_name
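
The added guard is meant to fail fast when the requested dataset name is unknown to the huggingface datasets package. A sketch of the behaviour it is intended to provide (the dataset name below is deliberately bogus and purely illustrative):

    import pytest

    # Constructing the reader with an unknown dataset name is expected to
    # raise NotImplementedError during initialization.
    with pytest.raises(NotImplementedError):
        HuggingfaceDatasetReader(dataset_name="this-dataset-does-not-exist")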
@@ -105,7 +118,7 @@ def _read(self, file_path: str) -> Iterable[Instance]:
     def raise_feature_not_supported_value_error(self, value):
         raise ValueError(f"Datasets feature type {type(value)} is not supported yet.")
 
-    def text_to_instance(self, *inputs) -> Instance:
+    def text_to_instance(self, *inputs) -> Instance:
         """
         Takes care of converting dataset entry into AllenNLP friendly instance
         Currently it is implemented in an unseemly catch-up model
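
Later in this diff `inputs[1][feature]` is indexed like a raw dataset entry, so the second positional argument appears to be the entry itself. A hedged sketch of a direct call under that assumption (the reader normally calls this internally from read(), and the exact argument order is not shown in this hunk):

    entry = reader.dataset["train"][0]                  # one raw huggingface entry
    instance = reader.text_to_instance("train", entry)  # assumed (split, entry) order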
@@ -175,7 +188,7 @@ def text_to_instance(self, *inputs) -> Instance:
             # We do not know if the string is token or text, we will assume text and make each a TextField
             # datasets.features.Sequence of strings maps to ListField of TextField
             if value.feature.dtype == "string":
-                field_list = list()
+                field_list = list[TextField]()
                 for item in inputs[1][feature]:
                     # If tokenizer is provided we will use it to split it to tokens
                     # Else put whole text as a single token
@@ -188,7 +201,7 @@ def text_to_instance(self, *inputs) -> Instance:
 
                     item_field = TextField(tokens)
                     field_list.append(item_field)
-                # TODO pab-vmware detect that field_list has different types of object and throw apt error
+
                 fields_to_be_added[feature] = ListField(field_list)
 
             # datasets Sequence of strings to ListField of LabelField
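
The comments above describe how a datasets Sequence of strings is mapped; a small stand-alone sketch of that mapping with the same AllenNLP field types and a WhitespaceTokenizer (the example strings are made up):

    from allennlp.data.fields import ListField, TextField
    from allennlp.data.tokenizers import WhitespaceTokenizer

    tokenizer = WhitespaceTokenizer()
    strings = ["The quick brown fox", "jumps over the lazy dog"]  # example feature values

    # Each string becomes a TextField of its tokens; the whole sequence
    # becomes a ListField of those TextFields.
    field_list = [TextField(tokenizer.tokenize(text)) for text in strings]
    list_field = ListField(field_list)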
@@ -268,4 +281,6 @@ def text_to_instance(self, *inputs) -> Instance:
             return Instance(fields)
 
         else:
-            raise RuntimeError(f"Dataset was not loaded due to unknown error")
+            raise RuntimeError(
+                f"Dataset split {split} was not loaded as expected due to unknown error"
+            )

tests/data/dataset_readers/huggingface_datasets_test.py

+35-27
@@ -1,19 +1,27 @@
 import pytest
+from allennlp.data import Tokenizer
 
 from allennlp.data.dataset_readers.huggingface_datasets_reader import HuggingfaceDatasetReader
+from allennlp.data.tokenizers import WhitespaceTokenizer
 
 
-# TODO add UT were we compare huggingface wrapped reader with an explicitly coded dataset
-# TODO pab-vmware Add test with tokenizer and validate the same
+# TODO Add test where we compare huggingface wrapped reader with an explicitly coded dataset
+# TODO pab-vmware/Abhishek-P Add test where we load conll2003 and test it
+# the way tested for conll2003 specific reader
 class HuggingfaceDatasetReaderTest:
     """
-    Running the tests for supported datasets which require config name to be specified
+    Test read for some lightweight datasets
     """
 
     @pytest.mark.parametrize(
-        "dataset, config, split", (("glue", "cola", "train"), ("glue", "cola", "test"))
+        "dataset, config, split",
+        (
+            ("glue", "cola", "train"),
+            ("glue", "cola", "test"),
+            ("universal_dependencies", "en_lines", "validation"),
+        ),
     )
-    def test_read_for_datasets_requiring_config(self, dataset, config, split):
+    def test_read(self, dataset, config, split):
         huggingface_reader = HuggingfaceDatasetReader(dataset_name=dataset, config_name=config)
         instances = list(huggingface_reader.read(split))
         # Confirm instance were made for all rows
@@ -24,16 +32,35 @@ def test_read_for_datasets_requiring_config(self, dataset, config, split):
 
         # Confirm all features were mapped
         assert len(instance.fields) == len(entry)
-        print(entry)
-        print(instance)
+
+    def test_read_with_tokenizer(self):
+        dataset = "glue"
+        config = "cola"
+        split = "train"
+        tokenizer: Tokenizer = WhitespaceTokenizer()
+        huggingface_reader = HuggingfaceDatasetReader(
+            dataset_name=dataset, config_name=config, tokenizer=tokenizer
+        )
+        instances = list(huggingface_reader.read(split))
+        # Confirm instance were made for all rows
+        assert len(instances) == len(huggingface_reader.dataset[split])
+
+        entry = huggingface_reader.dataset[split][0]
+        instance = instances[0]
+
+        # Confirm all features were mapped
+        assert len(instance.fields) == len(entry)
+
+        # Confirm it was tokenized
+        assert len(instance["sentence"]) > 1
 
     """
     Test mapping of the datasets.feature.Translation and datasets.feature.TranslationVariableLanguages
     """
 
     @pytest.mark.skip(
         reason="Here to help developers validate the reader locally,"
-        + "this should not be run by default since it downloads 10MB file"
+        + "this should not be run by default since it downloads 10MB file"
     )
     def test_xnli_all_languages(self):
         dataset = "xnli"
@@ -48,22 +75,3 @@ def test_xnli_all_languages(self):
         # datasets.features.TranslationVariableLanguages into two fields each
         # For XNLI that means 3 fields become 5
         assert len(instance.fields) == 5
-
-    """
-    Test mapping of the datasets.features.Sequence with single type
-    """
-
-    def test_universal_dependencies_en_lines(self):
-        dataset = "universal_dependencies"
-        config = "en_lines"
-        split = "validation"
-        huggingface_reader = HuggingfaceDatasetReader(dataset_name=dataset, config_name=config)
-        instances = list(huggingface_reader.read(split))
-        # Confirm instance were made for all rows
-        assert len(instances) == len(huggingface_reader.dataset[split])
-
-        entry = huggingface_reader.dataset[split][0]
-        instance = instances[0]
-
-        # Confirm all features were mapped
-        assert len(instance.fields) == len(entry)

0 commit comments
