+import typing
 from typing import Iterable, Optional

 from allennlp.data import DatasetReader, Token, Field, Tokenizer
 from allennlp.data.fields import TextField, LabelField, ListField
 from allennlp.data.instance import Instance
-from datasets import load_dataset, DatasetDict, Split
+from datasets import load_dataset, DatasetDict, Split, list_datasets
 from datasets.features import ClassLabel, Sequence, Translation, TranslationVariableLanguages
 from datasets.features import Value
@@ -43,15 +44,15 @@ class HuggingfaceDatasetReader(DatasetReader):
     # Parameters

     dataset_name : `str`
-        Name of the dataset from huggingface datasets the reader will be used for
-    config_name : `str`, optional (default=`None`)
-        Configuration(mandatory for some datasets) of the dataset
-    pre_load : `bool`, optional (default=' False`)
+        Name of the dataset from huggingface datasets the reader will be used for.
+    config_name : `str`, optional (default=`None`)
+        Configuration (mandatory for some datasets) of the dataset.
+    preload : `bool`, optional (default=`False`)
         If `True`, all splits of the dataset are loaded (including download etc.) as part of initialization,
-        otherwise each split is loaded on when `read()` is used for the same for the first time
-    tokenizer : `Tokenizer`, optional (default=`None`)
-        If specified is used for tokenization of string and text fields from the dataset
-        This is useful since Text in allennlp is dealt with as a series of tokens.
+        otherwise each split is loaded when `read()` is first called for it.
+    tokenizer : `Tokenizer`, optional (default=`None`)
+        If specified, it is used to tokenize string and text fields from the dataset.
+        This is useful since text in allennlp is dealt with as a series of tokens.
     """

     SUPPORTED_SPLITS = [Split.TRAIN, Split.TEST, Split.VALIDATION]
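For orientation, here is a minimal usage sketch of the reader as it looks after this change; the dataset name "squad" and the "train" split string are illustrative assumptions, not taken from the diff:

    # Hypothetical usage of the reader documented above.
    reader = HuggingfaceDatasetReader(dataset_name="squad", preload=False)
    # With preload=False, each split is downloaded lazily on the first read() call for it.
    for instance in reader.read("train"):
        print(instance)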
@@ -60,7 +61,7 @@ def __init__(
         self,
         dataset_name: str = None,
         config_name: Optional[str] = None,
-        pre_load: Optional[bool] = False,
+        preload: Optional[bool] = False,
         tokenizer: Optional[Tokenizer] = None,
         **kwargs,
     ) -> None:
@@ -71,17 +72,17 @@ def __init__(
         )

         # It would be cleaner to create a separate reader object for a different dataset
-        if dataset_name not in load_dataset():
-            raise NotImplementedError(
+        if dataset_name not in list_datasets():
+            raise ValueError(
                 f"Dataset {dataset_name} does not seem to be available in huggingface datasets"
             )
         self.dataset: DatasetDict = DatasetDict()
         self.dataset_name = dataset_name
         self.config_name = config_name
         self.tokenizer = tokenizer

-        if pre_load:
-            load_dataset()
+        if preload:
+            self.load_dataset()

     def load_dataset(self):
         if self.config_name is not None:
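The old guard called `load_dataset()` (which downloads a dataset) where a membership test over dataset names was intended; `list_datasets()` returns exactly that. A short illustration of the two huggingface `datasets` calls, with "squad" as an example name only:

    from datasets import list_datasets, load_dataset

    names = list_datasets()       # identifiers of all datasets on the hub
    assert "squad" in names
    data = load_dataset("squad")  # downloads the data, returns a DatasetDict of splits
    print(data["train"][0])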
@@ -152,7 +153,7 @@ def text_to_instance(self, *inputs) -> Instance:
         # TODO we need to support all the different dataset features described
         # in https://huggingface.co/docs/datasets/features.html
         for feature in features:
-            fields_to_be_added = dict[str, Field]()
+            fields_to_be_added: typing.Dict[str, Field] = dict()
             item_field: Field
             field_list: list
             value = features[feature]
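The reason for this substitution is not stated in the diff, but it matches Python version compatibility: subscripting builtin container types, as in `dict[str, Field]`, only works on Python 3.9+ (PEP 585), while the `typing` aliases work on older interpreters as well. A self-contained illustration:

    import typing

    # Python 3.9+ only (PEP 585); on 3.8 this raises
    # TypeError: 'type' object is not subscriptable
    #     counts = dict[str, int]()

    # Portable spelling, valid since Python 3.5:
    counts: typing.Dict[str, int] = dict()
    counts["tokens"] = 3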
@@ -188,21 +189,21 @@ def text_to_instance(self, *inputs) -> Instance:
             # We do not know if the string is a token or text, we will assume text and make each a TextField
             # datasets.features.Sequence of strings maps to ListField of TextField
             if value.feature.dtype == "string":
-                field_list = list[TextField]()
+                field_list2: typing.List[TextField] = list()
                 for item in inputs[1][feature]:
                     # If a tokenizer is provided we will use it to split the text into tokens
                     # Else put the whole text in as a single token
-                    tokens: list[Token]
+                    tokens: typing.List[Token]
                     if self.tokenizer is not None:
                         tokens = self.tokenizer.tokenize(item)

                     else:
                         tokens = [Token(item)]

                     item_field = TextField(tokens)
-                    field_list.append(item_field)
+                    field_list2.append(item_field)

-                fields_to_be_added[feature] = ListField(field_list)
+                fields_to_be_added[feature] = ListField(field_list2)

             # datasets Sequence of ClassLabels maps to ListField of LabelField
             elif isinstance(value.feature, ClassLabel):
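To make the output shape concrete, here is a standalone sketch of what the string branch builds for a sequence-of-strings feature; the sentence list is invented, while the allennlp classes are the ones imported at the top of this file:

    from allennlp.data import Token
    from allennlp.data.fields import ListField, TextField

    sentences = ["a red fox", "jumped over the fence"]
    # With no tokenizer configured, each whole string becomes a single Token,
    # and the sequence maps to a ListField of TextFields.
    fields = ListField([TextField([Token(s)]) for s in sentences])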