
Commit edf2681

Add Tokenizer Test, complete Documentation
Signed-off-by: Abhishek P (VMware) <[email protected]>
1 parent a84c9a8 commit edf2681

File tree: 2 files changed, +62 −39 lines changed


allennlp/data/dataset_readers/huggingface_datasets_reader.py

+27-12
@@ -8,13 +8,12 @@
 from datasets.features import Value
 
 
-# TODO pab-vmware complete the documentation comments
+@DatasetReader.register("huggingface-datasets")
 class HuggingfaceDatasetReader(DatasetReader):
     """
+    Reads instances from the given huggingface supported dataset
+
     This reader implementation wraps the huggingface datasets package
-    to utilize it's dataset management functionality and load the information in AllenNLP friendly formats
-    Note: Reader works w.r.t to only one split of the dataset,
-    i.e. you would need to create separate reader for separate splits
 
     Following dataset and configurations have been verified and work with this reader
 
@@ -39,10 +38,20 @@ class HuggingfaceDatasetReader(DatasetReader):
     `trec` `NA`
     `emotion` `NA`
 
-    # Parameters
-    dataset_name : `str`
-    config_name : `str`, optional (default=`None`)
-    pre_load : `bool`, optional (default='False`)
+    Registered as a `DatasetReader` with name `huggingface-datasets`
+
+    # Parameters
+
+    dataset_name : `str`
+        Name of the dataset from huggingface datasets the reader will be used for
+    config_name : `str`, optional (default=`None`)
+        Configuration(mandatory for some datasets) of the dataset
+    pre_load : `bool`, optional (default='False`)
+        If `True` all splits for the dataset is loaded(includes download etc) as part of the initialization,
+        otherwise each split is loaded on when `read()` is used for the same for the first time
+    tokenizer : `Tokenizer`, optional (default=`None`)
+        If specified is used for tokenization of string and text fields from the dataset
+        This is useful since Text in allennlp is dealt with as a series of tokens.
     """
 
     SUPPORTED_SPLITS = [Split.TRAIN, Split.TEST, Split.VALIDATION]
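
The documented parameters map one-to-one onto the reader's constructor. A minimal usage sketch, assuming the glue/cola combination listed among the verified datasets and the WhitespaceTokenizer used by the new test (network access to huggingface datasets is needed for the actual download):

    from allennlp.data.dataset_readers.huggingface_datasets_reader import HuggingfaceDatasetReader
    from allennlp.data.tokenizers import WhitespaceTokenizer

    # One reader handles one huggingface dataset/config pair.
    reader = HuggingfaceDatasetReader(
        dataset_name="glue",              # name of the huggingface dataset
        config_name="cola",               # configuration, mandatory for some datasets
        tokenizer=WhitespaceTokenizer(),  # optional, splits text features into tokens
    )
    # read() takes the split name instead of a file path.
    instances = list(reader.read("train"))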
@@ -62,6 +71,10 @@ def __init__(
         )
 
         # It would be cleaner to create a separate reader object for diferent dataset
+        if dataset_name not in load_dataset():
+            raise NotImplementedError(
+                f"Dataset {dataset_name} does not seem to available in huggingface datasets"
+            )
         self.dataset: DatasetDict = DatasetDict()
         self.dataset_name = dataset_name
         self.config_name = config_name
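
The added guard is meant to fail fast when the requested dataset name is unknown to the huggingface datasets package. A sketch of the behaviour it is intended to provide (the dataset name below is deliberately bogus and purely illustrative):

    import pytest

    # Constructing the reader with an unknown dataset name is expected to
    # raise NotImplementedError during initialization.
    with pytest.raises(NotImplementedError):
        HuggingfaceDatasetReader(dataset_name="this-dataset-does-not-exist")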
@@ -105,7 +118,7 @@ def _read(self, file_path: str) -> Iterable[Instance]:
     def raise_feature_not_supported_value_error(self, value):
         raise ValueError(f"Datasets feature type {type(value)} is not supported yet.")
 
-    def text_to_instance(self, *inputs) -> Instance:
+    def text_to_instance(self, *inputs) -> Instance:
         """
         Takes care of converting dataset entry into AllenNLP friendly instance
         Currently it is implemented in an unseemly catch-up model
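
Later in this diff `inputs[1][feature]` is indexed like a raw dataset entry, so the second positional argument appears to be the entry itself. A hedged sketch of a direct call under that assumption (the reader normally calls this internally from read(), and the exact argument order is not shown in this hunk):

    entry = reader.dataset["train"][0]                  # one raw huggingface entry
    instance = reader.text_to_instance("train", entry)  # assumed (split, entry) order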
@@ -175,7 +188,7 @@ def text_to_instance(self, *inputs) -> Instance:
             # We do not know if the string is token or text, we will assume text and make each a TextField
             # datasets.features.Sequence of strings maps to ListField of TextField
             if value.feature.dtype == "string":
-                field_list = list()
+                field_list = list[TextField]()
                 for item in inputs[1][feature]:
                     # If tokenizer is provided we will use it to split it to tokens
                     # Else put whole text as a single token
@@ -188,7 +201,7 @@ def text_to_instance(self, *inputs) -> Instance:
 
                     item_field = TextField(tokens)
                     field_list.append(item_field)
-                # TODO pab-vmware detect that field_list has different types of object and throw apt error
+
                 fields_to_be_added[feature] = ListField(field_list)
 
             # datasets Sequence of strings to ListField of LabelField
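
The comments above describe how a datasets Sequence of strings is mapped; a small stand-alone sketch of that mapping with the same AllenNLP field types and a WhitespaceTokenizer (the example strings are made up):

    from allennlp.data.fields import ListField, TextField
    from allennlp.data.tokenizers import WhitespaceTokenizer

    tokenizer = WhitespaceTokenizer()
    strings = ["The quick brown fox", "jumps over the lazy dog"]  # example feature values

    # Each string becomes a TextField of its tokens; the whole sequence
    # becomes a ListField of those TextFields.
    field_list = [TextField(tokenizer.tokenize(text)) for text in strings]
    list_field = ListField(field_list)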
@@ -268,4 +281,6 @@ def text_to_instance(self, *inputs) -> Instance:
             return Instance(fields)
 
         else:
-            raise RuntimeError(f"Dataset was not loaded due to unknown error")
+            raise RuntimeError(
+                f"Dataset split {split} was not loaded as expected due to unknown error"
+            )

tests/data/dataset_readers/huggingface_datasets_test.py

+35-27
@@ -1,19 +1,27 @@
 import pytest
+from allennlp.data import Tokenizer
 
 from allennlp.data.dataset_readers.huggingface_datasets_reader import HuggingfaceDatasetReader
+from allennlp.data.tokenizers import WhitespaceTokenizer
 
 
-# TODO add UT were we compare huggingface wrapped reader with an explicitly coded dataset
-# TODO pab-vmware Add test with tokenizer and validate the same
+# TODO Add test where we compare huggingface wrapped reader with an explicitly coded dataset
+# TODO pab-vmware/Abhishek-P Add test where we load conll2003 and test it
+# the way tested for conll2003 specific reader
 class HuggingfaceDatasetReaderTest:
     """
-    Running the tests for supported datasets which require config name to be specified
+    Test read for some lightweight datasets
     """
 
     @pytest.mark.parametrize(
-        "dataset, config, split", (("glue", "cola", "train"), ("glue", "cola", "test"))
+        "dataset, config, split",
+        (
+            ("glue", "cola", "train"),
+            ("glue", "cola", "test"),
+            ("universal_dependencies", "en_lines", "validation"),
+        ),
     )
-    def test_read_for_datasets_requiring_config(self, dataset, config, split):
+    def test_read(self, dataset, config, split):
         huggingface_reader = HuggingfaceDatasetReader(dataset_name=dataset, config_name=config)
         instances = list(huggingface_reader.read(split))
         # Confirm instance were made for all rows
@@ -24,16 +32,35 @@ def test_read_for_datasets_requiring_config(self, dataset, config, split):
 
         # Confirm all features were mapped
         assert len(instance.fields) == len(entry)
-        print(entry)
-        print(instance)
+
+    def test_read_with_tokenizer(self):
+        dataset = "glue"
+        config = "cola"
+        split = "train"
+        tokenizer: Tokenizer = WhitespaceTokenizer()
+        huggingface_reader = HuggingfaceDatasetReader(
+            dataset_name=dataset, config_name=config, tokenizer=tokenizer
+        )
+        instances = list(huggingface_reader.read(split))
+        # Confirm instance were made for all rows
+        assert len(instances) == len(huggingface_reader.dataset[split])
+
+        entry = huggingface_reader.dataset[split][0]
+        instance = instances[0]
+
+        # Confirm all features were mapped
+        assert len(instance.fields) == len(entry)
+
+        # Confirm it was tokenized
+        assert len(instance["sentence"]) > 1
 
     """
     Test mapping of the datasets.feature.Translation and datasets.feature.TranslationVariableLanguages
     """
 
     @pytest.mark.skip(
         reason="Here to help developers validate the reader locally,"
-        + "this should not be run by default since it downloads 10MB file"
+        + "this should not be run by default since it downloads 10MB file"
     )
     def test_xnli_all_languages(self):
         dataset = "xnli"
@@ -48,22 +75,3 @@ def test_xnli_all_languages(self):
         # datasets.features.TranslationVariableLanguages into two fields each
         # For XNLI that means 3 fields become 5
         assert len(instance.fields) == 5
-
-    """
-    Test mapping of the datasets.features.Sequence with single type
-    """
-
-    def test_universal_dependencies_en_lines(self):
-        dataset = "universal_dependencies"
-        config = "en_lines"
-        split = "validation"
-        huggingface_reader = HuggingfaceDatasetReader(dataset_name=dataset, config_name=config)
-        instances = list(huggingface_reader.read(split))
-        # Confirm instance were made for all rows
-        assert len(instances) == len(huggingface_reader.dataset[split])
-
-        entry = huggingface_reader.dataset[split][0]
-        instance = instances[0]
-
-        # Confirm all features were mapped
-        assert len(instance.fields) == len(entry)

0 commit comments
