@@ -1,19 +1,27 @@
 import pytest
+from allennlp.data import Tokenizer
 
 from allennlp.data.dataset_readers.huggingface_datasets_reader import HuggingfaceDatasetReader
+from allennlp.data.tokenizers import WhitespaceTokenizer
 
 
-# TODO add UT were we compare huggingface wrapped reader with an explicitly coded dataset
-# TODO pab-vmware Add test with tokenizer and validate the same
+# TODO Add test where we compare huggingface wrapped reader with an explicitly coded dataset
+# TODO pab-vmware/Abhishek-P Add test where we load conll2003 and test it
+# the way tested for conll2003 specific reader
 class HuggingfaceDatasetReaderTest:
     """
-    Running the tests for supported datasets which require config name to be specified
+    Test read for some lightweight datasets
     """
 
     @pytest.mark.parametrize(
-        "dataset, config, split", (("glue", "cola", "train"), ("glue", "cola", "test"))
+        "dataset, config, split",
+        (
+            ("glue", "cola", "train"),
+            ("glue", "cola", "test"),
+            ("universal_dependencies", "en_lines", "validation"),
+        ),
     )
-    def test_read_for_datasets_requiring_config(self, dataset, config, split):
+    def test_read(self, dataset, config, split):
         huggingface_reader = HuggingfaceDatasetReader(dataset_name=dataset, config_name=config)
         instances = list(huggingface_reader.read(split))
         # Confirm instance were made for all rows
@@ -24,16 +32,35 @@ def test_read_for_datasets_requiring_config(self, dataset, config, split):
 
         # Confirm all features were mapped
         assert len(instance.fields) == len(entry)
-        print(entry)
-        print(instance)
+
+    def test_read_with_tokenizer(self):
+        dataset = "glue"
+        config = "cola"
+        split = "train"
+        tokenizer: Tokenizer = WhitespaceTokenizer()
+        huggingface_reader = HuggingfaceDatasetReader(
+            dataset_name=dataset, config_name=config, tokenizer=tokenizer
+        )
+        instances = list(huggingface_reader.read(split))
+        # Confirm instance were made for all rows
+        assert len(instances) == len(huggingface_reader.dataset[split])
+
+        entry = huggingface_reader.dataset[split][0]
+        instance = instances[0]
+
+        # Confirm all features were mapped
+        assert len(instance.fields) == len(entry)
+
+        # Confirm it was tokenized
+        assert len(instance["sentence"]) > 1
 
     """
     Test mapping of the datasets.feature.Translation and datasets.feature.TranslationVariableLanguages
     """
 
     @pytest.mark.skip(
         reason="Here to help developers validate the reader locally,"
-    + "this should not be run by default since it downloads 10MB file"
+        + "this should not be run by default since it downloads 10MB file"
     )
     def test_xnli_all_languages(self):
         dataset = "xnli"
@@ -48,22 +75,3 @@ def test_xnli_all_languages(self):
         # datasets.features.TranslationVariableLanguages into two fields each
         # For XNLI that means 3 fields become 5
         assert len(instance.fields) == 5
-
-    """
-    Test mapping of the datasets.features.Sequence with single type
-    """
-
-    def test_universal_dependencies_en_lines(self):
-        dataset = "universal_dependencies"
-        config = "en_lines"
-        split = "validation"
-        huggingface_reader = HuggingfaceDatasetReader(dataset_name=dataset, config_name=config)
-        instances = list(huggingface_reader.read(split))
-        # Confirm instance were made for all rows
-        assert len(instances) == len(huggingface_reader.dataset[split])
-
-        entry = huggingface_reader.dataset[split][0]
-        instance = instances[0]
-
-        # Confirm all features were mapped
-        assert len(instance.fields) == len(entry)
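
For reference, a minimal usage sketch of the reader these tests exercise. It uses only the constructor arguments and calls that appear in the diff above, and assumes an environment with allennlp and its Huggingface datasets dependency installed; the GLUE/CoLA data is downloaded on first read.

from allennlp.data.dataset_readers.huggingface_datasets_reader import HuggingfaceDatasetReader
from allennlp.data.tokenizers import WhitespaceTokenizer

# Wrap the Huggingface "glue"/"cola" dataset; string features are split
# with the given tokenizer (the data is fetched on first use).
reader = HuggingfaceDatasetReader(
    dataset_name="glue", config_name="cola", tokenizer=WhitespaceTokenizer()
)
instances = list(reader.read("train"))

# One AllenNLP Instance per dataset row, one field per dataset feature.
instance = instances[0]
print(len(instances), list(instance.fields))
print(instance["sentence"])  # presumably a TextField of whitespace tokens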