+import typing
 from typing import Iterable, Optional

 from allennlp.data import DatasetReader, Token, Field, Tokenizer
 from allennlp.data.fields import TextField, LabelField, ListField
 from allennlp.data.instance import Instance
-from datasets import load_dataset, DatasetDict, Split
+from datasets import load_dataset, DatasetDict, Split, list_datasets
 from datasets.features import ClassLabel, Sequence, Translation, TranslationVariableLanguages
 from datasets.features import Value
@@ -43,15 +44,15 @@ class HuggingfaceDatasetReader(DatasetReader):
     # Parameters

     dataset_name : `str`
-        Name of the dataset from huggingface datasets the reader will be used for
-    config_name : `str`, optional (default=`None`)
-        Configuration(mandatory for some datasets) of the dataset
-    pre_load : `bool`, optional (default=' False`)
+        Name of the dataset from huggingface datasets the reader will be used for.
+    config_name : `str`, optional (default=`None`)
+        Configuration (mandatory for some datasets) of the dataset.
+    preload : `bool`, optional (default=`False`)
         If `True`, all splits of the dataset are loaded (including download etc.) as part of initialization,
-        otherwise each split is loaded on when `read()` is used for the same for the first time
-    tokenizer : `Tokenizer`, optional (default=`None`)
-        If specified is used for tokenization of string and text fields from the dataset
-        This is useful since Text in allennlp is dealt with as a series of tokens.
+        otherwise each split is loaded when `read()` is first called for it.
+    tokenizer : `Tokenizer`, optional (default=`None`)
+        If specified, it is used to tokenize string and text fields from the dataset.
+        This is useful since text in allennlp is dealt with as a series of tokens.
     """

     SUPPORTED_SPLITS = [Split.TRAIN, Split.TEST, Split.VALIDATION]
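For orientation, here is a minimal usage sketch of the reader as it looks after this change; the dataset name "squad" and the "train" split string are illustrative assumptions, not taken from the diff:

    # Hypothetical usage of the reader documented above.
    reader = HuggingfaceDatasetReader(dataset_name="squad", preload=False)
    # With preload=False, each split is downloaded lazily on the first read() call for it.
    for instance in reader.read("train"):
        print(instance)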
@@ -60,7 +61,7 @@ def __init__(
         self,
         dataset_name: str = None,
         config_name: Optional[str] = None,
-        pre_load: Optional[bool] = False,
+        preload: Optional[bool] = False,
         tokenizer: Optional[Tokenizer] = None,
         **kwargs,
     ) -> None:
@@ -71,17 +72,17 @@ def __init__(
         )

         # It would be cleaner to create a separate reader object for a different dataset
-        if dataset_name not in load_dataset():
-            raise NotImplementedError(
+        if dataset_name not in list_datasets():
+            raise ValueError(
                 f"Dataset {dataset_name} does not seem to be available in huggingface datasets"
             )
         self.dataset: DatasetDict = DatasetDict()
         self.dataset_name = dataset_name
         self.config_name = config_name
         self.tokenizer = tokenizer

-        if pre_load:
-            load_dataset()
+        if preload:
+            self.load_dataset()

     def load_dataset(self):
         if self.config_name is not None:
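The old guard called `load_dataset()` (which downloads a dataset) where a membership test over dataset names was intended; `list_datasets()` returns exactly that. A short illustration of the two huggingface `datasets` calls, with "squad" as an example name only:

    from datasets import list_datasets, load_dataset

    names = list_datasets()       # identifiers of all datasets on the hub
    assert "squad" in names
    data = load_dataset("squad")  # downloads the data, returns a DatasetDict of splits
    print(data["train"][0])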
@@ -152,7 +153,7 @@ def text_to_instance(self, *inputs) -> Instance:
         # TODO we need to support all the different dataset features described
         # in https://huggingface.co/docs/datasets/features.html
         for feature in features:
-            fields_to_be_added = dict[str, Field]()
+            fields_to_be_added: typing.Dict[str, Field] = dict()
             item_field: Field
             field_list: list
             value = features[feature]
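The reason for this substitution is not stated in the diff, but it matches Python version compatibility: subscripting builtin container types, as in `dict[str, Field]`, only works on Python 3.9+ (PEP 585), while the `typing` aliases work on older interpreters as well. A self-contained illustration:

    import typing

    # Python 3.9+ only (PEP 585); on 3.8 this raises
    # TypeError: 'type' object is not subscriptable
    #     counts = dict[str, int]()

    # Portable spelling, valid since Python 3.5:
    counts: typing.Dict[str, int] = dict()
    counts["tokens"] = 3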
@@ -188,21 +189,21 @@ def text_to_instance(self, *inputs) -> Instance:
             # We do not know if the string is a token or text, we will assume text and make each a TextField
             # datasets.features.Sequence of strings maps to ListField of TextField
             if value.feature.dtype == "string":
-                field_list = list[TextField]()
+                field_list2: typing.List[TextField] = list()
                 for item in inputs[1][feature]:
                     # If a tokenizer is provided we will use it to split the text into tokens
                     # Else put the whole text in as a single token
-                    tokens: list[Token]
+                    tokens: typing.List[Token]
                     if self.tokenizer is not None:
                         tokens = self.tokenizer.tokenize(item)

                     else:
                         tokens = [Token(item)]

                     item_field = TextField(tokens)
-                    field_list.append(item_field)
+                    field_list2.append(item_field)

-                fields_to_be_added[feature] = ListField(field_list)
+                fields_to_be_added[feature] = ListField(field_list2)

             # datasets Sequence of ClassLabels maps to ListField of LabelField
             elif isinstance(value.feature, ClassLabel):
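To make the output shape concrete, here is a standalone sketch of what the string branch builds for a sequence-of-strings feature; the sentence list is invented, while the allennlp classes are the ones imported at the top of this file:

    from allennlp.data import Token
    from allennlp.data.fields import ListField, TextField

    sentences = ["a red fox", "jumped over the fence"]
    # With no tokenizer configured, each whole string becomes a single Token,
    # and the sequence maps to a ListField of TextFields.
    fields = ListField([TextField([Token(s)]) for s in sentences])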