--- a/allennlp/data/dataset_readers/atis.py
+++ b/allennlp/data/dataset_readers/atis.py
@@ -1,157 +1,6 @@
-import json
-from typing import Dict, List
-import logging
+# pylint: disable=unused-import
+import warnings
+from allennlp.data.dataset_readers.semantic_parsing.atis import AtisDatasetReader
 
-from overrides import overrides
-from parsimonious.exceptions import ParseError
-
-from allennlp.common.file_utils import cached_path
-from allennlp.data.dataset_readers.dataset_reader import DatasetReader
-from allennlp.data.fields import Field, ArrayField, ListField, IndexField, \
-        ProductionRuleField, TextField, MetadataField
-from allennlp.data.instance import Instance
-from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
-from allennlp.data.tokenizers import Tokenizer, WordTokenizer
-from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
-
-from allennlp.semparse.worlds.atis_world import AtisWorld
-
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
-
-def _lazy_parse(text: str):
-    for interaction in text.split("\n"):
-        if interaction:
-            yield json.loads(interaction)
-
-@DatasetReader.register("atis")
-class AtisDatasetReader(DatasetReader):
-    # pylint: disable=line-too-long
-    """
-    This ``DatasetReader`` takes json files and converts them into ``Instances`` for the
-    ``AtisSemanticParser``.
-
-    Each line in the file is a JSON object that represent an interaction in the ATIS dataset
-    that has the following keys and values:
-    ```
-    "id": The original filepath in the LDC corpus
-    "interaction": <list where each element represents a turn in the interaction>
-    "scenario": A code that refers to the scenario that served as the prompt for this interaction
-    "ut_date": Date of the interaction
-    "zc09_path": Path that was used in the original paper `Learning Context-Dependent Mappings from
-    Sentences to Logical Form
-    <https://www.semanticscholar.org/paper/Learning-Context-Dependent-Mappings-from-Sentences-Zettlemoyer-Collins/44a8fcee0741139fa15862dc4b6ce1e11444878f>`_ by Zettlemoyer and Collins (ACL/IJCNLP 2009)
-    ```
-
-    Each element in the ``interaction`` list has the following keys and values:
-    ```
-    "utterance": Natural language input
-    "sql": A list of SQL queries that the utterance maps to, it could be multiple SQL queries
-    or none at all.
-    ```
-
-    Parameters
-    ----------
-    token_indexers : ``Dict[str, TokenIndexer]``, optional
-        Token indexers for the utterances. Will default to ``{"tokens": SingleIdTokenIndexer()}``.
-    lazy : ``bool`` (optional, default=False)
-        Passed to ``DatasetReader``. If this is ``True``, training will start sooner, but will
-        take longer per batch.
-    tokenizer : ``Tokenizer``, optional
-        Tokenizer to use for the utterances. Will default to ``WordTokenizer()`` with Spacy's tagger
-        enabled.
-    database_directory : ``str``, optional
-        The directory to find the sqlite database file. We query the sqlite database to find the strings
-        that are allowed.
-    """
-    def __init__(self,
-                 token_indexers: Dict[str, TokenIndexer] = None,
-                 lazy: bool = False,
-                 tokenizer: Tokenizer = None,
-                 database_directory: str = None) -> None:
-        super().__init__(lazy)
-        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
-        self._tokenizer = tokenizer or WordTokenizer(SpacyWordSplitter(pos_tags=True))
-        self._database_directory = database_directory
-
-    @overrides
-    def _read(self, file_path: str):
-        # if `file_path` is a URL, redirect to the cache
-        file_path = cached_path(file_path)
-
-        with open(file_path) as atis_file:
-            logger.info("Reading ATIS instances from dataset at : %s", file_path)
-            for line in _lazy_parse(atis_file.read()):
-                utterances = []
-                for current_interaction in line['interaction']:
-                    if not current_interaction['utterance']:
-                        continue
-                    utterances.append(current_interaction['utterance'])
-                    instance = self.text_to_instance(utterances, current_interaction['sql'])
-                    if not instance:
-                        continue
-                    yield instance
-
-    @overrides
-    def text_to_instance(self,  # type: ignore
-                         utterances: List[str],
-                         sql_query: str = None) -> Instance:
-        # pylint: disable=arguments-differ
-        """
-        Parameters
-        ----------
-        utterances: ``List[str]``, required.
-            List of utterances in the interaction, the last element is the current utterance.
-        sql_query: ``str``, optional
-            The SQL query, given as label during training or validation.
-        """
-        utterance = utterances[-1]
-        action_sequence: List[str] = []
-
-        if not utterance:
-            return None
-
-        world = AtisWorld(utterances=utterances,
-                          database_directory=self._database_directory)
-
-        if sql_query:
-            try:
-                action_sequence = world.get_action_sequence(sql_query)
-            except ParseError:
-                logger.debug(f'Parsing error')
-
-        tokenized_utterance = self._tokenizer.tokenize(utterance.lower())
-        utterance_field = TextField(tokenized_utterance, self._token_indexers)
-
-        production_rule_fields: List[Field] = []
-
-        for production_rule in world.all_possible_actions():
-            lhs, _ = production_rule.split(' ->')
-            is_global_rule = not lhs in ['number', 'string']
-            # The whitespaces are not semantically meaningful, so we filter them out.
-            production_rule = ' '.join([token for token in production_rule.split(' ') if token != 'ws'])
-            field = ProductionRuleField(production_rule, is_global_rule)
-            production_rule_fields.append(field)
-
-        action_field = ListField(production_rule_fields)
-        action_map = {action.rule: i  # type: ignore
-                      for i, action in enumerate(action_field.field_list)}
-        index_fields: List[Field] = []
-        world_field = MetadataField(world)
-        fields = {'utterance' : utterance_field,
-                  'actions' : action_field,
-                  'world' : world_field,
-                  'linking_scores' : ArrayField(world.linking_scores)}
-
-        if sql_query:
-            if action_sequence:
-                for production_rule in action_sequence:
-                    index_fields.append(IndexField(action_map[production_rule], action_field))
-
-                action_sequence_field: List[Field] = []
-                action_sequence_field.append(ListField(index_fields))
-                fields['target_action_sequence'] = ListField(action_sequence_field)
-            else:
-                # If we are given a SQL query, but we are unable to parse it, then we will skip it.
-                return None
-
-        return Instance(fields)
+warnings.warn("allennlp.data.dataset_readers.atis.* has been moved. "
+              "Please use allennlp.data.dataset_readers.semantic_parsing.atis.*", FutureWarning)
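The replacement module is a deprecation shim: the old import path keeps working by re-exporting ``AtisDatasetReader`` from its new home, while warning callers to migrate. A minimal sketch of what downstream code sees; the ``catch_warnings`` harness here is illustrative and not part of this commit:

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # The warn() call runs at module import time, so this only fires the
        # first time the shim module is imported in a process.
        from allennlp.data.dataset_readers.atis import AtisDatasetReader

    assert any(issubclass(w.category, FutureWarning) for w in caught)

Using ``FutureWarning`` rather than ``DeprecationWarning`` means the message is visible to end users by default, since Python suppresses ``DeprecationWarning`` outside of developer-facing contexts.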
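The moved reader keeps the interface documented in the docstring above: pass ``token_indexers``, ``tokenizer``, and ``database_directory`` at construction, then read a JSON-lines file of interactions. A hypothetical usage sketch against the new path; the file and database locations and the example utterance are placeholders, not values from this commit:

    from allennlp.data.dataset_readers.semantic_parsing.atis import AtisDatasetReader

    # Placeholder paths: point these at a real ATIS json-lines file and the
    # directory containing the ATIS sqlite database.
    reader = AtisDatasetReader(database_directory='/path/to/atis/database')
    instances = reader.read('/path/to/atis/train.json')

    # text_to_instance can also be called directly with the utterance history;
    # without a sql_query label, no 'target_action_sequence' field is added.
    instance = reader.text_to_instance(['show me flights from boston to denver'])

``_read`` yields one ``Instance`` per non-empty utterance, passing the growing utterance list to ``text_to_instance`` so that later turns see their full interaction context.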