
Commit 4c99f8e

Text2sql reader (#1738)
- Moves all semantic parsing dataset readers into their own folder.
- Adds a dataset reader for the text2sql baseline which can read any of the 8 datasets. I also refactored the SQL utils a bit to read from my new directory format, for which I added a script in the previous PR. This includes functionality to de-duplicate the questions in a given dataset, not just the SQL.

This PR looks massive, but I only added `template_text2sql.py` and modified `text2sql_utils.py`; all the rest is just moving folders around and adding deprecation warnings.
1 parent 8867f2f

21 files changed: +1082 -892 lines
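The headline functional change is question de-duplication in the text2sql utilities. As a rough sketch of how the refactored `process_sql_data` might be driven (the dataset path and the structure of the loaded JSON are assumptions based on the `text2sql_utils.py` diff below, not part of this commit):

```python
import json

from allennlp.data.dataset_readers.dataset_utils.text2sql_utils import process_sql_data

# Hypothetical path: any of the 8 datasets reformatted by
# scripts/reformat_text2sql_data.py (added in the previous PR) is assumed to
# load as a list of examples with "sentences", "sql" and "variables" keys.
with open("data/text2sql/geography.json") as data_file:
    data = json.load(data_file)

# use_all_queries=False (the default) drops duplicate questions for a given
# SQL query; use_all_sql=False keeps only the first of the semantically
# equivalent SQL queries listed for each sentence.
for sql_data in process_sql_data(data, use_all_sql=False, use_all_queries=False):
    print(sql_data.text, sql_data.variable_tags, sql_data.sql)
```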

allennlp/data/dataset_readers/__init__.py (+2 -3)
@@ -7,15 +7,13 @@
 """
 
 # pylint: disable=line-too-long
-from allennlp.data.dataset_readers.atis import AtisDatasetReader
 from allennlp.data.dataset_readers.ccgbank import CcgBankDatasetReader
 from allennlp.data.dataset_readers.conll2003 import Conll2003DatasetReader
 from allennlp.data.dataset_readers.conll2000 import Conll2000DatasetReader
 from allennlp.data.dataset_readers.ontonotes_ner import OntonotesNamedEntityRecognition
 from allennlp.data.dataset_readers.coreference_resolution import ConllCorefReader, WinobiasReader
 from allennlp.data.dataset_readers.dataset_reader import DatasetReader
 from allennlp.data.dataset_readers.language_modeling import LanguageModelingReader
-from allennlp.data.dataset_readers.nlvr import NlvrDatasetReader
 from allennlp.data.dataset_readers.penn_tree_bank import PennTreeBankConstituencySpanDatasetReader
 from allennlp.data.dataset_readers.reading_comprehension import SquadReader, TriviaQaReader, QuACReader
 from allennlp.data.dataset_readers.semantic_role_labeling import SrlReader
@@ -25,5 +23,6 @@
 from allennlp.data.dataset_readers.universal_dependencies import UniversalDependenciesDatasetReader
 from allennlp.data.dataset_readers.stanford_sentiment_tree_bank import (
         StanfordSentimentTreeBankDatasetReader)
-from allennlp.data.dataset_readers.wikitables import WikiTablesDatasetReader
 from allennlp.data.dataset_readers.quora_paraphrase import QuoraParaphraseDatasetReader
+from allennlp.data.dataset_readers.semantic_parsing import (
+        WikiTablesDatasetReader, AtisDatasetReader, NlvrDatasetReader, TemplateText2SqlDatasetReader)
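The package-root import surface is unchanged by the move, so downstream code that imports the readers from `allennlp.data.dataset_readers` should keep working. A minimal sketch:

```python
# These imports now resolve through the new semantic_parsing subpackage,
# as wired up in the __init__.py diff above.
from allennlp.data.dataset_readers import (
    AtisDatasetReader,
    NlvrDatasetReader,
    TemplateText2SqlDatasetReader,
    WikiTablesDatasetReader,
)
```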

allennlp/data/dataset_readers/atis.py (+5 -156)
@@ -1,157 +1,6 @@
-import json
-from typing import Dict, List
-import logging
+# pylint: disable=unused-import
+import warnings
+from allennlp.data.dataset_readers.semantic_parsing.atis import AtisDatasetReader
 
-from overrides import overrides
-from parsimonious.exceptions import ParseError
-
-from allennlp.common.file_utils import cached_path
-from allennlp.data.dataset_readers.dataset_reader import DatasetReader
-from allennlp.data.fields import Field, ArrayField, ListField, IndexField, \
-        ProductionRuleField, TextField, MetadataField
-from allennlp.data.instance import Instance
-from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
-from allennlp.data.tokenizers import Tokenizer, WordTokenizer
-from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
-
-from allennlp.semparse.worlds.atis_world import AtisWorld
-
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
-
-def _lazy_parse(text: str):
-    for interaction in text.split("\n"):
-        if interaction:
-            yield json.loads(interaction)
-
-@DatasetReader.register("atis")
-class AtisDatasetReader(DatasetReader):
-    # pylint: disable=line-too-long
-    """
-    This ``DatasetReader`` takes json files and converts them into ``Instances`` for the
-    ``AtisSemanticParser``.
-
-    Each line in the file is a JSON object that represent an interaction in the ATIS dataset
-    that has the following keys and values:
-    ```
-    "id": The original filepath in the LDC corpus
-    "interaction": <list where each element represents a turn in the interaction>
-    "scenario": A code that refers to the scenario that served as the prompt for this interaction
-    "ut_date": Date of the interaction
-    "zc09_path": Path that was used in the original paper `Learning Context-Dependent Mappings from
-    Sentences to Logical Form
-    <https://www.semanticscholar.org/paper/Learning-Context-Dependent-Mappings-from-Sentences-Zettlemoyer-Collins/44a8fcee0741139fa15862dc4b6ce1e11444878f>'_ by Zettlemoyer and Collins (ACL/IJCNLP 2009)
-    ```
-
-    Each element in the ``interaction`` list has the following keys and values:
-    ```
-    "utterance": Natural language input
-    "sql": A list of SQL queries that the utterance maps to, it could be multiple SQL queries
-    or none at all.
-    ```
-
-    Parameters
-    ----------
-    token_indexers : ``Dict[str, TokenIndexer]``, optional
-        Token indexers for the utterances. Will default to ``{"tokens": SingleIdTokenIndexer()}``.
-    lazy : ``bool`` (optional, default=False)
-        Passed to ``DatasetReader``. If this is ``True``, training will start sooner, but will
-        take longer per batch.
-    tokenizer : ``Tokenizer``, optional
-        Tokenizer to use for the utterances. Will default to ``WordTokenizer()`` with Spacy's tagger
-        enabled.
-    database_directory : ``str``, optional
-        The directory to find the sqlite database file. We query the sqlite database to find the strings
-        that are allowed.
-    """
-    def __init__(self,
-                 token_indexers: Dict[str, TokenIndexer] = None,
-                 lazy: bool = False,
-                 tokenizer: Tokenizer = None,
-                 database_directory: str = None) -> None:
-        super().__init__(lazy)
-        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
-        self._tokenizer = tokenizer or WordTokenizer(SpacyWordSplitter(pos_tags=True))
-        self._database_directory = database_directory
-
-    @overrides
-    def _read(self, file_path: str):
-        # if `file_path` is a URL, redirect to the cache
-        file_path = cached_path(file_path)
-
-        with open(file_path) as atis_file:
-            logger.info("Reading ATIS instances from dataset at : %s", file_path)
-            for line in _lazy_parse(atis_file.read()):
-                utterances = []
-                for current_interaction in line['interaction']:
-                    if not current_interaction['utterance']:
-                        continue
-                    utterances.append(current_interaction['utterance'])
-                    instance = self.text_to_instance(utterances, current_interaction['sql'])
-                    if not instance:
-                        continue
-                    yield instance
-
-    @overrides
-    def text_to_instance(self,  # type: ignore
-                         utterances: List[str],
-                         sql_query: str = None) -> Instance:
-        # pylint: disable=arguments-differ
-        """
-        Parameters
-        ----------
-        utterances: ``List[str]``, required.
-            List of utterances in the interaction, the last element is the current utterance.
-        sql_query: ``str``, optional
-            The SQL query, given as label during training or validation.
-        """
-        utterance = utterances[-1]
-        action_sequence: List[str] = []
-
-        if not utterance:
-            return None
-
-        world = AtisWorld(utterances=utterances,
-                          database_directory=self._database_directory)
-
-        if sql_query:
-            try:
-                action_sequence = world.get_action_sequence(sql_query)
-            except ParseError:
-                logger.debug(f'Parsing error')
-
-        tokenized_utterance = self._tokenizer.tokenize(utterance.lower())
-        utterance_field = TextField(tokenized_utterance, self._token_indexers)
-
-        production_rule_fields: List[Field] = []
-
-        for production_rule in world.all_possible_actions():
-            lhs, _ = production_rule.split(' ->')
-            is_global_rule = not lhs in ['number', 'string']
-            # The whitespaces are not semantically meaningful, so we filter them out.
-            production_rule = ' '.join([token for token in production_rule.split(' ') if token != 'ws'])
-            field = ProductionRuleField(production_rule, is_global_rule)
-            production_rule_fields.append(field)
-
-        action_field = ListField(production_rule_fields)
-        action_map = {action.rule: i  # type: ignore
-                      for i, action in enumerate(action_field.field_list)}
-        index_fields: List[Field] = []
-        world_field = MetadataField(world)
-        fields = {'utterance' : utterance_field,
-                  'actions' : action_field,
-                  'world' : world_field,
-                  'linking_scores' : ArrayField(world.linking_scores)}
-
-        if sql_query:
-            if action_sequence:
-                for production_rule in action_sequence:
-                    index_fields.append(IndexField(action_map[production_rule], action_field))
-
-                action_sequence_field: List[Field] = []
-                action_sequence_field.append(ListField(index_fields))
-                fields['target_action_sequence'] = ListField(action_sequence_field)
-            else:
-                # If we are given a SQL query, but we are unable to parse it, then we will skip it.
-                return None
-
-        return Instance(fields)
+
+warnings.warn("allennlp.data.dataset_readers.atis.* has been moved."
+              "Please use allennlp.data.dataset_reader.semantic_parsing.atis.*", FutureWarning)

allennlp/data/dataset_readers/dataset_utils/text2sql_utils.py (+49 -30)
@@ -3,7 +3,7 @@
 Utility functions for reading the standardised text2sql datasets presented in
 `"Improving Text to SQL Evaluation Methodology" <https://arxiv.org/abs/1806.09029>`_
 """
-from typing import List, Dict, NamedTuple, Iterable
+from typing import List, Dict, NamedTuple, Iterable, Tuple, Set
 
 from allennlp.common import JsonDict
 
@@ -19,6 +19,10 @@ class SqlData(NamedTuple):
     text_with_variables : ``List[str]``
         The tokens in the text of the query with variables
        mapped to table names/abstract variables.
+    variable_tags : ``List[str]``
+        Labels for each word in ``text`` which correspond to
+        which variable in the sql the token is linked to. "O"
+        is used to denote no tag.
     sql : ``List[str]``
         The tokens in the SQL query which corresponds to the text.
     text_variables : ``Dict[str, str]``
@@ -28,24 +32,28 @@ class SqlData(NamedTuple):
     """
     text: List[str]
     text_with_variables: List[str]
+    variable_tags: List[str]
     sql: List[str]
     text_variables: Dict[str, str]
     sql_variables: Dict[str, str]
 
 
 def replace_variables(sentence: List[str],
-                      sentence_variables: Dict[str, str]) -> List[str]:
+                      sentence_variables: Dict[str, str]) -> Tuple[List[str], List[str]]:
     """
     Replaces abstract variables in text with their concrete counterparts.
     """
     tokens = []
+    tags = []
     for token in sentence:
         if token not in sentence_variables:
            tokens.append(token)
+            tags.append("O")
         else:
             for word in sentence_variables[token].split():
                 tokens.append(word)
-    return tokens
+                tags.append(token)
+    return tokens, tags
 
 def clean_and_split_sql(sql: str) -> List[str]:
     """
@@ -63,10 +71,11 @@ def clean_and_split_sql(sql: str) -> List[str]:
     return sql_tokens
 
 
-def process_sql_data_blob(data: JsonDict,
-                          use_all_sql: bool = False) -> Iterable[SqlData]:
+def process_sql_data(data: List[JsonDict],
+                     use_all_sql: bool = False,
+                     use_all_queries: bool = False) -> Iterable[SqlData]:
     """
-    A utility function for reading in text2sql data blobs. The blob is
+    A utility function for reading in text2sql data. The blob is
     the result of loading the json from a file produced by the script
     ``scripts/reformat_text2sql_data.py``.
 
@@ -76,32 +85,42 @@ def process_sql_data_blob(data: JsonDict,
     use_all_sql : ``bool``, optional (default = False)
         Whether to use all of the sql queries which have identical semantics,
         or whether to just use the first one.
+    use_all_queries : ``bool``, (default = False)
+        Whether or not to enforce query sentence uniqueness. If false,
+        duplicated queries will occur in the dataset as separate instances,
+        as for a given SQL query, not only are there multiple queries with
+        the same template, but there are also duplicate queries.
     """
-    # TODO(Mark): currently this does not filter for duplicate _sentences_
-    # which have the same sql query. Really it should, because these instances
-    # are literally identical, so just magnify errors etc. However, doing this
-    # would make it really hard to compare to previous work. Sad times.
-    for sent_info in data['sentences']:
-        # Loop over the different sql statements with "equivalent" semantics
-        for sql in data["sql"]:
-            sql_variables = {}
-            for variable in data['variables']:
-                sql_variables[variable['name']] = variable['example']
+    for example in data:
+        seen_sentences: Set[str] = set()
+        for sent_info in example['sentences']:
+            # Loop over the different sql statements with "equivalent" semantics
+            for sql in example["sql"]:
+                text_with_variables = sent_info['text'].strip().split()
+                text_vars = sent_info['variables']
 
-            text_with_variables = sent_info['text'].strip().split()
-            text_vars = sent_info['variables']
+                query_tokens, tags = replace_variables(text_with_variables, text_vars)
+                if not use_all_queries:
+                    key = " ".join(query_tokens)
+                    if key in seen_sentences:
+                        continue
+                    else:
+                        seen_sentences.add(key)
 
-            query_tokens = replace_variables(text_with_variables, text_vars)
-            sql_tokens = clean_and_split_sql(sql)
+                sql_tokens = clean_and_split_sql(sql)
+                sql_variables = {}
+                for variable in example['variables']:
+                    sql_variables[variable['name']] = variable['example']
 
-            sql_data = SqlData(text=query_tokens,
-                               text_with_variables=text_with_variables,
-                               sql=sql_tokens,
-                               text_variables=text_vars,
-                               sql_variables=sql_variables)
-            yield sql_data
+                sql_data = SqlData(text=query_tokens,
+                                   text_with_variables=text_with_variables,
+                                   variable_tags=tags,
+                                   sql=sql_tokens,
+                                   text_variables=text_vars,
+                                   sql_variables=sql_variables)
+                yield sql_data
 
-        # Some questions might have multiple equivalent SQL statements.
-        # By default, we just use the first one. TODO(Mark): Use the shortest?
-        if not use_all_sql:
-            break
+            # Some questions might have multiple equivalent SQL statements.
+            # By default, we just use the first one. TODO(Mark): Use the shortest?
+            if not use_all_sql:
+                break
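The new `variable_tags` come from `replace_variables`, which now returns one tag per output token alongside the de-abstracted text. A small illustration of the behaviour in the diff above (the sentence and the variable name `city_name0` are made up for this example, not taken from the commit):

```python
from allennlp.data.dataset_readers.dataset_utils.text2sql_utils import replace_variables

tokens, tags = replace_variables(
    ["how", "many", "flights", "go", "to", "city_name0"],
    {"city_name0": "new york"},
)
# tokens == ["how", "many", "flights", "go", "to", "new", "york"]
# tags   == ["O", "O", "O", "O", "O", "city_name0", "city_name0"]
```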
