Skip to content
This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Commit 27fab84

Browse files
authored
Add more configuration options for ATIS semantic parser (#1821)
* costs * add keep if unparseable flag * keep unparseable in dev * pylint * add one direction cost * fix test * retrain fixture * docs * read unparseable * add concat based context * fix imports * heuristics * heuristics * unparseable queries * turn off info logging in subprocess * fix flight numbers * fix dates * fix tokenization * heuristics * more epochs * reverse the productions, predict the leftmost nonterminal first * left first model 44 * add helper functions for numbers * fix global rules test * fix tests * clean up * grammar statelet test * dates * rename * pylint * remove debug tests * retrain fixture * fix action sequence test * change text2sql test to also left first * add another test extraction test * experiment config * user numeric nonterminals * docs
1 parent dc66c8f commit 27fab84

File tree

19 files changed

+786
-387
lines changed

19 files changed

+786
-387
lines changed

allennlp/data/dataset_readers/semantic_parsing/atis.py

+18-8
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,12 @@
1616
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
1717

1818
from allennlp.semparse.worlds.atis_world import AtisWorld
19+
from allennlp.semparse.contexts.atis_sql_table_context import NUMERIC_NONTERMINALS
1920

2021
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
2122

23+
END_OF_UTTERANCE_TOKEN = "@@EOU@@"
24+
2225
def _lazy_parse(text: str):
2326
for interaction in text.split("\n"):
2427
if interaction:
@@ -63,18 +66,22 @@ class AtisDatasetReader(DatasetReader):
6366
database_file: ``str``, optional
6467
The directory to find the sqlite database file. We query the sqlite database to find the strings
6568
that are allowed.
69+
num_turns_to_concatenate: ``int``, optional
70+
The number of utterances to concatenate as the conversation context.
6671
"""
6772
def __init__(self,
6873
token_indexers: Dict[str, TokenIndexer] = None,
74+
keep_if_unparseable: bool = False,
6975
lazy: bool = False,
7076
tokenizer: Tokenizer = None,
71-
database_file: str = None) -> None:
77+
database_file: str = None,
78+
num_turns_to_concatenate: int = 1) -> None:
7279
super().__init__(lazy)
80+
self._keep_if_unparseable = keep_if_unparseable
7381
self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
7482
self._tokenizer = tokenizer or WordTokenizer(SpacyWordSplitter())
7583
self._database_file = database_file
76-
# TODO(kevin): Add a keep_unparseable_utterances flag so that during validation, we do not skip queries that
77-
# cannot be parsed.
84+
self._num_turns_to_concatenate = num_turns_to_concatenate
7885

7986
@overrides
8087
def _read(self, file_path: str):
@@ -108,6 +115,9 @@ def text_to_instance(self, # type: ignore
108115
sql_query_labels: ``List[str]``, optional
109116
The SQL queries that are given as labels during training or validation.
110117
"""
118+
if self._num_turns_to_concatenate:
119+
utterances[-1] = f' {END_OF_UTTERANCE_TOKEN} '.join(utterances[-self._num_turns_to_concatenate:])
120+
111121
utterance = utterances[-1]
112122
action_sequence: List[str] = []
113123

@@ -149,21 +159,21 @@ def text_to_instance(self, # type: ignore
149159

150160
if sql_query_labels != None:
151161
fields['sql_queries'] = MetadataField(sql_query_labels)
152-
if action_sequence:
162+
if action_sequence and not self._keep_if_unparseable:
153163
for production_rule in action_sequence:
154164
index_fields.append(IndexField(action_map[production_rule], action_field))
155-
156165
action_sequence_field = ListField(index_fields)
157166
fields['target_action_sequence'] = action_sequence_field
158-
else:
159-
# If we are given a SQL query, but we are unable to parse it, then we will skip it.
167+
elif not self._keep_if_unparseable:
168+
# If we are given a SQL query, but we are unable to parse it, and we do not specify explicitly
169+
# to keep it, then we will skip it.
160170
return None
161171

162172
return Instance(fields)
163173

164174
@staticmethod
165175
def _is_global_rule(nonterminal: str) -> bool:
166-
if nonterminal in ['number', 'time_range_start', 'time_range_end']:
176+
if nonterminal in NUMERIC_NONTERMINALS:
167177
return False
168178
elif nonterminal.endswith('string'):
169179
return False

allennlp/models/semantic_parsing/atis/atis_semantic_parser.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from allennlp.modules import Attention, Seq2SeqEncoder, TextFieldEmbedder, Embedding
1515
from allennlp.nn import util
1616
from allennlp.semparse.worlds import AtisWorld
17+
from allennlp.semparse.contexts.atis_sql_table_context import NUMERIC_NONTERMINALS
1718
from allennlp.semparse.contexts.sql_context_utils import action_sequence_to_sql
1819
from allennlp.state_machines.states import GrammarBasedState
1920
from allennlp.state_machines.transition_functions.linking_transition_function import LinkingTransitionFunction
@@ -326,7 +327,8 @@ def _get_type_vector(worlds: List[AtisWorld],
326327
for batch_index, world in enumerate(worlds):
327328
types = []
328329
entities = [('number', entity)
329-
if 'number' or 'time_range' in entity
330+
if any([entity.startswith(numeric_nonterminal)
331+
for numeric_nonterminal in NUMERIC_NONTERMINALS])
330332
else ('string', entity)
331333
for entity in world.entities]
332334

@@ -475,8 +477,7 @@ def _create_grammar_state(self,
475477

476478
return GrammarStatelet(['statement'],
477479
translated_valid_actions,
478-
self.is_nonterminal,
479-
reverse_productions=False)
480+
self.is_nonterminal)
480481

481482
@overrides
482483
def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:

allennlp/semparse/contexts/atis_sql_table_context.py

+15-2
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,18 @@
3232
GRAMMAR_DICTIONARY = {}
3333
GRAMMAR_DICTIONARY['statement'] = ['query ws ";" ws']
3434
GRAMMAR_DICTIONARY['query'] = ['(ws "(" ws "SELECT" ws distinct ws select_results ws '
35+
'"FROM" ws table_refs ws where_clause ws group_by_clause ws ")" ws)',
36+
'(ws "(" ws "SELECT" ws distinct ws select_results ws '
3537
'"FROM" ws table_refs ws where_clause ws ")" ws)',
3638
'(ws "SELECT" ws distinct ws select_results ws '
3739
'"FROM" ws table_refs ws where_clause ws)']
3840
GRAMMAR_DICTIONARY['select_results'] = ['col_refs', 'agg']
39-
GRAMMAR_DICTIONARY['agg'] = ['agg_func ws "(" ws col_ref ws ")"']
41+
GRAMMAR_DICTIONARY['agg'] = ['( agg_func ws "(" ws col_ref ws ")" )', '(agg_func ws "(" ws col ws ")" )']
4042
GRAMMAR_DICTIONARY['agg_func'] = ['"MIN"', '"min"', '"MAX"', '"max"', '"COUNT"', '"count"']
4143
GRAMMAR_DICTIONARY['col_refs'] = ['(col_ref ws "," ws col_refs)', '(col_ref)']
4244
GRAMMAR_DICTIONARY['table_refs'] = ['(table_name ws "," ws table_refs)', '(table_name)']
4345
GRAMMAR_DICTIONARY['where_clause'] = ['("WHERE" ws "(" ws conditions ws ")" ws)', '("WHERE" ws conditions ws)']
46+
GRAMMAR_DICTIONARY['group_by_clause'] = ['("GROUP" ws "BY" ws col_ref)']
4447
GRAMMAR_DICTIONARY['conditions'] = ['(condition ws conj ws conditions)',
4548
'(condition ws conj ws "(" ws conditions ws ")")',
4649
'("(" ws conditions ws ")" ws conj ws conditions)',
@@ -71,6 +74,10 @@
7174
KEYWORDS = ['"SELECT"', '"FROM"', '"MIN"', '"MAX"', '"COUNT"', '"WHERE"', '"NOT"', '"IN"', '"LIKE"',
7275
'"IS"', '"BETWEEN"', '"AND"', '"ALL"', '"ANY"', '"NULL"', '"OR"', '"DISTINCT"']
7376

77+
NUMERIC_NONTERMINALS = ['number', 'time_range_start', 'time_range_end',
78+
'fare_round_trip_cost', 'fare_one_direction_cost',
79+
'flight_number', 'day_number', 'month_number', 'year_number']
80+
7481
class AtisSqlTableContext:
7582
"""
7683
An ``AtisSqlTableContext`` represents the SQL context with a grammar of SQL and the valid actions
@@ -123,11 +130,14 @@ def create_grammar_dict_and_strings(self) -> Tuple[Dict[str, List[str]], List[Tu
123130
grammar_dictionary['table_name'] = \
124131
sorted([f'"{table}"'
125132
for table in list(self.all_tables.keys())], reverse=True)
126-
grammar_dictionary['col_ref'] = ['"*"']
133+
grammar_dictionary['col_ref'] = ['"*"', 'agg']
134+
all_columns = []
127135
for table, columns in self.all_tables.items():
128136
grammar_dictionary['col_ref'].extend([f'("{table}" ws "." ws "{column}")'
129137
for column in columns])
138+
all_columns.extend(columns)
130139
grammar_dictionary['col_ref'] = sorted(grammar_dictionary['col_ref'], reverse=True)
140+
grammar_dictionary['col'] = sorted([f'"{column}"' for column in all_columns], reverse=True)
131141

132142
biexprs = []
133143
if self.tables_with_strings:
@@ -138,6 +148,9 @@ def create_grammar_dict_and_strings(self) -> Tuple[Dict[str, List[str]], List[Tu
138148
self.cursor.execute(f'SELECT DISTINCT {table} . {column} FROM {table}')
139149
results = self.cursor.fetchall()
140150

151+
# Almost all the query values are in the database, we hardcode the rare case here.
152+
if table == 'flight' and column == 'airline_code':
153+
results.append(('EA',))
141154
strings_list.extend([(format_action(f"{table}_{column}_string",
142155
str(row[0]),
143156
is_string=not 'number' in column,

allennlp/semparse/contexts/atis_tables.py

+85-16
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
APPROX_WORDS = ['about', 'around', 'approximately']
1616
WORDS_PRECEDING_TIME = ['at', 'between', 'to', 'before', 'after']
1717

18+
1819
def pm_map_match_to_query_value(match: str):
1920
if len(match.rstrip('pm')) < 3: # This will match something like ``5pm``.
2021
if match.startswith('12'):
@@ -82,12 +83,13 @@ def get_date_from_utterance(tokenized_utterance: List[Token],
8283
it is 1993 so we do the same here. If there is no mention of the month or day then
8384
we do not return any dates from the utterance.
8485
"""
86+
8587
dates = []
88+
8689
utterance = ' '.join([token.text for token in tokenized_utterance])
8790
year_result = re.findall(r'199[0-4]', utterance)
8891
if year_result:
8992
year = int(year_result[0])
90-
9193
trigrams = ngrams([token.text for token in tokenized_utterance], 3)
9294
for month, tens, digit in trigrams:
9395
# This will match something like ``september twenty first``.
@@ -107,6 +109,20 @@ def get_date_from_utterance(tokenized_utterance: List[Token],
107109
except ValueError:
108110
print('invalid month day')
109111

112+
fivegrams = ngrams([token.text for token in tokenized_utterance], 5)
113+
for tens, digit, _, year_match, month in fivegrams:
114+
# This will match something like ``twenty first of 1993 july``.
115+
day = ' '.join([tens, digit])
116+
if month in MONTH_NUMBERS and day in DAY_NUMBERS and year_match.isdigit():
117+
try:
118+
dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[day]))
119+
except ValueError:
120+
print('invalid month day')
121+
if month in MONTH_NUMBERS and digit in DAY_NUMBERS and year_match.isdigit():
122+
try:
123+
dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[digit]))
124+
except ValueError:
125+
print('invalid month day')
110126
return dates
111127

112128
def get_numbers_from_utterance(utterance: str, tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
@@ -189,6 +205,35 @@ def get_time_range_end_from_utterance(utterance: str, # pylint: disable=unused-a
189205

190206
return time_range_end_linking_dict
191207

208+
def get_costs_from_utterance(utterance: str, # pylint: disable=unused-argument
209+
tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
210+
dollars_indices = {index for index, token in enumerate(tokenized_utterance)
211+
if token.text == 'dollars' or token.text == 'dollar'}
212+
213+
costs_linking_dict: Dict[str, List[int]] = defaultdict(list)
214+
for token_index, token in enumerate(tokenized_utterance):
215+
if token_index + 1 in dollars_indices and token.text.isdigit():
216+
costs_linking_dict[token.text].append(token_index)
217+
return costs_linking_dict
218+
219+
def get_flight_numbers_from_utterance(utterance: str, # pylint: disable=unused-argument
220+
tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
221+
indices_words_preceding_flight_number = {index for index, token in enumerate(tokenized_utterance)
222+
if token.text in {'flight', 'number'}
223+
or token.text.upper() in AIRLINE_CODE_LIST
224+
or token.text.lower() in AIRLINE_CODES.keys()}
225+
226+
indices_words_succeeding_flight_number = {index for index, token in enumerate(tokenized_utterance)
227+
if token.text == 'flight'}
228+
229+
flight_numbers_linking_dict: Dict[str, List[int]] = defaultdict(list)
230+
for token_index, token in enumerate(tokenized_utterance):
231+
if token.text.isdigit():
232+
if token_index - 1 in indices_words_preceding_flight_number:
233+
flight_numbers_linking_dict[token.text].append(token_index)
234+
if token_index + 1 in indices_words_succeeding_flight_number:
235+
flight_numbers_linking_dict[token.text].append(token_index)
236+
return flight_numbers_linking_dict
192237

193238
def digit_to_query_time(digit: str) -> List[int]:
194239
"""
@@ -303,6 +348,7 @@ def convert_to_string_list_value_dict(trigger_dict: Dict[str, int]) -> Dict[str,
303348
'mgm': ['MG'],
304349
'midwest': ['YX'],
305350
'nation': ['NX'],
351+
'nationair': ['NX'],
306352
'northeast': ['2V'],
307353
'northwest': ['NW'],
308354
'ontario': ['GX'],
@@ -384,11 +430,14 @@ def convert_to_string_list_value_dict(trigger_dict: Dict[str, int]) -> Dict[str,
384430
GROUND_SERVICE = {'air taxi': ['AIR TAXI OPERATION'],
385431
'car': ['RENTAL CAR'],
386432
'limo': ['LIMOUSINE'],
433+
'limousine': ['LIMOUSINE'],
387434
'rapid': ['RAPID TRANSIT'],
388435
'rental': ['RENTAL CAR'],
389436
'taxi': ['TAXI']}
390437

391-
MISC_STR = {"every day" : ["DAILY"]}
438+
MISC_STR = {"every day" : ["DAILY"],
439+
"saint petersburg": ["ST. PETERSBURG"],
440+
"saint louis": ["ST. LOUIS"]}
392441

393442
DAY_NUMBERS = {'first': 1,
394443
'second': 2,
@@ -424,18 +473,27 @@ def convert_to_string_list_value_dict(trigger_dict: Dict[str, int]) -> Dict[str,
424473

425474

426475
MISC_TIME_TRIGGERS = {'lunch': ['1400'],
427-
'noon': ['1200']}
476+
'noon': ['1200'],
477+
'early evening': ['1800', '2000'],
478+
'morning': ['0', '1200'],
479+
'night': ['1800', '2400']}
428480

429481
TIME_RANGE_START_DICT = {'morning': ['0'],
482+
'mornings': ['1200'],
430483
'afternoon': ['1200'],
484+
'afternoons': ['1200'],
485+
'after noon': ['1200'],
431486
'late afternoon': ['1600'],
432487
'evening': ['1800'],
433488
'late evening': ['2000']}
434489

435490
TIME_RANGE_END_DICT = {'early morning': ['800'],
436-
'morning': ['1200'],
491+
'morning': ['1200', '800'],
492+
'mornings': ['1200', '800'],
437493
'early afternoon': ['1400'],
438494
'afternoon': ['1800'],
495+
'afternoons': ['1800'],
496+
'after noon': ['1800'],
439497
'evening': ['2200']}
440498

441499
ALL_TABLES = {'aircraft': ['aircraft_code', 'aircraft_description', 'capacity',
@@ -477,18 +535,18 @@ def convert_to_string_list_value_dict(trigger_dict: Dict[str, int]) -> Dict[str,
477535

478536
TABLES_WITH_STRINGS = {'airline' : ['airline_code', 'airline_name'],
479537
'city' : ['city_name', 'state_code', 'city_code'],
480-
'fare' : ['round_trip_required', 'fare_basis_code'],
481-
'flight' : ['airline_code', 'flight_days', 'flight_number'],
538+
'fare' : ['round_trip_required', 'fare_basis_code', 'restriction_code'],
539+
'flight' : ['airline_code', 'flight_days'],
482540
'flight_stop' : ['stop_airport'],
483-
'airport' : ['airport_code'],
484-
'state' : ['state_name'],
485-
'fare_basis' : ['fare_basis_code', 'class_type', 'economy'],
486-
'class_of_service' : ['booking_class'],
487-
'aircraft' : ['basic_type', 'manufacturer'],
541+
'airport' : ['airport_code', 'airport_name'],
542+
'state' : ['state_name', 'state_code'],
543+
'fare_basis' : ['fare_basis_code', 'class_type', 'economy', 'booking_class'],
544+
'class_of_service' : ['booking_class', 'class_description'],
545+
'aircraft' : ['basic_type', 'manufacturer', 'aircraft_code', 'propulsion'],
488546
'restriction' : ['restriction_code'],
489547
'ground_service' : ['transport_type'],
490-
'days' : ['day_name'],
491-
'food_service': ['meal_description']}
548+
'days' : ['day_name', 'days_code'],
549+
'food_service': ['meal_description', 'compartment']}
492550

493551
DAY_OF_WEEK = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']
494552

@@ -518,7 +576,10 @@ def convert_to_string_list_value_dict(trigger_dict: Dict[str, int]) -> Dict[str,
518576
'charlotte': ['CLT'],
519577
'dallas': ['DFW'],
520578
'detroit': ['DTW'],
579+
'houston': ['IAH'],
521580
'la guardia': ['LGA'],
581+
'love field': ['DAL'],
582+
'los angeles': ['LAX'],
522583
'oakland': ['OAK'],
523584
'philadelphia': ['PHL'],
524585
'pittsburgh': ['PIT'],
@@ -537,7 +598,7 @@ def convert_to_string_list_value_dict(trigger_dict: Dict[str, int]) -> Dict[str,
537598
'OK', 'DL', '9E', 'QD', 'LH', 'XJ', 'MG',
538599
'YX', 'NX', '2V', 'NW', 'RP', 'AT', 'SN',
539600
'OO', 'WN', 'TG', 'FF', '9N', 'TW', 'RZ',
540-
'UA', 'US', 'OE']
601+
'UA', 'US', 'OE', 'EA']
541602
CITIES = ['NASHVILLE', 'BOSTON', 'BURBANK', 'BALTIMORE', 'CHICAGO', 'CLEVELAND',
542603
'CHARLOTTE', 'COLUMBUS', 'CINCINNATI', 'DENVER', 'DALLAS', 'DETROIT',
543604
'FORT WORTH', 'HOUSTON', 'WESTCHESTER COUNTY', 'INDIANAPOLIS', 'NEWARK',
@@ -551,7 +612,12 @@ def convert_to_string_list_value_dict(trigger_dict: Dict[str, int]) -> Dict[str,
551612
'MATL', 'MMEM', 'MMIA', 'MMKC', 'MMKE', 'MMSP', 'NNYC', 'OOAK', 'OONT', 'OORL',
552613
'PPHL', 'PPHX', 'PPIT', 'SMSP', 'SSAN', 'SSEA', 'SSFO', 'SSJC', 'SSLC', 'SSTL',
553614
'STPA', 'TSEA', 'TTPA', 'WWAS', 'YYMQ', 'YYTO']
554-
CLASS = ['COACH', 'BUSINESS', 'FIRST', 'THRIST', 'STANDARD', 'SHUTTLE']
615+
616+
CLASS = ['COACH', 'BUSINESS', 'FIRST', 'THRIFT', 'STANDARD', 'SHUTTLE']
617+
618+
AIRCRAFT_MANUFACTURERS = ['BOEING', 'MCDONNELL DOUGLAS', 'FOKKER']
619+
620+
AIRCRAFT_BASIC_CODES = ['DC9', '737', '767', '747', 'DC10', '757', 'MD80']
555621

556622
DAY_OF_WEEK_INDEX = {idx : [day] for idx, day in enumerate(DAY_OF_WEEK)}
557623

@@ -560,7 +626,10 @@ def convert_to_string_list_value_dict(trigger_dict: Dict[str, int]) -> Dict[str,
560626
FARE_BASIS_CODE, CLASS,
561627
AIRLINE_CODE_LIST, DAY_OF_WEEK,
562628
CITY_CODE_LIST, MEALS,
563-
RESTRICT_CODES]
629+
RESTRICT_CODES,
630+
AIRCRAFT_MANUFACTURERS,
631+
AIRCRAFT_BASIC_CODES]
632+
564633
TRIGGER_DICTS = [CITY_AIRPORT_CODES,
565634
AIRLINE_CODES,
566635
CITY_CODES,

0 commit comments

Comments
 (0)