Skip to content
This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Commit a7da2ab

Browse files
authored
Fast grammar generation (#1852)
* 1 grammar object * test, pylint * fix typo * pylint * remove pos from spaCy model * private method, decrease max_decoding_steps * test helper methods
1 parent d8b13e0 commit a7da2ab

File tree

5 files changed

+150
-79
lines changed

5 files changed

+150
-79
lines changed

allennlp/data/dataset_readers/semantic_parsing/atis.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def __init__(self,
7171
database_file: str = None) -> None:
7272
super().__init__(lazy)
7373
self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
74-
self._tokenizer = tokenizer or WordTokenizer(SpacyWordSplitter(pos_tags=True))
74+
self._tokenizer = tokenizer or WordTokenizer(SpacyWordSplitter())
7575
self._database_file = database_file
7676
# TODO(kevin): Add a keep_unparseable_utterances flag so that during validation, we do not skip queries that
7777
# cannot be parsed.

allennlp/semparse/worlds/atis_world.py

+120-74
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
from typing import List, Dict, Tuple, Set, Callable
2-
from copy import deepcopy
2+
from copy import copy
33
import numpy
44
from nltk import ngrams
55

66
from parsimonious.grammar import Grammar
7+
from parsimonious.expressions import Expression, OneOf, Sequence, Literal
78

89
from allennlp.semparse.contexts.atis_tables import * # pylint: disable=wildcard-import,unused-wildcard-import
910
from allennlp.semparse.contexts.atis_sql_table_context import AtisSqlTableContext, KEYWORDS
10-
from allennlp.semparse.contexts.sql_context_utils import SqlVisitor, format_action
11+
from allennlp.semparse.contexts.sql_context_utils import SqlVisitor, format_action, initialize_valid_actions
1112

1213
from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
1314

@@ -67,14 +68,125 @@ def __init__(self,
6768
self.linked_entities = self._get_linked_entities()
6869
self.dates = self._get_dates()
6970

70-
self.valid_actions: Dict[str, List[str]] = self._update_valid_actions()
7171
entities, linking_scores = self._flatten_entities()
7272
# This has shape (num_entities, num_utterance_tokens).
7373
self.linking_scores: numpy.ndarray = linking_scores
7474
self.entities: List[str] = entities
75-
self.grammar_dictionary = self.update_grammar_dictionary()
76-
self.grammar_string: str = self.get_grammar_string()
77-
self.grammar_with_context: Grammar = Grammar(self.grammar_string)
75+
self.grammar: Grammar = self._update_grammar()
76+
self.valid_actions = initialize_valid_actions(self.grammar,
77+
KEYWORDS)
78+
79+
def _update_grammar(self):
80+
"""
81+
We create a new ``Grammar`` object from the one in ``AtisSqlTableContext``, that also
82+
has the new entities that are extracted from the utterance. Stitching together the expressions
83+
to form the grammar is a little tedious here, but it is worth it because we don't have to create
84+
a new grammar from scratch. Creating a new grammar is expensive because we have many production
85+
rules that have all database values in the column on the right hand side. We update the expressions
86+
bottom up, since the higher level expressions may refer to the lower level ones. For example, the
87+
ternary expression will refer to the start and end times.
88+
"""
89+
90+
# This will give us a shallow copy, but that's OK because everything
91+
# inside is immutable so we get a new copy of it.
92+
new_grammar = copy(AtisWorld.sql_table_context.grammar)
93+
94+
numbers = self._get_numeric_database_values('number')
95+
number_literals = [Literal(number) for number in numbers]
96+
new_grammar['number'] = OneOf(*number_literals, name='number')
97+
self._update_expression_reference(new_grammar, 'pos_value', 'number')
98+
99+
time_range_start = self._get_numeric_database_values('time_range_start')
100+
time_range_start_literals = [Literal(time) for time in time_range_start]
101+
new_grammar['time_range_start'] = OneOf(*time_range_start_literals, name='time_range_start')
102+
103+
time_range_end = self._get_numeric_database_values('time_range_end')
104+
time_range_end_literals = [Literal(time) for time in time_range_end]
105+
new_grammar['time_range_end'] = OneOf(*time_range_end_literals, name='time_range_end')
106+
107+
ternary_expressions = [self._get_sequence_with_spacing(new_grammar,
108+
[new_grammar['col_ref'],
109+
Literal('BETWEEN'),
110+
new_grammar['time_range_start'],
111+
Literal(f'AND'),
112+
new_grammar['time_range_end']]),
113+
self._get_sequence_with_spacing(new_grammar,
114+
[new_grammar['col_ref'],
115+
Literal('NOT'),
116+
Literal('BETWEEN'),
117+
new_grammar['time_range_start'],
118+
Literal(f'AND'),
119+
new_grammar['time_range_end']]),
120+
self._get_sequence_with_spacing(new_grammar,
121+
[new_grammar['col_ref'],
122+
Literal('not'),
123+
Literal('BETWEEN'),
124+
new_grammar['time_range_start'],
125+
Literal(f'AND'),
126+
new_grammar['time_range_end']])]
127+
128+
new_grammar['ternaryexpr'] = OneOf(*ternary_expressions, name='ternaryexpr')
129+
self._update_expression_reference(new_grammar, 'condition', 'ternaryexpr')
130+
131+
if self.dates:
132+
new_binary_expressions = []
133+
year_binary_expression = self._get_sequence_with_spacing(new_grammar,
134+
[Literal('date_day'),
135+
Literal('.'),
136+
Literal('year'),
137+
new_grammar['binaryop'],
138+
Literal(f'{self.dates[0].year}')])
139+
new_binary_expressions.append(year_binary_expression)
140+
for date in self.dates:
141+
month_binary_expression = self._get_sequence_with_spacing(new_grammar,
142+
[Literal('date_day'),
143+
Literal('.'),
144+
Literal('month_number'),
145+
new_grammar['binaryop'],
146+
Literal(f'{date.month}')])
147+
148+
day_binary_expression = self._get_sequence_with_spacing(new_grammar,
149+
[Literal('date_day'),
150+
Literal('.'),
151+
Literal('day_number'),
152+
new_grammar['binaryop'],
153+
Literal(f'{date.day}')])
154+
new_binary_expressions.extend([month_binary_expression,
155+
day_binary_expression])
156+
157+
new_grammar['biexpr'].members = new_grammar['biexpr'].members + tuple(new_binary_expressions)
158+
return new_grammar
159+
160+
def _get_numeric_database_values(self,
161+
nonterminal: str) -> List[str]:
162+
return sorted([value[1] for key, value in self.linked_entities['number'].items()
163+
if value[0] == nonterminal], reverse=True)
164+
165+
def _update_expression_reference(self, # pylint: disable=no-self-use
166+
grammar: Grammar,
167+
parent_expression_nonterminal: str,
168+
child_expression_nonterminal: str) -> None:
169+
"""
170+
When we add a new expression, there may be other expressions that refer to
171+
it, and we need to update those to point to the new expression.
172+
"""
173+
grammar[parent_expression_nonterminal].members = \
174+
[member if member.name != child_expression_nonterminal
175+
else grammar[child_expression_nonterminal]
176+
for member in grammar[parent_expression_nonterminal].members]
177+
178+
def _get_sequence_with_spacing(self, # pylint: disable=no-self-use
179+
new_grammar,
180+
expressions: List[Expression],
181+
name: str = '') -> Sequence:
182+
"""
183+
This is a helper method for generating sequences, since we often want a list of expressions
184+
with whitespaces between them.
185+
"""
186+
expressions = [subexpression
187+
for expression in expressions
188+
for subexpression in (expression, new_grammar['ws'])]
189+
return Sequence(*expressions, name=name)
78190

79191
def get_valid_actions(self) -> Dict[str, List[str]]:
80192
return self.valid_actions
@@ -165,79 +277,14 @@ def _get_linked_entities(self) -> Dict[str, Dict[str, Tuple[str, str, List[int]]
165277
entity_linking_scores['string'] = string_linking_scores
166278
return entity_linking_scores
167279

168-
def _update_valid_actions(self) -> Dict[str, List[str]]:
169-
valid_actions = deepcopy(self.sql_table_context.get_valid_actions())
170-
valid_actions['time_range_start'] = []
171-
valid_actions['time_range_end'] = []
172-
for action, value in self.linked_entities['number'].items():
173-
valid_actions[value[0]].append(action)
174-
175-
for date in self.dates:
176-
for biexpr_rule in [f'biexpr -> ["date_day", ".", "year", binaryop, "{date.year}"]',
177-
f'biexpr -> ["date_day", ".", "month_number", binaryop, "{date.month}"]',
178-
f'biexpr -> ["date_day", ".", "day_number", binaryop, "{date.day}"]']:
179-
if biexpr_rule not in valid_actions:
180-
valid_actions['biexpr'].append(biexpr_rule)
181-
182-
valid_actions['ternaryexpr'] = \
183-
['ternaryexpr -> [col_ref, "BETWEEN", time_range_start, "AND", time_range_end]',
184-
'ternaryexpr -> [col_ref, "NOT", "BETWEEN", time_range_start, "AND", time_range_end]']
185-
186-
return valid_actions
187-
188280
def _get_dates(self):
189281
dates = []
190282
for tokenized_utterance in self.tokenized_utterances:
191283
dates.extend(get_date_from_utterance(tokenized_utterance))
192284
return dates
193285

194-
def update_grammar_dictionary(self) -> Dict[str, List[str]]:
195-
"""
196-
We modify the ``grammar_dictionary`` with additional constraints
197-
we want for the ATIS dataset. We then add numbers to the grammar dictionary. The strings in the
198-
database are already added in by the ``SqlTableContext``.
199-
"""
200-
self.grammar_dictionary = deepcopy(self.sql_table_context.get_grammar_dictionary())
201-
if self.dates:
202-
year_binary_expression = f'("date_day" ws "." ws "year" ws binaryop ws "{self.dates[0].year}")'
203-
self.grammar_dictionary['biexpr'].append(year_binary_expression)
204-
205-
for date in self.dates:
206-
month_day_binary_expressions = \
207-
[f'("date_day" ws "." ws "month_number" ws binaryop ws "{date.month}")',
208-
f'("date_day" ws "." ws "day_number" ws binaryop ws "{date.day}")']
209-
self.grammar_dictionary['biexpr'].extend(month_day_binary_expressions)
210-
211-
212-
self.grammar_dictionary['ternaryexpr'] = \
213-
['(col_ref ws "not" ws "BETWEEN" ws time_range_start ws "AND" ws time_range_end ws)',
214-
'(col_ref ws "NOT" ws "BETWEEN" ws time_range_start ws "AND" ws time_range_end ws)',
215-
'(col_ref ws "BETWEEN" ws time_range_start ws "AND" ws time_range_end ws)']
216-
217-
# We need to add the numbers, starting, ending time ranges to the grammar.
218-
numbers = sorted([value[1] for key, value in self.linked_entities['number'].items()
219-
if value[0] == 'number'], reverse=True)
220-
self.grammar_dictionary['number'] = [f'"{number}"' for number in numbers]
221-
222-
time_range_start = sorted([value[1] for key, value in self.linked_entities['number'].items()
223-
if value[0] == 'time_range_start'], reverse=True)
224-
self.grammar_dictionary['time_range_start'] = [f'"{time}"' for time in time_range_start]
225-
226-
time_range_end = sorted([value[1] for key, value in self.linked_entities['number'].items()
227-
if value[0] == 'time_range_end'], reverse=True)
228-
self.grammar_dictionary['time_range_end'] = [f'"{time}"' for time in time_range_end]
229-
return self.grammar_dictionary
230-
231-
def get_grammar_string(self) -> str:
232-
"""
233-
Generate a string that can be used to instantiate a ``Grammar`` object. The string is a sequence
234-
of rules that define the grammar.
235-
"""
236-
return '\n'.join([f"{nonterminal} = {' / '.join(right_hand_side)}"
237-
for nonterminal, right_hand_side in self.grammar_dictionary.items()])
238-
239286
def get_action_sequence(self, query: str) -> List[str]:
240-
sql_visitor = SqlVisitor(self.grammar_with_context, keywords_to_uppercase=KEYWORDS)
287+
sql_visitor = SqlVisitor(self.grammar, keywords_to_uppercase=KEYWORDS)
241288
if query:
242289
action_sequence = sql_visitor.parse(query)
243290
return action_sequence
@@ -277,6 +324,5 @@ def __eq__(self, other):
277324
if isinstance(self, other.__class__):
278325
return all([self.valid_actions == other.valid_actions,
279326
numpy.array_equal(self.linking_scores, other.linking_scores),
280-
self.utterances == other.utterances,
281-
self.grammar_string == other.grammar_string])
327+
self.utterances == other.utterances])
282328
return False

allennlp/tests/data/dataset_readers/semantic_parsing/atis_test.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@ def test_atis_read_from_file(self):
3232
assert isinstance(instance.fields['world'].as_tensor({}), AtisWorld)
3333

3434
world = instance.fields['world'].metadata
35-
assert world.valid_actions['number'] == \
36-
['number -> ["1"]',
37-
'number -> ["0"]']
35+
assert set(world.valid_actions['number']) == \
36+
{'number -> ["1"]',
37+
'number -> ["0"]'}
3838

3939
assert world.linked_entities['string']['airport_airport_code_string -> ["\'DTW\'"]'][2] == \
4040
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0] # ``detroit`` -> ``DTW``

allennlp/tests/fixtures/semantic_parsing/atis/experiment.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
"decoder_beam_search": {
2626
"beam_size": 5
2727
},
28-
"max_decoding_steps": 100,
28+
"max_decoding_steps": 10,
2929
"input_attention": {"type": "dot_product"},
3030
"dropout": 0.5,
3131
"database_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/atis/atis.db"

allennlp/tests/semparse/worlds/atis_world_test.py

+25
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
# pylint: disable=too-many-lines
2+
from datetime import datetime
23
import json
34

5+
from parsimonious.expressions import Literal, Sequence
6+
47
from allennlp.common.file_utils import cached_path
58
from allennlp.semparse.contexts.atis_tables import * # pylint: disable=wildcard-import,unused-wildcard-import
69
from allennlp.common.testing import AllenNlpTestCase
@@ -734,3 +737,25 @@ def test_time_extraction(self): # pylint: disable=no-self-use
734737
pm_times = [pm_map_match_to_query_value(string)
735738
for string in ['12pm', '1pm', '830pm', '1230pm', '115pm']]
736739
assert pm_times == [[1200], [1300], [2030], [1230], [1315]]
740+
741+
def test_atis_helper_methods(self): # pylint: disable=no-self-use
742+
world = AtisWorld([("what is the earliest flight in morning "
743+
"1993 june fourth from boston to pittsburgh")])
744+
assert world.dates == [datetime(1993, 6, 4, 0, 0)]
745+
assert world._get_numeric_database_values('time_range_end') == ['1200'] # pylint: disable=protected-access
746+
assert world._get_sequence_with_spacing(world.grammar, # pylint: disable=protected-access
747+
[world.grammar['col_ref'],
748+
Literal('BETWEEN'),
749+
world.grammar['time_range_start'],
750+
Literal(f'AND'),
751+
world.grammar['time_range_end']]) == \
752+
Sequence(world.grammar['col_ref'],
753+
world.grammar['ws'],
754+
Literal('BETWEEN'),
755+
world.grammar['ws'],
756+
world.grammar['time_range_start'],
757+
world.grammar['ws'],
758+
Literal(f'AND'),
759+
world.grammar['ws'],
760+
world.grammar['time_range_end'],
761+
world.grammar['ws'])

0 commit comments

Comments (0)