
Commit 021f8bb

WrRan authored and matt-gardner committed
use wordsplit with taggers (#1981)
* make pos_tag/ner_tag consistent in checking empty/missing values
* add test case: blank ner-/pos-tag
* rename namespace of ner-/pos-tag-indexer
* modify previous test cases related to `namespace`
* minor fix
* Update pos_tag_indexer_test.py: add a trailing newline
1 parent c262ef5 commit 021f8bb

File tree: 4 files changed (+45, -11 lines)

4 files changed

+45
-11
lines changed

allennlp/data/token_indexers/ner_tag_indexer.py (+3, -3)

@@ -19,11 +19,11 @@ class NerTagIndexer(TokenIndexer[int]):
 
     Parameters
     ----------
-    namespace : ``str``, optional (default=``ner_tags``)
+    namespace : ``str``, optional (default=``ner_tokens``)
         We will use this namespace in the :class:`Vocabulary` to map strings to indices.
     """
     # pylint: disable=no-self-use
-    def __init__(self, namespace: str = 'ner_tags') -> None:
+    def __init__(self, namespace: str = 'ner_tokens') -> None:
         self._namespace = namespace
 
     @overrides
@@ -38,7 +38,7 @@ def tokens_to_indices(self,
                           tokens: List[Token],
                           vocabulary: Vocabulary,
                           index_name: str) -> Dict[str, List[int]]:
-        tags = ['NONE' if token.ent_type_ is None else token.ent_type_ for token in tokens]
+        tags = ['NONE' if not token.ent_type_ else token.ent_type_ for token in tokens]
 
         return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}
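The removed `if token.ent_type_ is None` check never fired in practice: spaCy marks "no entity" with an empty string, not None. A minimal sketch in plain Python of the difference (the value list is hypothetical, standing in for `token.ent_type_` across a few tokens):

# Hypothetical ent_type_ values: a real entity, spaCy's blank, and None.
ent_types = ['PERSON', '', None]

old = ['NONE' if t is None else t for t in ent_types]   # the removed check
new = ['NONE' if not t else t for t in ent_types]       # the new truthiness check

assert old == ['PERSON', '', 'NONE']       # '' slips through `is None`
assert new == ['PERSON', 'NONE', 'NONE']   # truthiness catches both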

allennlp/data/token_indexers/pos_tag_indexer.py (+3, -3)

@@ -20,13 +20,13 @@ class PosTagIndexer(TokenIndexer[int]):
 
     Parameters
     ----------
-    namespace : ``str``, optional (default=``pos_tags``)
+    namespace : ``str``, optional (default=``pos_tokens``)
         We will use this namespace in the :class:`Vocabulary` to map strings to indices.
     coarse_tags : ``bool``, optional (default=``False``)
         If ``True``, we will use coarse POS tags instead of the default fine-grained POS tags.
     """
     # pylint: disable=no-self-use
-    def __init__(self, namespace: str = 'pos_tags', coarse_tags: bool = False) -> None:
+    def __init__(self, namespace: str = 'pos_tokens', coarse_tags: bool = False) -> None:
         self._namespace = namespace
         self._coarse_tags = coarse_tags
         self._logged_errors: Set[str] = set()
@@ -56,7 +56,7 @@ def tokens_to_indices(self,
                 tag = token.pos_
             else:
                 tag = token.tag_
-            if tag is None:
+            if not tag:
                 tag = 'NONE'
 
             tags.append(tag)
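The namespace rename means callers that depend on the old defaults must now pass them explicitly, as the updated tests below do. A short sketch, assuming the allennlp 0.x import path these indexers ship under:

# Pinning the pre-rename namespaces explicitly after this commit.
from allennlp.data.token_indexers import NerTagIndexer, PosTagIndexer

ner_indexer = NerTagIndexer(namespace='ner_tags')
pos_indexer = PosTagIndexer(namespace='pos_tags', coarse_tags=True)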

allennlp/tests/data/token_indexers/ner_tag_indexer_test.py (+19, -2)

@@ -19,7 +19,7 @@ def test_count_vocab_items_uses_ner_tags(self):
         counter = defaultdict(lambda: defaultdict(int))
         for token in tokens:
             indexer.count_vocab_items(token, counter)
-        assert counter["ner_tags"] == {'PERSON': 2, 'ORG': 1, 'NONE': 6}
+        assert counter["ner_tokens"] == {'PERSON': 2, 'ORG': 1, 'NONE': 6}
 
     def test_tokens_to_indices_uses_ner_tags(self):
         tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
@@ -28,7 +28,7 @@ def test_tokens_to_indices_uses_ner_tags(self):
         person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
         none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
         vocab.add_token_to_namespace('ORG', namespace='ner_tags')
-        indexer = NerTagIndexer()
+        indexer = NerTagIndexer(namespace='ner_tags')
         assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [person_index]}
         assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}
 
@@ -41,3 +41,20 @@ def test_as_array_produces_token_sequence(self):
         indexer = NerTagIndexer()
         padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
         assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
+
+    def test_blank_ner_tag(self):
+        tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
+        for token in tokens:
+            token.ent_type_ = ""
+        indexer = NerTagIndexer()
+        counter = defaultdict(lambda: defaultdict(int))
+        for token in tokens:
+            indexer.count_vocab_items(token, counter)
+        # spacy uses an empty string to indicate "no NER tag"
+        # we convert it to "NONE"
+        assert counter["ner_tokens"]["NONE"] == 4
+        vocab = Vocabulary(counter)
+        none_index = vocab.get_token_index('NONE', 'ner_tokens')
+        # should raise no exception
+        indices = indexer.tokens_to_indices(tokens, vocab, index_name="ner")
+        assert {"ner": [none_index, none_index, none_index, none_index]} == indices

allennlp/tests/data/token_indexers/pos_tag_indexer_test.py (+20, -3)

@@ -19,13 +19,13 @@ def test_count_vocab_items_uses_pos_tags(self):
         counter = defaultdict(lambda: defaultdict(int))
         for token in tokens:
             indexer.count_vocab_items(token, counter)
-        assert counter["pos_tags"] == {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}
+        assert counter["pos_tokens"] == {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}
 
         indexer._coarse_tags = True  # pylint: disable=protected-access
         counter = defaultdict(lambda: defaultdict(int))
         for token in tokens:
             indexer.count_vocab_items(token, counter)
-        assert counter["pos_tags"] == {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}
+        assert counter["pos_tokens"] == {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}
 
     def test_tokens_to_indices_uses_pos_tags(self):
         tokens = self.tokenizer.split_words("This is a sentence.")
@@ -39,7 +39,7 @@ def test_tokens_to_indices_uses_pos_tags(self):
         vocab.add_token_to_namespace('NOUN', namespace='pos_tags')
         vocab.add_token_to_namespace('PUNCT', namespace='pos_tags')
 
-        indexer = PosTagIndexer(coarse_tags=True)
+        indexer = PosTagIndexer(namespace='pos_tags', coarse_tags=True)
 
         indices = indexer.tokens_to_indices(tokens, vocab, "tokens")
         assert len(indices) == 1
@@ -59,3 +59,20 @@ def test_as_array_produces_token_sequence(self):
         indexer = PosTagIndexer()
         padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
         assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
+
+    def test_blank_pos_tag(self):
+        tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
+        for token in tokens:
+            token.pos_ = ""
+        indexer = PosTagIndexer()
+        counter = defaultdict(lambda: defaultdict(int))
+        for token in tokens:
+            indexer.count_vocab_items(token, counter)
+        # spacy uses an empty string to indicate "no POS tag"
+        # we convert it to "NONE"
+        assert counter["pos_tokens"]["NONE"] == 4
+        vocab = Vocabulary(counter)
+        none_index = vocab.get_token_index('NONE', 'pos_tokens')
+        # should raise no exception
+        indices = indexer.tokens_to_indices(tokens, vocab, index_name="pos")
+        assert {"pos": [none_index, none_index, none_index, none_index]} == indices
