
Commit 021f8bb

WrRan authored and matt-gardner committed
use wordsplit with taggers (#1981)
* make pos_tag/ner_tag consistent in checking empty/missing values
* add test case: blank ner-/pos-tag
* rename namespace of ner-/pos-tag-indexer
* modify previous test cases related to `namespace`
* minor fix
* Update pos_tag_indexer_test.py: add a trailing newline
1 parent c262ef5 commit 021f8bb

File tree: 4 files changed (+45, -11 lines)

4 files changed

+45
-11
lines changed

allennlp/data/token_indexers/ner_tag_indexer.py (+3, -3)

@@ -19,11 +19,11 @@ class NerTagIndexer(TokenIndexer[int]):
 
     Parameters
     ----------
-    namespace : ``str``, optional (default=``ner_tags``)
+    namespace : ``str``, optional (default=``ner_tokens``)
         We will use this namespace in the :class:`Vocabulary` to map strings to indices.
     """
     # pylint: disable=no-self-use
-    def __init__(self, namespace: str = 'ner_tags') -> None:
+    def __init__(self, namespace: str = 'ner_tokens') -> None:
         self._namespace = namespace
 
     @overrides
@@ -38,7 +38,7 @@ def tokens_to_indices(self,
                           tokens: List[Token],
                           vocabulary: Vocabulary,
                           index_name: str) -> Dict[str, List[int]]:
-        tags = ['NONE' if token.ent_type_ is None else token.ent_type_ for token in tokens]
+        tags = ['NONE' if not token.ent_type_ else token.ent_type_ for token in tokens]
 
         return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}
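The removed `if token.ent_type_ is None` check never fired in practice: spaCy marks "no entity" with an empty string, not None. A minimal sketch in plain Python of the difference (the value list is hypothetical, standing in for `token.ent_type_` across a few tokens):

# Hypothetical ent_type_ values: a real entity, spaCy's blank, and None.
ent_types = ['PERSON', '', None]

old = ['NONE' if t is None else t for t in ent_types]   # the removed check
new = ['NONE' if not t else t for t in ent_types]       # the new truthiness check

assert old == ['PERSON', '', 'NONE']       # '' slips through `is None`
assert new == ['PERSON', 'NONE', 'NONE']   # truthiness catches both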

allennlp/data/token_indexers/pos_tag_indexer.py (+3, -3)

@@ -20,13 +20,13 @@ class PosTagIndexer(TokenIndexer[int]):
 
     Parameters
     ----------
-    namespace : ``str``, optional (default=``pos_tags``)
+    namespace : ``str``, optional (default=``pos_tokens``)
         We will use this namespace in the :class:`Vocabulary` to map strings to indices.
     coarse_tags : ``bool``, optional (default=``False``)
         If ``True``, we will use coarse POS tags instead of the default fine-grained POS tags.
     """
     # pylint: disable=no-self-use
-    def __init__(self, namespace: str = 'pos_tags', coarse_tags: bool = False) -> None:
+    def __init__(self, namespace: str = 'pos_tokens', coarse_tags: bool = False) -> None:
         self._namespace = namespace
         self._coarse_tags = coarse_tags
         self._logged_errors: Set[str] = set()
@@ -56,7 +56,7 @@ def tokens_to_indices(self,
                 tag = token.pos_
             else:
                 tag = token.tag_
-            if tag is None:
+            if not tag:
                 tag = 'NONE'
 
             tags.append(tag)
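The namespace rename means callers that depend on the old defaults must now pass them explicitly, as the updated tests below do. A short sketch, assuming the allennlp 0.x import path these indexers ship under:

# Pinning the pre-rename namespaces explicitly after this commit.
from allennlp.data.token_indexers import NerTagIndexer, PosTagIndexer

ner_indexer = NerTagIndexer(namespace='ner_tags')
pos_indexer = PosTagIndexer(namespace='pos_tags', coarse_tags=True)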

allennlp/tests/data/token_indexers/ner_tag_indexer_test.py (+19, -2)

@@ -19,7 +19,7 @@ def test_count_vocab_items_uses_ner_tags(self):
         counter = defaultdict(lambda: defaultdict(int))
         for token in tokens:
             indexer.count_vocab_items(token, counter)
-        assert counter["ner_tags"] == {'PERSON': 2, 'ORG': 1, 'NONE': 6}
+        assert counter["ner_tokens"] == {'PERSON': 2, 'ORG': 1, 'NONE': 6}
 
     def test_tokens_to_indices_uses_ner_tags(self):
         tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
@@ -28,7 +28,7 @@ def test_tokens_to_indices_uses_ner_tags(self):
         person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
         none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
         vocab.add_token_to_namespace('ORG', namespace='ner_tags')
-        indexer = NerTagIndexer()
+        indexer = NerTagIndexer(namespace='ner_tags')
         assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [person_index]}
         assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}
 
@@ -41,3 +41,20 @@ def test_as_array_produces_token_sequence(self):
         indexer = NerTagIndexer()
         padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
         assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
+
+    def test_blank_ner_tag(self):
+        tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
+        for token in tokens:
+            token.ent_type_ = ""
+        indexer = NerTagIndexer()
+        counter = defaultdict(lambda: defaultdict(int))
+        for token in tokens:
+            indexer.count_vocab_items(token, counter)
+        # spacy uses an empty string to indicate "no NER tag"
+        # we convert it to "NONE"
+        assert counter["ner_tokens"]["NONE"] == 4
+        vocab = Vocabulary(counter)
+        none_index = vocab.get_token_index('NONE', 'ner_tokens')
+        # should raise no exception
+        indices = indexer.tokens_to_indices(tokens, vocab, index_name="ner")
+        assert {"ner": [none_index, none_index, none_index, none_index]} == indices

allennlp/tests/data/token_indexers/pos_tag_indexer_test.py (+20, -3)

@@ -19,13 +19,13 @@ def test_count_vocab_items_uses_pos_tags(self):
         counter = defaultdict(lambda: defaultdict(int))
         for token in tokens:
             indexer.count_vocab_items(token, counter)
-        assert counter["pos_tags"] == {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}
+        assert counter["pos_tokens"] == {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}
 
         indexer._coarse_tags = True  # pylint: disable=protected-access
         counter = defaultdict(lambda: defaultdict(int))
         for token in tokens:
             indexer.count_vocab_items(token, counter)
-        assert counter["pos_tags"] == {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}
+        assert counter["pos_tokens"] == {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}
 
     def test_tokens_to_indices_uses_pos_tags(self):
         tokens = self.tokenizer.split_words("This is a sentence.")
@@ -39,7 +39,7 @@ def test_tokens_to_indices_uses_pos_tags(self):
         vocab.add_token_to_namespace('NOUN', namespace='pos_tags')
         vocab.add_token_to_namespace('PUNCT', namespace='pos_tags')
 
-        indexer = PosTagIndexer(coarse_tags=True)
+        indexer = PosTagIndexer(namespace='pos_tags', coarse_tags=True)
 
         indices = indexer.tokens_to_indices(tokens, vocab, "tokens")
         assert len(indices) == 1
@@ -59,3 +59,20 @@ def test_as_array_produces_token_sequence(self):
         indexer = PosTagIndexer()
         padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
         assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
+
+    def test_blank_pos_tag(self):
+        tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
+        for token in tokens:
+            token.pos_ = ""
+        indexer = PosTagIndexer()
+        counter = defaultdict(lambda: defaultdict(int))
+        for token in tokens:
+            indexer.count_vocab_items(token, counter)
+        # spacy uses an empty string to indicate "no POS tag"
+        # we convert it to "NONE"
+        assert counter["pos_tokens"]["NONE"] == 4
+        vocab = Vocabulary(counter)
+        none_index = vocab.get_token_index('NONE', 'pos_tokens')
+        # should raise no exception
+        indices = indexer.tokens_to_indices(tokens, vocab, index_name="pos")
+        assert {"pos": [none_index, none_index, none_index, none_index]} == indices
