
Commit 5fc7a00

Fix SpanBasedF1Measure for tags without conll labels (#1491)
Fixes `SpanBasedF1Measure` to work when the tags don't have conll labels (i.e., the labels are simply `{B, I, O}`). The only change needed was the check on `active_conll_tag`: it now tests `is not None` explicitly, because an unlabeled tag yields the empty string `""`, which is falsy, so the old truthiness check silently dropped those spans.
Parent: 01ddd12 · Commit: 5fc7a00
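For context, here is a minimal sketch (not part of the commit) of the truthiness pitfall described above: with unlabeled tags, `string_tag[2:]` is the empty string, so the old `if active_conll_tag:` check never records the span, whereas `is not None` only skips the genuine "no active span" state.

# Hypothetical standalone snippet, not taken from the repository.
active_conll_tag = ""                # an unlabeled span: "B"[2:] == ""

if active_conll_tag:                 # old check: "" is falsy, the span is silently dropped
    print("span recorded (old check)")

if active_conll_tag is not None:     # new check: only a true "no span" state is skipped
    print("span recorded (new check)")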

3 files changed: +38 -8 lines

allennlp/data/dataset_readers/dataset_utils/span_utils.py (+10 -7)
@@ -1,4 +1,4 @@
-from typing import List, Tuple, Callable, TypeVar
+from typing import Callable, List, Set, Tuple, TypeVar
 
 from allennlp.data.dataset_readers.dataset_utils.ontonotes import TypedStringSpan
 from allennlp.data.tokenizers.token import Token
@@ -70,7 +70,8 @@ def bio_tags_to_spans(tag_sequence: List[str],
     Spans are inclusive and can be of zero length, representing a single word span.
     Ill-formed spans are also included (i.e those which do not start with a "B-LABEL"),
     as otherwise it is possible to get a perfect precision score whilst still predicting
-    ill-formed spans in addition to the correct spans.
+    ill-formed spans in addition to the correct spans. This function works properly when
+    the spans are unlabeled (i.e., your labels are simply "B", "I", and "O").
 
     Parameters
     ----------
@@ -87,7 +88,7 @@ def bio_tags_to_spans(tag_sequence: List[str],
     Note that the label `does not` contain any BIO tag prefixes.
     """
     classes_to_ignore = classes_to_ignore or []
-    spans = set()
+    spans: Set[Tuple[str, Tuple[int, int]]] = set()
     span_start = 0
     span_end = 0
     active_conll_tag = None
@@ -99,7 +100,7 @@ def bio_tags_to_spans(tag_sequence: List[str],
         conll_tag = string_tag[2:]
         if bio_tag == "O" or conll_tag in classes_to_ignore:
             # The span has ended.
-            if active_conll_tag:
+            if active_conll_tag is not None:
                 spans.add((active_conll_tag, (span_start, span_end)))
             active_conll_tag = None
             # We don't care about tags we are
@@ -108,7 +109,7 @@ def bio_tags_to_spans(tag_sequence: List[str],
         elif bio_tag == "B":
             # We are entering a new span; reset indices
             # and active tag to new span.
-            if active_conll_tag:
+            if active_conll_tag is not None:
                 spans.add((active_conll_tag, (span_start, span_end)))
             active_conll_tag = conll_tag
             span_start = index
@@ -124,13 +125,13 @@ def bio_tags_to_spans(tag_sequence: List[str],
             # include this span. This is important, because otherwise,
             # a model may get a perfect F1 score whilst still including
             # false positive ill-formed spans.
-            if active_conll_tag:
+            if active_conll_tag is not None:
                 spans.add((active_conll_tag, (span_start, span_end)))
             active_conll_tag = conll_tag
             span_start = index
             span_end = index
     # Last token might have been a part of a valid span.
-    if active_conll_tag:
+    if active_conll_tag is not None:
         spans.add((active_conll_tag, (span_start, span_end)))
     return list(spans)
 
@@ -141,6 +142,8 @@ def bioul_tags_to_spans(tag_sequence: List[str],
     Given a sequence corresponding to BIOUL tags, extracts spans.
     Spans are inclusive and can be of zero length, representing a single word span.
     Ill-formed spans are not allowed and will raise ``InvalidTagSequence``.
+    This function works properly when the spans are unlabeled (i.e., your labels are
+    simply "B", "I", "O", "U", and "L").
 
     Parameters
     ----------
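As a quick sanity check (not part of the commit, and mirroring the tests added below), the fixed helpers can now be called with unlabeled tags; note that `bio_tags_to_spans` builds its result from a set, so the order of the returned list is not guaranteed.

from allennlp.data.dataset_readers.dataset_utils import span_utils

# Unlabeled BIO tags: every span comes back with the empty string as its label.
print(span_utils.bio_tags_to_spans(["O", "B", "I", "O", "B", "I", "B", "B"]))
# [('', (1, 2)), ('', (4, 5)), ('', (6, 6)), ('', (7, 7))] in some order

# Unlabeled BIOUL tags work the same way.
print(span_utils.bioul_tags_to_spans(["B", "I", "L", "U", "U", "O"]))
# [('', (0, 2)), ('', (3, 3)), ('', (4, 4))]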

allennlp/tests/data/dataset_readers/dataset_utils/span_utils_test.py (+24)
@@ -24,6 +24,21 @@ def test_bio_tags_to_spans_extracts_correct_spans(self):
         assert set(spans) == {("ARG1", (1, 2)), ("ARG2", (5, 6)), ("ARG1", (7, 7)),
                               ("ARG1", (4, 4)), ("ARG2", (8, 9))}
 
+    def test_bio_tags_to_spans_extracts_correct_spans_without_labels(self):
+        tag_sequence = ["O", "B", "I", "O", "B", "I", "B", "B"]
+        spans = span_utils.bio_tags_to_spans(tag_sequence)
+        assert set(spans) == {("", (1, 2)), ("", (4, 5)), ("", (6, 6)), ("", (7, 7))}
+
+        # Check that it raises when we use U- tags for single tokens.
+        tag_sequence = ["O", "B", "I", "O", "B", "I", "U", "U"]
+        with self.assertRaises(span_utils.InvalidTagSequence):
+            spans = span_utils.bio_tags_to_spans(tag_sequence)
+
+        # Check that invalid BIO sequences are also handled as spans.
+        tag_sequence = ["O", "B", "I", "O", "I", "B", "I", "B", "I", "I"]
+        spans = span_utils.bio_tags_to_spans(tag_sequence)
+        assert set(spans) == {('', (1, 2)), ('', (4, 4)), ('', (5, 6)), ('', (7, 9))}
+
     def test_bio_tags_to_spans_ignores_specified_tags(self):
         tag_sequence = ["B-V", "I-V", "O", "B-ARG1", "I-ARG1",
                         "O", "B-ARG2", "I-ARG2", "B-ARG1", "B-ARG2"]
@@ -66,6 +81,15 @@ def test_bioul_tags_to_spans(self):
         with self.assertRaises(span_utils.InvalidTagSequence):
             spans = span_utils.bioul_tags_to_spans(tag_sequence)
 
+    def test_bioul_tags_to_spans_without_labels(self):
+        tag_sequence = ['B', 'I', 'L', 'U', 'U', 'O']
+        spans = span_utils.bioul_tags_to_spans(tag_sequence)
+        assert spans == [('', (0, 2)), ('', (3, 3)), ('', (4, 4))]
+
+        tag_sequence = ['B', 'I', 'O']
+        with self.assertRaises(span_utils.InvalidTagSequence):
+            spans = span_utils.bioul_tags_to_spans(tag_sequence)
+
     def test_iob1_to_bioul(self):
         tag_sequence = ['I-ORG', 'O', 'I-MISC', 'O']
         bioul_sequence = span_utils.iob1_to_bioul(tag_sequence)

allennlp/training/metrics/span_based_f1_measure.py (+4 -1)
@@ -24,7 +24,10 @@ class SpanBasedF1Measure(Metric):
     is not exactly the same as the perl script used to evaluate the CONLL 2005
     data - particularly, it does not consider continuations or reference spans
     as constituents of the original span. However, it is a close proxy, which
-    can be helpful for judging model peformance during training.
+    can be helpful for judging model peformance during training. This metric
+    works properly when the spans are unlabeled (i.e., your labels are
+    simply "B", "I", "O" if using the "BIO" label encoding).
+
     """
     def __init__(self,
                  vocabulary: Vocabulary,
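Finally, a hedged sketch of exercising the metric itself with an unlabeled BIO tag vocabulary. The `Vocabulary` and tensor setup below is illustrative only, assuming the AllenNLP 0.x API of this era, and is not taken from the commit.

import torch
from allennlp.data.vocabulary import Vocabulary
from allennlp.training.metrics import SpanBasedF1Measure

# Tag vocabulary containing only the unlabeled BIO tags.
vocab = Vocabulary()
for tag in ["O", "B", "I"]:
    vocab.add_token_to_namespace(tag, namespace="tags")

metric = SpanBasedF1Measure(vocab, tag_namespace="tags")

# One sequence of four tokens whose predictions exactly match the gold tags.
tags = ["O", "B", "I", "O"]
gold = torch.LongTensor([[vocab.get_token_index(t, "tags") for t in tags]])
predictions = torch.zeros(1, 4, vocab.get_vocab_size("tags"))
for i, tag in enumerate(tags):
    predictions[0, i, vocab.get_token_index(tag, "tags")] = 1.0
mask = torch.ones(1, 4)

metric(predictions, gold, mask)
print(metric.get_metric())  # overall precision/recall/F1 should all be 1.0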
