
Commit 115ada2

new: include parsing in reference_to_token_annotations
Previously this command only worked for the splitting scenario, converting whole reference spans (BE, BI, IE, II) to token spans (b-r, i-r, e-r, o). This commit also allows it to be used for parsing, where a reference span (e.g. author) is converted to a series of token spans carrying the same label (author), as sketched below.
1 parent 3a8afa3 commit 115ada2
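
To make the new behaviour concrete, here is a minimal sketch (not the library's code) of what the parsing task does: a single reference-level span keeps its label and is expanded into one span per token it covers. The data shapes mirror the prodigy format used in the tests below.

# Illustration only: expand a reference-level span into per-token spans.
reference_span = {"token_start": 2, "token_end": 4, "label": "author"}

token_spans = [
    {"token_start": i, "token_end": i, "label": reference_span["label"]}
    for i in range(reference_span["token_start"], reference_span["token_end"] + 1)
]

print(token_spans)
# Three single-token spans, all labelled "author".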

File tree: 2 files changed, +101 -140 lines changed


deep_reference_parser/prodigy/reference_to_token_annotations.py

Lines changed: 70 additions & 33 deletions
@@ -10,31 +10,52 @@


 class TokenTagger:
-    """
-    Converts data in prodigy format with full reference spans to per-token spans

-    Expects one of four lables for the spans:
+    def __init__(self, task="splitting"):
+        """
+        Converts data in prodigy format with full reference spans to per-token
+        spans

-    * BE: A complete reference
-    * BI: A frgament of reference that captures the beginning but not the end
-    * IE: A frgament of reference that captures the end but not the beginning
-    * II: A fragment of a reference that captures neither the beginning nor the
-    end .
-    """
+        Args:
+            task (str): One of ["parsing", "splitting"]. See below further
+                explanation.
+
+        Since the parsing, splitting, and classification tasks have quite
+        different labelling requirements, this class behaves differently
+        depending on which task is specified in the task argument.
+
+        For splitting:
+
+        Expects one of four labels for the spans:

-    def __init__(self):
+        * BE: A complete reference
+        * BI: A frgament of reference that captures the beginning but not the end
+        * IE: A frgament of reference that captures the end but not the beginning
+        * II: A fragment of a reference that captures neither the beginning nor the
+        end .
+
+        Depending on which label is applied the tokens within the span will be
+        labelled differently as one of ["b-r", "i-r", "e-r", "o"].
+
+        For parsing:
+
+        Expects any arbitrary label for spans. All tokens within that span will
+        be labelled with the same span.
+
+        """

         self.out = []
+        self.task = task

     def tag_doc(self, doc):
         """
-        Tags a document with the appropriate labels
+        Tags a document with appropriate labels for the parsing task

         Args:
             doc(dict): A single document in prodigy dict format to be labelled.
         """

-        bie_spans = self.reference_spans(doc["spans"], doc["tokens"])
+        bie_spans = self.reference_spans(doc["spans"], doc["tokens"], task=self.task)
         o_spans = self.outside_spans(bie_spans, doc["tokens"])

         # Flatten into one list.

@@ -63,37 +84,47 @@ def run(self, docs):

         return self.out

-    def reference_spans(self, spans, tokens):
+
+    def reference_spans(self, spans, tokens, task):
         """
         Given a whole reference span as labelled in prodigy, break this into
         appropriate single token spans depending on the label that was applied to
         the whole reference span.
         """
         split_spans = []

-        for span in spans:
-            if span["label"] in ["BE", "be"]:
+        if task == "splitting":

-                split_spans.extend(
-                    self.split_long_span(tokens, span, "b-r", "e-r")
-                )
+            for span in spans:
+                if span["label"] in ["BE", "be"]:

-            elif span["label"] in ["BI", "bi"]:
+                    split_spans.extend(
+                        self.split_long_span(tokens, span, "b-r", "e-r", "i-r")
+                    )

-                split_spans.extend(
-                    self.split_long_span(tokens, span, "b-r", "i-r")
-                )
+                elif span["label"] in ["BI", "bi"]:

-            elif span["label"] in ["IE", "ie"]:
+                    split_spans.extend(
+                        self.split_long_span(tokens, span, "b-r", "i-r", "i-r")
+                    )

-                split_spans.extend(
-                    self.split_long_span(tokens, span, "i-r", "e-r")
-                )
+                elif span["label"] in ["IE", "ie"]:
+
+                    split_spans.extend(
+                        self.split_long_span(tokens, span, "i-r", "e-r", "i-r")
+                    )

-            elif span["label"] in ["II", "ii"]:
+                elif span["label"] in ["II", "ii"]:

+                    split_spans.extend(
+                        self.split_long_span(tokens, span, "i-r", "i-r", "i-r")
+                    )
+
+        elif task == "parsing":
+
+            for span in spans:
                 split_spans.extend(
-                    self.split_long_span(tokens, span, "i-r", "i-r")
+                    self.split_long_span(tokens, span, span["label"], span["label"], span["label"])
                 )

         return split_spans

@@ -146,17 +177,17 @@ def create_span(self, tokens, index, label):
         return span


-    def split_long_span(self, tokens, span, start_label, end_label):
+    def split_long_span(self, tokens, span, start_label, end_label, inside_label):
         """
-        Split a milti-token span into `n` spans of lengh `1`, where `n=len(tokens)`
+        Split a multi-token span into `n` spans of lengh `1`, where `n=len(tokens)`
         """

         spans = []
         spans.append(self.create_span(tokens, span["token_start"], start_label))
         spans.append(self.create_span(tokens, span["token_end"], end_label))

         for index in range(span["token_start"] + 1, span["token_end"]):
-            spans.append(self.create_span(tokens, index, "i-r"))
+            spans.append(self.create_span(tokens, index, inside_label))

         spans = sorted(spans, key=lambda k: k['token_start'])


@@ -174,10 +205,16 @@ def split_long_span(self, tokens, span, start_label, end_label):
         "positional",
         None,
         str
+    ),
+    task=(
+        "Which task is being performed. Either splitting or parsing.",
+        "positional",
+        None,
+        str
     )
 )

-def reference_to_token_annotations(input_file, output_file):
+def reference_to_token_annotations(input_file, output_file, task="splitting"):
     """ Converts a file output by prodigy (using prodigy db-out) from
     references level annotations to individual level annotations. The rationale
     for this is that reference level annotations are much easier for humans to

@@ -195,7 +232,7 @@ def reference_to_token_annotations(input_file, output_file):

     logger.info("Loaded %s documents with reference annotations", len(partially_annotated))

-    annotator = TokenTagger(partially_annotated)
+    annotator = TokenTagger(partially_annotated, task=task)

     fully_annotated = annotator.run()
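
Based on the signatures introduced above, and assuming the module is importable under the path shown in the file tree, a minimal usage sketch of the new task argument; the example data is illustrative and follows the token/span shapes used in the tests.

from deep_reference_parser.prodigy.reference_to_token_annotations import TokenTagger

# Illustrative prodigy-style tokens and a single reference-level span.
tokens = [{"start": i, "end": i, "id": i} for i in range(7)]
spans = [{"start": 2, "end": 4, "token_start": 2, "token_end": 4, "label": "author"}]

# With task="parsing", every token inside the span keeps the span's own label.
tagger = TokenTagger(task="parsing")
token_spans = tagger.reference_spans(spans, tokens, task="parsing")
# -> three single-token spans, all labelled "author"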

tests/prodigy/test_reference_to_token_annotations.py

Lines changed: 31 additions & 107 deletions
@@ -42,108 +42,6 @@ def test_TokenTagger(tagger):
     assert out == tagged[0]["spans"]


-#def test_real_case():
-#    """
-#    Test real case observed where no `b-r` or `e-r` is present, the first and
-#    last `i-r` tokens are being replicated as `o` tokens when no bounding
-#    `b-r` or `e-r` tokens are present.
-#    """
-#
-#    doc = {
-#        "text": "d\n 2010, Actual",
-#        "spans":[
-#            {
-#                "start": 3,
-#                "end": 7,
-#                "token_start": 2,
-#                "token_end": 2,
-#                "label": "i-r"
-#            },
-#            {
-#                "start": 9,
-#                "end": 15,
-#                "token_start": 4,
-#                "token_end": 4,
-#                "label": "i-r"
-#            }
-#        ],
-#        "tokens":[
-#            {
-#                "text": "d",
-#                "start": 0,
-#                "end": 1,
-#                "id": 0
-#            },
-#            {
-#                "text": "\n ",
-#                "start": 1,
-#                "end": 3,
-#                "id": 1
-#            },
-#            {
-#                "text": "2010",
-#                "start": 3,
-#                "end": 7,
-#                "id": 2
-#            },
-#            {
-#                "text": ",",
-#                "start": 7,
-#                "end": 8,
-#                "id": 3
-#            },
-#            {
-#                "text": "Actual",
-#                "start": 9,
-#                "end": 15,
-#                "id": 4
-#            }
-#        ]}
-#
-#    after_spans = [
-#        {
-#            "start": 0,
-#            "end": 1,
-#            "token_start": 0,
-#            "token_end": 0,
-#            "label": "o"
-#        },
-#        {
-#            "start": 1,
-#            "end": 3,
-#            "token_start": 1,
-#            "token_end": 1,
-#            "label": "o"
-#        },
-#        {
-#            "start": 3,
-#            "end": 7,
-#            "token_start": 2,
-#            "token_end": 2,
-#            "label": "i-r"
-#        },
-#        {
-#            "start": 7,
-#            "end": 8,
-#            "token_start": 3,
-#            "token_end": 3,
-#            "label": "i-r"
-#        },
-#        {
-#            "start": 9,
-#            "end": 15,
-#            "token_start": 4,
-#            "token_end": 4,
-#            "label": "i-r"
-#        }
-#    ]
-#
-#
-#    tagger = TokenTagger([doc])
-#    tagged = tagger.run()
-#
-#    assert after_spans == tagged[0]["spans"]
-
 def test_create_span(tagger):

     tokens = [

@@ -179,7 +77,7 @@ def test_split_long_span(tagger):
         {'start': 4, 'end': 4, 'token_start': 4, 'token_end': 4, 'label': 'e-r'},
     ]

-    out = tagger.split_long_span(tokens, span, start_label="b-r", end_label="e-r")
+    out = tagger.split_long_span(tokens, span, start_label="b-r", end_label="e-r", inside_label="i-r")

     assert out == after


@@ -206,7 +104,7 @@ def test_reference_spans_be(tagger):
         {'start': 4, 'end': 4, 'token_start': 4, 'token_end': 4, 'label': 'e-r'},
     ]

-    out = tagger.reference_spans(spans, tokens)
+    out = tagger.reference_spans(spans, tokens, task="splitting")

     assert out == after


@@ -232,7 +130,7 @@ def test_reference_spans_bi(tagger):
         {'start': 4, 'end': 4, 'token_start': 4, 'token_end': 4, 'label': 'i-r'},
     ]

-    out = tagger.reference_spans(spans, tokens)
+    out = tagger.reference_spans(spans, tokens, task="splitting")

     assert out == after


@@ -258,7 +156,7 @@ def test_reference_spans_ie(tagger):
         {'start': 4, 'end': 4, 'token_start': 4, 'token_end': 4, 'label': 'e-r'},
     ]

-    out = tagger.reference_spans(spans, tokens)
+    out = tagger.reference_spans(spans, tokens, task="splitting")

     assert out == after


@@ -284,7 +182,33 @@ def test_reference_spans_ii(tagger):
         {'start': 4, 'end': 4, 'token_start': 4, 'token_end': 4, 'label': 'i-r'},
     ]

-    out = tagger.reference_spans(spans, tokens)
+    out = tagger.reference_spans(spans, tokens, task="splitting")
+
+    assert out == after
+
+def test_reference_spans_author(tagger):
+
+    tokens = [
+        {'start': 0, 'end': 0, 'id': 0},
+        {'start': 1, 'end': 1, 'id': 1},
+        {'start': 2, 'end': 2, 'id': 2},
+        {'start': 3, 'end': 3, 'id': 3},
+        {'start': 4, 'end': 4, 'id': 4},
+        {'start': 5, 'end': 5, 'id': 5},
+        {'start': 6, 'end': 6, 'id': 6},
+    ]
+
+    spans = [
+        {'start': 2, 'end': 4, 'token_start': 2, 'token_end': 4, 'label': 'author'}
+    ]
+
+    after = [
+        {'start': 2, 'end': 2, 'token_start': 2, 'token_end': 2, 'label': 'author'},
+        {'start': 3, 'end': 3, 'token_start': 3, 'token_end': 3, 'label': 'author'},
+        {'start': 4, 'end': 4, 'token_start': 4, 'token_end': 4, 'label': 'author'},
+    ]
+
+    out = tagger.reference_spans(spans, tokens, task="parsing")

     assert out == after

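For contrast with the new parsing test above, a short sketch of the pre-existing splitting behaviour through the same helper; the data follows the shapes in test_reference_spans_be, and the expected labels follow the docstring added in the first file.

from deep_reference_parser.prodigy.reference_to_token_annotations import TokenTagger

# Illustrative prodigy-style tokens and one complete reference span (BE).
tokens = [{"start": i, "end": i, "id": i} for i in range(5)]
spans = [{"start": 0, "end": 4, "token_start": 0, "token_end": 4, "label": "BE"}]

# With task="splitting", a complete reference is broken into
# b-r / i-r / i-r / i-r / e-r single-token spans.
tagger = TokenTagger(task="splitting")
token_spans = tagger.reference_spans(spans, tokens, task="splitting")
# -> labels: b-r, i-r, i-r, i-r, e-r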