
Commit 115ada2

new: include parsing in reference_to_token_annotations
Previously this command only worked for the splitting scenario, converting whole reference spans (BE, BI, IE, II) to token spans (b-r, i-r, e-r, o). This commit also allows it to be used for parsing, where a reference span (e.g. author) is converted to a series of token spans carrying the same label (author), as sketched below.
1 parent 3a8afa3 commit 115ada2
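
To make the new behaviour concrete, here is a minimal sketch (not the library's code) of what the parsing task does: a single reference-level span keeps its label and is expanded into one span per token it covers. The data shapes mirror the prodigy format used in the tests below.

# Illustration only: expand a reference-level span into per-token spans.
reference_span = {"token_start": 2, "token_end": 4, "label": "author"}

token_spans = [
    {"token_start": i, "token_end": i, "label": reference_span["label"]}
    for i in range(reference_span["token_start"], reference_span["token_end"] + 1)
]

print(token_spans)
# Three single-token spans, all labelled "author".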

File tree: 2 files changed, +101 -140 lines changed


deep_reference_parser/prodigy/reference_to_token_annotations.py

Lines changed: 70 additions & 33 deletions
@@ -10,31 +10,52 @@


 class TokenTagger:
-    """
-    Converts data in prodigy format with full reference spans to per-token spans

-    Expects one of four lables for the spans:
+    def __init__(self, task="splitting"):
+        """
+        Converts data in prodigy format with full reference spans to per-token
+        spans

-    * BE: A complete reference
-    * BI: A frgament of reference that captures the beginning but not the end
-    * IE: A frgament of reference that captures the end but not the beginning
-    * II: A fragment of a reference that captures neither the beginning nor the
-    end .
-    """
+        Args:
+            task (str): One of ["parsing", "splitting"]. See below further
+                explanation.
+
+        Since the parsing, splitting, and classification tasks have quite
+        different labelling requirements, this class behaves differently
+        depending on which task is specified in the task argument.
+
+        For splitting:
+
+        Expects one of four labels for the spans:

-    def __init__(self):
+        * BE: A complete reference
+        * BI: A frgament of reference that captures the beginning but not the end
+        * IE: A frgament of reference that captures the end but not the beginning
+        * II: A fragment of a reference that captures neither the beginning nor the
+        end .
+
+        Depending on which label is applied the tokens within the span will be
+        labelled differently as one of ["b-r", "i-r", "e-r", "o"].
+
+        For parsing:
+
+        Expects any arbitrary label for spans. All tokens within that span will
+        be labelled with the same span.
+
+        """

         self.out = []
+        self.task = task

     def tag_doc(self, doc):
         """
-        Tags a document with the appropriate labels
+        Tags a document with appropriate labels for the parsing task

         Args:
             doc(dict): A single document in prodigy dict format to be labelled.
         """

-        bie_spans = self.reference_spans(doc["spans"], doc["tokens"])
+        bie_spans = self.reference_spans(doc["spans"], doc["tokens"], task=self.task)
         o_spans = self.outside_spans(bie_spans, doc["tokens"])

         # Flatten into one list.

@@ -63,37 +84,47 @@ def run(self, docs):

         return self.out

-    def reference_spans(self, spans, tokens):
+
+    def reference_spans(self, spans, tokens, task):
         """
         Given a whole reference span as labelled in prodigy, break this into
         appropriate single token spans depending on the label that was applied to
         the whole reference span.
         """
         split_spans = []

-        for span in spans:
-            if span["label"] in ["BE", "be"]:
+        if task == "splitting":

-                split_spans.extend(
-                    self.split_long_span(tokens, span, "b-r", "e-r")
-                )
+            for span in spans:
+                if span["label"] in ["BE", "be"]:

-            elif span["label"] in ["BI", "bi"]:
+                    split_spans.extend(
+                        self.split_long_span(tokens, span, "b-r", "e-r", "i-r")
+                    )

-                split_spans.extend(
-                    self.split_long_span(tokens, span, "b-r", "i-r")
-                )
+                elif span["label"] in ["BI", "bi"]:

-            elif span["label"] in ["IE", "ie"]:
+                    split_spans.extend(
+                        self.split_long_span(tokens, span, "b-r", "i-r", "i-r")
+                    )

-                split_spans.extend(
-                    self.split_long_span(tokens, span, "i-r", "e-r")
-                )
+                elif span["label"] in ["IE", "ie"]:
+
+                    split_spans.extend(
+                        self.split_long_span(tokens, span, "i-r", "e-r", "i-r")
+                    )

-            elif span["label"] in ["II", "ii"]:
+                elif span["label"] in ["II", "ii"]:

+                    split_spans.extend(
+                        self.split_long_span(tokens, span, "i-r", "i-r", "i-r")
+                    )
+
+        elif task == "parsing":
+
+            for span in spans:
                 split_spans.extend(
-                    self.split_long_span(tokens, span, "i-r", "i-r")
+                    self.split_long_span(tokens, span, span["label"], span["label"], span["label"])
                 )

         return split_spans

@@ -146,17 +177,17 @@ def create_span(self, tokens, index, label):
         return span


-    def split_long_span(self, tokens, span, start_label, end_label):
+    def split_long_span(self, tokens, span, start_label, end_label, inside_label):
         """
-        Split a milti-token span into `n` spans of lengh `1`, where `n=len(tokens)`
+        Split a multi-token span into `n` spans of lengh `1`, where `n=len(tokens)`
         """

         spans = []
         spans.append(self.create_span(tokens, span["token_start"], start_label))
         spans.append(self.create_span(tokens, span["token_end"], end_label))

         for index in range(span["token_start"] + 1, span["token_end"]):
-            spans.append(self.create_span(tokens, index, "i-r"))
+            spans.append(self.create_span(tokens, index, inside_label))

         spans = sorted(spans, key=lambda k: k['token_start'])


@@ -174,10 +205,16 @@ def split_long_span(self, tokens, span, start_label, end_label):
         "positional",
         None,
         str
+    ),
+    task=(
+        "Which task is being performed. Either splitting or parsing.",
+        "positional",
+        None,
+        str
     )
 )

-def reference_to_token_annotations(input_file, output_file):
+def reference_to_token_annotations(input_file, output_file, task="splitting"):
     """ Converts a file output by prodigy (using prodigy db-out) from
     references level annotations to individual level annotations. The rationale
     for this is that reference level annotations are much easier for humans to

@@ -195,7 +232,7 @@ def reference_to_token_annotations(input_file, output_file):

     logger.info("Loaded %s documents with reference annotations", len(partially_annotated))

-    annotator = TokenTagger(partially_annotated)
+    annotator = TokenTagger(partially_annotated, task=task)

     fully_annotated = annotator.run()
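
Based on the signatures introduced above, and assuming the module is importable under the path shown in the file tree, a minimal usage sketch of the new task argument; the example data is illustrative and follows the token/span shapes used in the tests.

from deep_reference_parser.prodigy.reference_to_token_annotations import TokenTagger

# Illustrative prodigy-style tokens and a single reference-level span.
tokens = [{"start": i, "end": i, "id": i} for i in range(7)]
spans = [{"start": 2, "end": 4, "token_start": 2, "token_end": 4, "label": "author"}]

# With task="parsing", every token inside the span keeps the span's own label.
tagger = TokenTagger(task="parsing")
token_spans = tagger.reference_spans(spans, tokens, task="parsing")
# -> three single-token spans, all labelled "author"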

tests/prodigy/test_reference_to_token_annotations.py

Lines changed: 31 additions & 107 deletions
@@ -42,108 +42,6 @@ def test_TokenTagger(tagger):
     assert out == tagged[0]["spans"]


-#def test_real_case():
-#    """
-#    Test real case observed where no `b-r` or `e-r` is present, the first and
-#    last `i-r` tokens are being replicated as `o` tokens when no bounding
-#    `b-r` or `e-r` tokens are present.
-#    """
-#
-#    doc = {
-#        "text": "d\n 2010, Actual",
-#        "spans":[
-#            {
-#                "start": 3,
-#                "end": 7,
-#                "token_start": 2,
-#                "token_end": 2,
-#                "label": "i-r"
-#            },
-#            {
-#                "start": 9,
-#                "end": 15,
-#                "token_start": 4,
-#                "token_end": 4,
-#                "label": "i-r"
-#            }
-#        ],
-#        "tokens":[
-#            {
-#                "text": "d",
-#                "start": 0,
-#                "end": 1,
-#                "id": 0
-#            },
-#            {
-#                "text": "\n ",
-#                "start": 1,
-#                "end": 3,
-#                "id": 1
-#            },
-#            {
-#                "text": "2010",
-#                "start": 3,
-#                "end": 7,
-#                "id": 2
-#            },
-#            {
-#                "text": ",",
-#                "start": 7,
-#                "end": 8,
-#                "id": 3
-#            },
-#            {
-#                "text": "Actual",
-#                "start": 9,
-#                "end": 15,
-#                "id": 4
-#            }
-#        ]}
-#
-#    after_spans = [
-#        {
-#            "start": 0,
-#            "end": 1,
-#            "token_start": 0,
-#            "token_end": 0,
-#            "label": "o"
-#        },
-#        {
-#            "start": 1,
-#            "end": 3,
-#            "token_start": 1,
-#            "token_end": 1,
-#            "label": "o"
-#        },
-#        {
-#            "start": 3,
-#            "end": 7,
-#            "token_start": 2,
-#            "token_end": 2,
-#            "label": "i-r"
-#        },
-#        {
-#            "start": 7,
-#            "end": 8,
-#            "token_start": 3,
-#            "token_end": 3,
-#            "label": "i-r"
-#        },
-#        {
-#            "start": 9,
-#            "end": 15,
-#            "token_start": 4,
-#            "token_end": 4,
-#            "label": "i-r"
-#        }
-#    ]
-#
-#
-#    tagger = TokenTagger([doc])
-#    tagged = tagger.run()
-#
-#    assert after_spans == tagged[0]["spans"]
-
 def test_create_span(tagger):

     tokens = [

@@ -179,7 +77,7 @@ def test_split_long_span(tagger):
         {'start': 4, 'end': 4, 'token_start': 4, 'token_end': 4, 'label': 'e-r'},
     ]

-    out = tagger.split_long_span(tokens, span, start_label="b-r", end_label="e-r")
+    out = tagger.split_long_span(tokens, span, start_label="b-r", end_label="e-r", inside_label="i-r")

     assert out == after


@@ -206,7 +104,7 @@ def test_reference_spans_be(tagger):
         {'start': 4, 'end': 4, 'token_start': 4, 'token_end': 4, 'label': 'e-r'},
     ]

-    out = tagger.reference_spans(spans, tokens)
+    out = tagger.reference_spans(spans, tokens, task="splitting")

     assert out == after


@@ -232,7 +130,7 @@ def test_reference_spans_bi(tagger):
         {'start': 4, 'end': 4, 'token_start': 4, 'token_end': 4, 'label': 'i-r'},
     ]

-    out = tagger.reference_spans(spans, tokens)
+    out = tagger.reference_spans(spans, tokens, task="splitting")

     assert out == after


@@ -258,7 +156,7 @@ def test_reference_spans_ie(tagger):
         {'start': 4, 'end': 4, 'token_start': 4, 'token_end': 4, 'label': 'e-r'},
     ]

-    out = tagger.reference_spans(spans, tokens)
+    out = tagger.reference_spans(spans, tokens, task="splitting")

     assert out == after


@@ -284,7 +182,33 @@ def test_reference_spans_ii(tagger):
         {'start': 4, 'end': 4, 'token_start': 4, 'token_end': 4, 'label': 'i-r'},
     ]

-    out = tagger.reference_spans(spans, tokens)
+    out = tagger.reference_spans(spans, tokens, task="splitting")
+
+    assert out == after
+
+def test_reference_spans_author(tagger):
+
+    tokens = [
+        {'start': 0, 'end': 0, 'id': 0},
+        {'start': 1, 'end': 1, 'id': 1},
+        {'start': 2, 'end': 2, 'id': 2},
+        {'start': 3, 'end': 3, 'id': 3},
+        {'start': 4, 'end': 4, 'id': 4},
+        {'start': 5, 'end': 5, 'id': 5},
+        {'start': 6, 'end': 6, 'id': 6},
+    ]
+
+    spans = [
+        {'start': 2, 'end': 4, 'token_start': 2, 'token_end': 4, 'label': 'author'}
+    ]
+
+    after = [
+        {'start': 2, 'end': 2, 'token_start': 2, 'token_end': 2, 'label': 'author'},
+        {'start': 3, 'end': 3, 'token_start': 3, 'token_end': 3, 'label': 'author'},
+        {'start': 4, 'end': 4, 'token_start': 4, 'token_end': 4, 'label': 'author'},
+    ]
+
+    out = tagger.reference_spans(spans, tokens, task="parsing")

     assert out == after

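For contrast with the new parsing test above, a short sketch of the pre-existing splitting behaviour through the same helper; the data follows the shapes in test_reference_spans_be, and the expected labels follow the docstring added in the first file.

from deep_reference_parser.prodigy.reference_to_token_annotations import TokenTagger

# Illustrative prodigy-style tokens and one complete reference span (BE).
tokens = [{"start": i, "end": i, "id": i} for i in range(5)]
spans = [{"start": 0, "end": 4, "token_start": 0, "token_end": 4, "label": "BE"}]

# With task="splitting", a complete reference is broken into
# b-r / i-r / i-r / i-r / e-r single-token spans.
tagger = TokenTagger(task="splitting")
token_spans = tagger.reference_spans(spans, tokens, task="splitting")
# -> labels: b-r, i-r, i-r, i-r, e-r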