Skip to content

Commit 9052fcc

Browse files
chg: Convert UPPER case ref labels to lowercase token labels
1 parent 70f9e59 commit 9052fcc

File tree

1 file changed

+15
-4
lines changed

1 file changed

+15
-4
lines changed

deep_reference_parser/prodigy/reference_to_token_annotations.py

Lines changed: 15 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -11,14 +11,16 @@
1111

1212
class TokenTagger:
1313

14-
def __init__(self, task="splitting"):
14+
def __init__(self, task="splitting", lowercase=True):
1515
"""
1616
Converts data in prodigy format with full reference spans to per-token
1717
spans
1818
1919
Args:
2020
task (str): One of ["parsing", "splitting"]. See below further
2121
explanation.
22+
lowercase (bool): Automatically convert upper case annotations to
23+
lowercase under the parsing scenario.
2224
2325
Since the parsing, splitting, and classification tasks have quite
2426
different labelling requirements, this class behaves differently
@@ -46,6 +48,7 @@ def __init__(self, task="splitting"):
4648

4749
self.out = []
4850
self.task = task
51+
self.lowercase = lowercase
4952

5053
def tag_doc(self, doc):
5154
"""
@@ -123,8 +126,10 @@ def reference_spans(self, spans, tokens, task):
123126
elif task == "parsing":
124127

125128
for span in spans:
129+
if self.lowercase:
130+
label = span["label"].lower()
126131
split_spans.extend(
127-
self.split_long_span(tokens, span, span["label"], span["label"], span["label"])
132+
self.split_long_span(tokens, span, label, label, label)
128133
)
129134

130135
return split_spans
@@ -211,10 +216,16 @@ def split_long_span(self, tokens, span, start_label, end_label, inside_label):
211216
"positional",
212217
None,
213218
str
219+
),
220+
lowercase=(
221+
"Convert UPPER case reference labels to lower case token labels?",
222+
"flag",
223+
"f",
224+
bool
214225
)
215226
)
216227

217-
def reference_to_token_annotations(input_file, output_file, task="splitting"):
228+
def reference_to_token_annotations(input_file, output_file, task="splitting", lowercase=False):
218229
"""
219230
Creates a span for every token from existing multi-token spans
220231
@@ -254,7 +265,7 @@ def reference_to_token_annotations(input_file, output_file, task="splitting"):
254265
logger.info("Loaded %s documents with reference annotations", len(ref_annotated_docs))
255266
logger.info("Loaded %s documents with no reference annotations", len(not_annotated_docs))
256267

257-
annotator = TokenTagger(task)
268+
annotator = TokenTagger(task=task, lowercase=lowercase)
258269

259270
token_annotated_docs = annotator.run(ref_annotated_docs)
260271
all_docs = token_annotated_docs + token_annotated_docs

0 commit comments

Comments (0)