|
@@ -11,14 +11,16 @@
 
 class TokenTagger:
 
-    def __init__(self, task="splitting"):
+    def __init__(self, task="splitting", lowercase=True):
         """
         Converts data in prodigy format with full reference spans to per-token
         spans
 
         Args:
             task (str): One of ["parsing", "splitting"]. See below for further
                 explanation.
+            lowercase (bool): Automatically convert uppercase annotations to
+                lowercase under the parsing scenario.
 
         Since the parsing, splitting, and classification tasks have quite
         different labelling requirements, this class behaves differently
@@ -46,6 +48,7 @@ def __init__(self, task="splitting"):
 
         self.out = []
         self.task = task
+        self.lowercase = lowercase
 
     def tag_doc(self, doc):
         """
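For context, the shapes this class works over look roughly as follows; a minimal sketch assuming Prodigy's usual JSONL layout (`tokens` with ids, `spans` with `token_start`/`token_end` and a `label`). The field values and the `AUTHOR` label are illustrative assumptions, not taken from this repo.

# Hypothetical input document (Prodigy-style fields, for illustration only).
doc = {
    "text": "Smith J. 2019. A study of things.",
    "tokens": [
        {"text": "Smith", "id": 0},
        {"text": "J.", "id": 1},
        {"text": "2019.", "id": 2},
    ],
    "spans": [
        # One multi-token reference span covering tokens 0 and 1.
        {"token_start": 0, "token_end": 1, "label": "AUTHOR"},
    ],
}

# Under the "parsing" task with lowercase=True, each covered token would
# get its own single-token span carrying the lowercased label:
per_token_spans = [
    {"token_start": 0, "token_end": 0, "label": "author"},
    {"token_start": 1, "token_end": 1, "label": "author"},
]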
@@ -123,8 +126,12 @@ def reference_spans(self, spans, tokens, task):
         elif task == "parsing":
 
             for span in spans:
+                if self.lowercase:
+                    label = span["label"].lower()
+                else:
+                    label = span["label"]
                 split_spans.extend(
-                    self.split_long_span(tokens, span, span["label"], span["label"], span["label"])
+                    self.split_long_span(tokens, span, label, label, label)
                 )
 
         return split_spans
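Pulled out of the class for illustration, the normalisation in this hunk amounts to the helper below; a minimal sketch, with the name `normalise_label` my own invention:

def normalise_label(span, lowercase=True):
    """Return the span's label, lowercased when requested."""
    label = span["label"]
    return label.lower() if lowercase else label

assert normalise_label({"label": "AUTHOR"}) == "author"
assert normalise_label({"label": "AUTHOR"}, lowercase=False) == "AUTHOR"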
@@ -211,10 +218,16 @@ def split_long_span(self, tokens, span, start_label, end_label, inside_label):
         "positional",
         None,
         str
+    ),
+    lowercase=(
+        "Convert UPPER case reference labels to lower case token labels?",
+        "flag",
+        "f",
+        bool
     )
 )
 
-def reference_to_token_annotations(input_file, output_file, task="splitting"):
+def reference_to_token_annotations(input_file, output_file, task="splitting", lowercase=False):
     """
     Creates a span for every token from existing multi-token spans
 
@@ -254,7 +267,7 @@ def reference_to_token_annotations(input_file, output_file, task="splitting"):
     logger.info("Loaded %s documents with reference annotations", len(ref_annotated_docs))
     logger.info("Loaded %s documents with no reference annotations", len(not_annotated_docs))
 
-    annotator = TokenTagger(task)
+    annotator = TokenTagger(task=task, lowercase=lowercase)
 
     token_annotated_docs = annotator.run(ref_annotated_docs)
-    all_docs = token_annotated_docs + token_annotated_docs
+    all_docs = token_annotated_docs + not_annotated_docs
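Since `lowercase` is declared as a plac flag (abbreviation `f`), it surfaces as a boolean switch on the command line; the decorated function can also be called directly from Python. A hypothetical invocation, with placeholder paths:

# Placeholder paths; not real files from this repo.
reference_to_token_annotations(
    "annotations/references.jsonl",
    "annotations/tokens.jsonl",
    task="parsing",
    lowercase=True,
)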
|