Commit 1c7f7bc

chg: 💄 linting
1 parent: 9052fcc

1 file changed: 26 additions, 21 deletions

deep_reference_parser/prodigy/reference_to_token_annotations.py
@@ -10,7 +10,6 @@
 
 
 class TokenTagger:
-
     def __init__(self, task="splitting", lowercase=True):
         """
         Converts data in prodigy format with full reference spans to per-token
@@ -67,7 +66,7 @@ def tag_doc(self, doc):
 
         # Sort by token id to ensure it is ordered.
 
-        spans = sorted(spans, key=lambda k: k['token_start'])
+        spans = sorted(spans, key=lambda k: k["token_start"])
 
         doc["spans"] = spans
 
@@ -87,7 +86,6 @@ def run(self, docs):
 
         return self.out
 
-
     def reference_spans(self, spans, tokens, task):
         """
         Given a whole reference span as labelled in prodigy, break this into
@@ -134,7 +132,6 @@ def reference_spans(self, spans, tokens, task):
 
         return split_spans
 
-
     def outside_spans(self, spans, tokens):
         """
         Label tokens with `o` if they are outside a reference
@@ -161,7 +158,6 @@ def outside_spans(self, spans, tokens):
 
         return outside_spans
 
-
    def create_span(self, tokens, index, label):
         """
         Given a list of tokens, (in prodigy format) and an index relating to one of
@@ -181,7 +177,6 @@ def create_span(self, tokens, index, label):
 
         return span
 
-
     def split_long_span(self, tokens, span, start_label, end_label, inside_label):
         """
         Split a multi-token span into `n` spans of lengh `1`, where `n=len(tokens)`
@@ -192,40 +187,42 @@ def split_long_span(self, tokens, span, start_label, end_label, inside_label):
         spans.append(self.create_span(tokens, span["token_end"], end_label))
 
         for index in range(span["token_start"] + 1, span["token_end"]):
-            spans.append(self.create_span(tokens, index, inside_label))
+            spans.append(self.create_span(tokens, index, inside_label))
 
-        spans = sorted(spans, key=lambda k: k['token_start'])
+        spans = sorted(spans, key=lambda k: k["token_start"])
 
         return spans
 
+
 @plac.annotations(
     input_file=(
         "Path to jsonl file containing chunks of references in prodigy format.",
         "positional",
         None,
-        str
+        str,
     ),
     output_file=(
         "Path to jsonl file into which fully annotate files will be saved.",
         "positional",
         None,
-        str
+        str,
     ),
     task=(
         "Which task is being performed. Either splitting or parsing.",
         "positional",
         None,
-        str
+        str,
    ),
    lowercase=(
        "Convert UPPER case reference labels to lower case token labels?",
        "flag",
        "f",
-        bool
-    )
+        bool,
+    ),
 )
-
-def reference_to_token_annotations(input_file, output_file, task="splitting", lowercase=False):
+def reference_to_token_annotations(
+    input_file, output_file, task="splitting", lowercase=False
+):
     """
     Creates a span for every token from existing multi-token spans
 
@@ -262,8 +259,12 @@ def reference_to_token_annotations(input_file, output_file, task="splitting", lo
     not_annotated_docs = [doc for doc in ref_annotated_docs if not doc.get("spans")]
     ref_annotated_docs = [doc for doc in ref_annotated_docs if doc.get("spans")]
 
-    logger.info("Loaded %s documents with reference annotations", len(ref_annotated_docs))
-    logger.info("Loaded %s documents with no reference annotations", len(not_annotated_docs))
+    logger.info(
+        "Loaded %s documents with reference annotations", len(ref_annotated_docs)
+    )
+    logger.info(
+        "Loaded %s documents with no reference annotations", len(not_annotated_docs)
+    )
 
     annotator = TokenTagger(task=task, lowercase=lowercase)
 
@@ -272,7 +273,11 @@ def reference_to_token_annotations(input_file, output_file, task="splitting", lo
 
     write_jsonl(all_docs, output_file=output_file)
 
-    logger.info("Wrote %s docs with token annotations to %s",
-                len(token_annotated_docs), output_file)
-    logger.info("Wrote %s docs with no annotations to %s",
-                len(not_annotated_docs), output_file)
+    logger.info(
+        "Wrote %s docs with token annotations to %s",
+        len(token_annotated_docs),
+        output_file,
+    )
+    logger.info(
+        "Wrote %s docs with no annotations to %s", len(not_annotated_docs), output_file
+    )
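
Note: this commit only reformats; the span-splitting logic is unchanged. For context, below is a minimal standalone sketch of what `split_long_span` and `create_span` do, lifted out of the `TokenTagger` class. The example tokens and the b-r/i-r/e-r labels are hypothetical, assumed from a common begin/inside/end tagging scheme; the diff itself does not confirm the label names.

def create_span(tokens, index, label):
    # Build a single-token span in prodigy's format from the token at `index`.
    token = tokens[index]
    return {
        "start": token["start"],
        "end": token["end"],
        "token_start": index,
        "token_end": index,
        "label": label,
    }

def split_long_span(tokens, span, start_label, end_label, inside_label):
    # One span per token: start_label on the first token, end_label on the
    # last, inside_label on everything in between, sorted by token id.
    spans = [
        create_span(tokens, span["token_start"], start_label),
        create_span(tokens, span["token_end"], end_label),
    ]
    for index in range(span["token_start"] + 1, span["token_end"]):
        spans.append(create_span(tokens, index, inside_label))
    return sorted(spans, key=lambda k: k["token_start"])

# Hypothetical three-token reference span covering tokens 0..2:
tokens = [
    {"text": "Smith", "start": 0, "end": 5},
    {"text": "J.", "start": 6, "end": 8},
    {"text": "2019.", "start": 9, "end": 14},
]
for s in split_long_span(tokens, {"token_start": 0, "token_end": 2}, "b-r", "e-r", "i-r"):
    print(s["token_start"], s["label"])  # 0 b-r, 1 i-r, 2 e-r

The `@plac.annotations` block wires `reference_to_token_annotations` up as a small CLI: `input_file`, `output_file`, and `task` are positional arguments, and `lowercase` is an on/off flag (abbreviated `-f`), per plac's annotation tuples.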
