1010
1111
1212class TokenTagger :
13-
1413 def __init__ (self , task = "splitting" , lowercase = True ):
1514 """
1615 Converts data in prodigy format with full reference spans to per-token
@@ -67,7 +66,7 @@ def tag_doc(self, doc):
6766
6867 # Sort by token id to ensure it is ordered.
6968
70- spans = sorted (spans , key = lambda k : k [' token_start' ])
69+ spans = sorted (spans , key = lambda k : k [" token_start" ])
7170
7271 doc ["spans" ] = spans
7372
@@ -87,7 +86,6 @@ def run(self, docs):
8786
8887 return self .out
8988
90-
9189 def reference_spans (self , spans , tokens , task ):
9290 """
9391 Given a whole reference span as labelled in prodigy, break this into
@@ -134,7 +132,6 @@ def reference_spans(self, spans, tokens, task):
134132
135133 return split_spans
136134
137-
138135 def outside_spans (self , spans , tokens ):
139136 """
140137 Label tokens with `o` if they are outside a reference
@@ -161,7 +158,6 @@ def outside_spans(self, spans, tokens):
161158
162159 return outside_spans
163160
164-
165161 def create_span (self , tokens , index , label ):
166162 """
167163 Given a list of tokens, (in prodigy format) and an index relating to one of
@@ -181,7 +177,6 @@ def create_span(self, tokens, index, label):
181177
182178 return span
183179
184-
185180 def split_long_span (self , tokens , span , start_label , end_label , inside_label ):
186181 """
187182 Split a multi-token span into `n` spans of length `1`, where `n=len(tokens)`
@@ -192,40 +187,42 @@ def split_long_span(self, tokens, span, start_label, end_label, inside_label):
192187 spans .append (self .create_span (tokens , span ["token_end" ], end_label ))
193188
194189 for index in range (span ["token_start" ] + 1 , span ["token_end" ]):
195- spans .append (self .create_span (tokens , index , inside_label ))
190+ spans .append (self .create_span (tokens , index , inside_label ))
196191
197- spans = sorted (spans , key = lambda k : k [' token_start' ])
192+ spans = sorted (spans , key = lambda k : k [" token_start" ])
198193
199194 return spans
200195
196+
201197@plac .annotations (
202198 input_file = (
203199 "Path to jsonl file containing chunks of references in prodigy format." ,
204200 "positional" ,
205201 None ,
206- str
202+ str ,
207203 ),
208204 output_file = (
209205 "Path to jsonl file into which fully annotate files will be saved." ,
210206 "positional" ,
211207 None ,
212- str
208+ str ,
213209 ),
214210 task = (
215211 "Which task is being performed. Either splitting or parsing." ,
216212 "positional" ,
217213 None ,
218- str
214+ str ,
219215 ),
220216 lowercase = (
221217 "Convert UPPER case reference labels to lower case token labels?" ,
222218 "flag" ,
223219 "f" ,
224- bool
225- )
220+ bool ,
221+ ),
226222)
227-
228- def reference_to_token_annotations (input_file , output_file , task = "splitting" , lowercase = False ):
223+ def reference_to_token_annotations (
224+ input_file , output_file , task = "splitting" , lowercase = False
225+ ):
229226 """
230227 Creates a span for every token from existing multi-token spans
231228
@@ -262,8 +259,12 @@ def reference_to_token_annotations(input_file, output_file, task="splitting", lo
262259 not_annotated_docs = [doc for doc in ref_annotated_docs if not doc .get ("spans" )]
263260 ref_annotated_docs = [doc for doc in ref_annotated_docs if doc .get ("spans" )]
264261
265- logger .info ("Loaded %s documents with reference annotations" , len (ref_annotated_docs ))
266- logger .info ("Loaded %s documents with no reference annotations" , len (not_annotated_docs ))
262+ logger .info (
263+ "Loaded %s documents with reference annotations" , len (ref_annotated_docs )
264+ )
265+ logger .info (
266+ "Loaded %s documents with no reference annotations" , len (not_annotated_docs )
267+ )
267268
268269 annotator = TokenTagger (task = task , lowercase = lowercase )
269270
@@ -272,7 +273,11 @@ def reference_to_token_annotations(input_file, output_file, task="splitting", lo
272273
273274 write_jsonl (all_docs , output_file = output_file )
274275
275- logger .info ("Wrote %s docs with token annotations to %s" ,
276- len (token_annotated_docs ), output_file )
277- logger .info ("Wrote %s docs with no annotations to %s" ,
278- len (not_annotated_docs ), output_file )
276+ logger .info (
277+ "Wrote %s docs with token annotations to %s" ,
278+ len (token_annotated_docs ),
279+ output_file ,
280+ )
281+ logger .info (
282+ "Wrote %s docs with no annotations to %s" , len (not_annotated_docs ), output_file
283+ )
0 commit comments