@@ -215,27 +215,53 @@ def split_long_span(self, tokens, span, start_label, end_label, inside_label):
215215)
216216
217217def reference_to_token_annotations (input_file , output_file , task = "splitting" ):
218- """ Converts a file output by prodigy (using prodigy db-out) from
219- references level annotations to individual level annotations. The rationale
220- for this is that reference level annotations are much easier for humans to
221- do, but not useful when training a token level model.
222-
223- This function is predominantly useful fot tagging reference spans, but may
224- also have a function with other references annotations.
218+ """
219+ Creates a span for every token from existing multi-token spans
220+
221+ Converts a jsonl file output by prodigy (using prodigy db-out) with spans
222+ extending over more than a single token to individual token level spans.
223+
224+ The rationale for this is that reference level annotations are much easier
225+ for humans to do, but not useful when training a token level model.
226+
227+ This command functions in two ways:
228+
229+ * task=splitting: For the splitting task where we are interested in
230+ labelling the beginning (b-r) and end (e-r) of references, reference
231+ spans are labelled with one of BI, BE, IE, II. These are then converted
232+ to token level spans b-r, i-r, e-r, and o using logic. Symbolically:
233+ * BE: [BE, BE, BE] becomes [b-r][i-r][e-r]
234+ * BI: [BI, BI, BI] becomes [b-r][i-r][i-r]
235+ * IE: [IE, IE, IE] becomes [i-r][i-r][e-r]
236+ * II: [II, II, II] becomes [i-r][i-r][i-r]
237+ * All other tokens become [o]
238+
239+ * task=parsing: For the parsing task, multi-token annotations are much
240+ simpler and would tend to be just 'author', or 'title'. These simple
241+ labels can be applied directly to the individual tokens contained within
242+ these multi-token spans; for each token in the multi-token span, a span
243+ is created with the same label. Symbolically:
244+ * [author author author] becomes [author][author][author]
225245 """
226246
227- partially_annotated = read_jsonl (input_file )
247+ ref_annotated_docs = read_jsonl (input_file )
228248
229249 # Only run the tagger on annotated examples.
230250
231- partially_annotated = [doc for doc in partially_annotated if doc .get ("spans" )]
251+ not_annotated_docs = [doc for doc in ref_annotated_docs if not doc .get ("spans" )]
252+ ref_annotated_docs = [doc for doc in ref_annotated_docs if doc .get ("spans" )]
232253
233- logger .info ("Loaded %s documents with reference annotations" , len (partially_annotated ))
254+ logger .info ("Loaded %s documents with reference annotations" , len (ref_annotated_docs ))
255+ logger .info ("Loaded %s documents with no reference annotations" , len (not_annotated_docs ))
234256
235- annotator = TokenTagger (partially_annotated , task = task )
257+ annotator = TokenTagger (task )
236258
237- fully_annotated = annotator .run ()
259+ token_annotated_docs = annotator .run (ref_annotated_docs )
260+ all_docs = token_annotated_docs + not_annotated_docs
238261
239- write_jsonl (fully_annotated , output_file = output_file )
262+ write_jsonl (all_docs , output_file = output_file )
240263
241- logger .info ("Fully annotated references written to %s" , output_file )
264+ logger .info ("Wrote %s docs with token annotations to %s" ,
265+ len (token_annotated_docs ), output_file )
266+ logger .info ("Wrote %s docs with no annotations to %s" ,
267+ len (not_annotated_docs ), output_file )
0 commit comments