1010
1111
1212class TokenTagger :
13- """
14- Converts data in prodigy format with full reference spans to per-token spans
1513
16- Expects one of four lables for the spans:
14+ def __init__ (self , task = "splitting" ):
15+ """
16+ Converts data in prodigy format with full reference spans to per-token
17+ spans
1718
18- * BE: A complete reference
19- * BI: A frgament of reference that captures the beginning but not the end
20- * IE: A frgament of reference that captures the end but not the beginning
21- * II: A fragment of a reference that captures neither the beginning nor the
22- end .
23- """
19+ Args:
20+ task (str): One of ["parsing", "splitting"]. See below for further
21+ explanation.
22+
23+ Since the parsing, splitting, and classification tasks have quite
24+ different labelling requirements, this class behaves differently
25+ depending on which task is specified in the task argument.
26+
27+ For splitting:
28+
29+ Expects one of four labels for the spans:
2430
25- def __init__ (self ):
31+ * BE: A complete reference
32+ * BI: A fragment of a reference that captures the beginning but not the end
33+ * IE: A fragment of a reference that captures the end but not the beginning
34+ * II: A fragment of a reference that captures neither the beginning nor the
35+ end .
36+
37+ Depending on which label is applied the tokens within the span will be
38+ labelled differently as one of ["b-r", "i-r", "e-r", "o"].
39+
40+ For parsing:
41+
42+ Expects any arbitrary label for spans. All tokens within that span will
43+ be labelled with the span's own label.
44+
45+ """
2646
2747 self .out = []
48+ self .task = task
2849
2950 def tag_doc (self , doc ):
3051 """
31- Tags a document with the appropriate labels
52+ Tags a document with the appropriate labels for the configured task
3253
3354 Args:
3455 doc(dict): A single document in prodigy dict format to be labelled.
3556 """
3657
37- bie_spans = self .reference_spans (doc ["spans" ], doc ["tokens" ])
58+ bie_spans = self .reference_spans (doc ["spans" ], doc ["tokens" ], task = self . task )
3859 o_spans = self .outside_spans (bie_spans , doc ["tokens" ])
3960
4061 # Flatten into one list.
@@ -63,37 +84,47 @@ def run(self, docs):
6384
6485 return self .out
6586
66- def reference_spans (self , spans , tokens ):
87+
88+ def reference_spans (self , spans , tokens , task ):
6789 """
6890 Given a whole reference span as labelled in prodigy, break this into
6991 appropriate single token spans depending on the label that was applied to
7092 the whole reference span.
7193 """
7294 split_spans = []
7395
74- for span in spans :
75- if span ["label" ] in ["BE" , "be" ]:
96+ if task == "splitting" :
7697
77- split_spans .extend (
78- self .split_long_span (tokens , span , "b-r" , "e-r" )
79- )
98+ for span in spans :
99+ if span ["label" ] in ["BE" , "be" ]:
80100
81- elif span ["label" ] in ["BI" , "bi" ]:
101+ split_spans .extend (
102+ self .split_long_span (tokens , span , "b-r" , "e-r" , "i-r" )
103+ )
82104
83- split_spans .extend (
84- self .split_long_span (tokens , span , "b-r" , "i-r" )
85- )
105+ elif span ["label" ] in ["BI" , "bi" ]:
86106
87- elif span ["label" ] in ["IE" , "ie" ]:
107+ split_spans .extend (
108+ self .split_long_span (tokens , span , "b-r" , "i-r" , "i-r" )
109+ )
88110
89- split_spans .extend (
90- self .split_long_span (tokens , span , "i-r" , "e-r" )
91- )
111+ elif span ["label" ] in ["IE" , "ie" ]:
112+
113+ split_spans .extend (
114+ self .split_long_span (tokens , span , "i-r" , "e-r" , "i-r" )
115+ )
92116
93- elif span ["label" ] in ["II" , "ii" ]:
117+ elif span ["label" ] in ["II" , "ii" ]:
94118
119+ split_spans .extend (
120+ self .split_long_span (tokens , span , "i-r" , "i-r" , "i-r" )
121+ )
122+
123+ elif task == "parsing" :
124+
125+ for span in spans :
95126 split_spans .extend (
96- self .split_long_span (tokens , span , "i-r" , "i-r" )
127+ self .split_long_span (tokens , span , span [ "label" ], span [ "label" ], span [ "label" ] )
97128 )
98129
99130 return split_spans
@@ -146,17 +177,17 @@ def create_span(self, tokens, index, label):
146177 return span
147178
148179
149- def split_long_span (self , tokens , span , start_label , end_label ):
180+ def split_long_span (self , tokens , span , start_label , end_label , inside_label ):
150181 """
151- Split a milti -token span into `n` spans of lengh `1`, where `n=len(tokens)`
182+ Split a multi-token span into `n` spans of length `1`, where `n=len(tokens)`
152183 """
153184
154185 spans = []
155186 spans .append (self .create_span (tokens , span ["token_start" ], start_label ))
156187 spans .append (self .create_span (tokens , span ["token_end" ], end_label ))
157188
158189 for index in range (span ["token_start" ] + 1 , span ["token_end" ]):
159- spans .append (self .create_span (tokens , index , "i-r" ))
190+ spans .append (self .create_span (tokens , index , inside_label ))
160191
161192 spans = sorted (spans , key = lambda k : k ['token_start' ])
162193
@@ -174,10 +205,16 @@ def split_long_span(self, tokens, span, start_label, end_label):
174205 "positional" ,
175206 None ,
176207 str
208+ ),
209+ task = (
210+ "Which task is being performed. Either splitting or parsing." ,
211+ "positional" ,
212+ None ,
213+ str
177214 )
178215)
179216
180- def reference_to_token_annotations (input_file , output_file ):
217+ def reference_to_token_annotations (input_file , output_file , task = "splitting" ):
181218 """ Converts a file output by prodigy (using prodigy db-out) from
182219 reference-level annotations to individual token-level annotations. The rationale
183220 for this is that reference level annotations are much easier for humans to
@@ -195,7 +232,7 @@ def reference_to_token_annotations(input_file, output_file):
195232
196233 logger .info ("Loaded %s documents with reference annotations" , len (partially_annotated ))
197234
198- annotator = TokenTagger (partially_annotated )
235+ annotator = TokenTagger (partially_annotated , task = task )
199236
200237 fully_annotated = annotator .run ()
201238
0 commit comments