|
#!/usr/bin/env python3
# coding: utf-8
| 3 | + |
| 4 | +import re |
| 5 | + |
| 6 | +import plac |
| 7 | + |
| 8 | +from ..io import read_jsonl, write_jsonl |
| 9 | +from ..logger import logger |
| 10 | + |
REGEX = r"\n{1,2}(?:(?:\s)|(?:\(|\[))?(?:\d{1,2})(?:(?:\.\)|\.\]|\]\n|\.|\s)|(?:\]|\)))(\s+)?(?:\n)?(?:\s+)?(?!Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"


class NumberedReferenceAnnotator:
    """
    Takes reference sections with numeric labelling scraped by Reach in prodigy
    format, and labels the references as spans by splitting them using regex.

    Note that you must identify numbered reference section first. This can be
    done with a simple textcat model trained in prodigy.
    """

    def __init__(self):
        # Placeholder pattern; replaced by whatever `run` receives.
        self.regex = r""

    def run(self, docs, regex=REGEX):
        """Label references in each doc, yielding the annotated docs lazily.

        Args:
            docs: Iterable of prodigy docs with "text" and "tokens" keys.
            regex: Pattern used to locate the numbering markers between
                references.
        """
        self.regex = regex

        for doc in docs:
            doc["spans"] = self.label_numbered_references(doc["text"], doc["tokens"])
            yield doc

    def label_numbered_references(self, text, tokens):
        """Split `text` on numbering markers and return one span dict per
        reference found between two consecutive markers.
        """
        markers = list(re.finditer(self.regex, text))
        spans = []

        # Each reference lies between two consecutive numbering markers, so
        # walk the markers pairwise.
        for left, right in zip(markers, markers[1:]):

            # Approximate character offsets straight from the regex matches.
            char_start = left.end()
            char_end = right.start()

            # Snap those raw offsets to the nearest token boundaries.
            token_start = self._find_closest_token(tokens, char_start, "start")
            token_end = self._find_closest_token(tokens, char_end, "end")

            # Re-derive the character offsets from the chosen tokens so the
            # two kinds of offset can never disagree.
            char_start = self._get_token_offset(tokens, token_start, "start")
            char_end = self._get_token_offset(tokens, token_end, "end")

            spans.append({
                "start": char_start,
                "end": char_end,
                "token_start": token_start,
                "token_end": token_end,
                "label": "BE"
            })

        return spans

    def _find_closest_token(self, tokens, char_offset, pos_string):
        """
        Find the token start/end closest to `char_offset`.

        Args:
            tokens: A list of token dicts from a prodigy document.
            char_offset(int): A character offset relating to either the start
                or the end of a token.
            pos_string(str): One of ["start", "end"] denoting whether
                `char_offset` is a start or the end of a token.
        """
        offset_to_id = self._token_start_mapper(tokens, pos_string)
        best_offset = self._find_closest_number(offset_to_id.keys(), char_offset)

        return offset_to_id[best_offset]

    def _get_token_offset(self, tokens, token_id, pos_string):
        """
        Return the character offset for the token with id == token_id.
        """
        return next(
            (tok[pos_string] for tok in tokens if tok["id"] == token_id),
            None
        )

    def _find_closest_number(self, numbers, number):
        """Return the member of `numbers` nearest to `number`."""
        return min(numbers, key=lambda candidate: abs(candidate - number))

    def _token_start_mapper(self, tokens, pos_string):
        """Map each token's start/end character offset to its token id."""
        return {tok[pos_string]: tok["id"] for tok in tokens}
| 117 | + |
| 118 | + |
@plac.annotations(
    input_file=(
        "Path to jsonl file containing numbered reference sections as docs.",
        "positional",
        None,
        str
    ),
    output_file=(
        "Path to output jsonl file containing prodigy docs with numbered references labelled.",
        "positional",
        None,
        str
    )
)
def annotate_numbered_references(input_file, output_file):
    """
    Takes reference sections with numeric labelling scraped by Reach in prodigy
    format, and labels the references as spans by splitting them using regex.

    Args:
        input_file: Path to a jsonl file of prodigy docs containing numbered
            reference sections.
        output_file: Path the annotated prodigy docs are written to.
    """

    # Materialise the docs up-front: read_jsonl may return a lazy iterable,
    # which would break the len() call below.
    numbered_reference_sections = list(read_jsonl(input_file))

    logger.info("Loaded %s prodigy docs", len(numbered_reference_sections))

    nra = NumberedReferenceAnnotator()

    # BUG FIX: `run` is a generator method, not a mapping — it must be
    # called (was `nra.run[...]`, a TypeError) and its output materialised.
    docs = list(nra.run(numbered_reference_sections))

    # BUG FIX: the annotated docs were never passed to write_jsonl, so the
    # output file ended up without the annotations.
    write_jsonl(output_file, docs)

    logger.info("Wrote %s annotated references to %s", len(docs),
        output_file)
0 commit comments