 import json
 import os
 import pickle
+import pandas as pd

 import spacy

 from .logger import logger


-def load_data(filepath):
+def split_list_by_linebreaks(tokens):
+    """Cycle through a list of tokens (or labels) and split it into sublists
+    wherever a None, or more likely an np.nan introduced by converting
+    pd.DataFrame columns to lists, marks a sentence boundary.
     """
-    Load and return the data stored in the given path.
-
-    Adapted from: https://github.com/dhlab-epfl/LinkedBooksDeepReferenceParsing
-
-    The data is structured as follows:
-    * Each line contains four columns separated by a single space.
-    * Each word has been put on a separate line and there is an empty line
-      after each sentence.
-    * The first item on each line is a word, the second, third and fourth are
-      tags related to the word.
-
-    Example:
-
-    The sentence "L. Antonielli, I prefetti dell' Italia napoleonica, Bologna
-    1983." is represented in the dataset as:
-
-    ```
-    L author b-secondary b-r
-    . author i-secondary i-r
-    Antonielli author i-secondary i-r
-    , author i-secondary i-r
-    I title i-secondary i-r
-    prefetti title i-secondary i-r
-    dell title i-secondary i-r
-    ’ title i-secondary i-r
-    Italia title i-secondary i-r
-    napoleonica title i-secondary i-r
-    , title i-secondary i-r
-    Bologna publicationplace i-secondary i-r
-    1983 year e-secondary i-r
-    . year e-secondary e-r
-    ```
-
-    Args:
-        filepath (str): Path to the data.
-
-    Returns:
-        four lists: The first contains tokens, the next three contain
-        corresponding labels.
-
-    """
-
-    # Arrays to return
-    words = []
-    tags_1 = []
-    tags_2 = []
-    tags_3 = []
-
-    word = tags1 = tags2 = tags3 = []
-    with open(filepath, "r") as file:
-        for line in file:
-            # Do not take the first line into consideration
-            if "DOCSTART" not in line:
-                # Check if empty line
-                if line in ["\n", "\r\n"]:
-                    # Append line
-                    words.append(word)
-                    tags_1.append(tags1)
-                    tags_2.append(tags2)
-                    tags_3.append(tags3)
-
-                    # Reset
-                    word = []
-                    tags1 = []
-                    tags2 = []
-                    tags3 = []
-
-                else:
-                    # Split the line into words, tag #1
-                    w = line[:-1].split(" ")
-
-                    word.append(w[0])
-                    tags1.append(w[1])
-                    tags2.append(w[2])
-                    tags3.append(w[3])
-
-    logger.info("Loaded %s training examples", len(words))
-
-    return words, tags_1, tags_2, tags_3
-
+    out = []
+    tokens_gen = iter(tokens)
+    while True:
+        try:
+            token = next(tokens_gen)
+            if isinstance(token, str) and token:
+                # A real token: accumulate it into the current sentence.
+                out.append(token)
+            else:
+                # None/nan marks a sentence boundary: emit and reset.
+                yield out
+                out = []
+        except StopIteration:
+            # End of input: emit any tokens collected since the last boundary.
+            if out:
+                yield out
+            break

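As a rough sketch of how the new generator behaves (the sample tokens below are invented for illustration, not taken from the repo's data):

```
import math

tokens = ["WHO", "treatment", "guidelines", math.nan, "1", "."]

# Any non-string entry (here math.nan, standing in for the nans produced
# when a pd.DataFrame column is converted to a list) ends the current
# sentence and starts a new one.
sentences = list(split_list_by_linebreaks(tokens))
# -> [["WHO", "treatment", "guidelines"], ["1", "."]]
```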
 def load_tsv(filepath, split_char="\t"):
     """
     Load and return the data stored in the given path.

-    Adapted from: https://github.com/dhlab-epfl/LinkedBooksDeepReferenceParsing
+    Expects data in the following tab-separated format:
+
+    References     o       o
+                   o       o
+    1              o       o
+    .              o       o
+                   o       o
+    WHO            title   b-r
+    treatment      title   i-r
+    guidelines     title   i-r
+    for            title   i-r
+    drug           title   i-r
+    -              title   i-r
+    resistant      title   i-r
+    tuberculosis   title   i-r
+    ,              title   i-r
+    2016           title   i-r

-    NOTE: In the current implementation in deep_reference_parser, only one set
-    of tags is used. The others will be used in a later PR.

-    The data is structured as follows:
-    * Each line contains four columns separated by a single space.
-    * Each word has been put on a separate line and there is an empty line
-      after each sentence.
-    * The first item on each line is a word, the second, third and fourth are
-      tags related to the word.

     Args:
         filepath (str): Path to the data.
         split_char (str): Character to be used to split each line of the
             document.

     Returns:
-        two lists: The first contains tokens, the second contains
-        corresponding labels.
+        tuple: One list per column in filepath: the first contains tokens,
+        and each remaining list contains the labels from one label column.

     """

-    # Arrays to return
-    words = []
-    tags_1 = []
-
-    word = []
-    tags1 = []
-
-    with open(filepath, "r") as file:
-        for line in file:
-            # Check if empty line
-            if line in ["\n", "\r\n", "\t\n"]:
-                # Append line
-                words.append(word)
-                tags_1.append(tags1)
-
-                # Reset
-                word = []
-                tags1 = []
-
-            else:
-                # Split the line into words, tag #1
-                w = line[:-1].split(split_char)
-                word.append(w[0])
-
-                # If tags are passed (for training), then also add
-                if len(w) == 2:
-                    tags1.append(w[1])
+    # Let pandas handle the parsing; with skip_blank_lines=False the blank
+    # lines separating examples come through as rows of nan, which
+    # split_list_by_linebreaks treats as sentence boundaries.
+    df = pd.read_csv(
+        filepath,
+        delimiter=split_char,
+        header=None,
+        skip_blank_lines=False,
+    )
+    out = [list(split_list_by_linebreaks(column)) for _, column in df.items()]

-    logger.info("Loaded %s training examples", len(words))
+    logger.info("Loaded %s training examples", len(out[0]))

-    return words, tags_1
+    return tuple(out)


 def prodigy_to_conll(docs):
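A quick usage sketch of the rewritten loader (the file path, column count, and variable names here are hypothetical; the function returns one list per column found in the file):

```
# Hypothetical three-column file (tokens plus two label columns) in the
# tab-separated format shown in the load_tsv docstring.
tokens, labels_1, labels_2 = load_tsv("data/train.tsv")

print(len(tokens))    # number of loaded examples
print(tokens[0][:5])  # first few tokens of the first example
```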