-
Notifications
You must be signed in to change notification settings - Fork 1
/
eeqa.py
138 lines (113 loc) · 3.54 KB
/
eeqa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""Convert the FGCR dataset json to the format that the EEQA model accepts.
Note: ACE doesn't support classification, so we're only generating the cause and
effect labels here.
Example input:
```json
{
"tid": 2771,
"info": "If one or more of Ecolab's customers were to experience a disastrous outcome, the firm's reputation could suffer and it could lose multiple customers as a result.", # noqa
"extraInfo": null,
"labelData": [
{
"type": "cause",
"reason": [
[
3,
76
]
],
"result": [
[
78,
149
]
]
}
],
"id": <same as 'tid'>
}
```
Example output:
```
{
"sentence": <tokenised 'info'>,
"s_start": 0,
"ner": [], # not used by the model
"relation": [], # not used by the model
"event": [ # I'm not sure why there are two levels of lists
[
[
span_start,
span_end,
type
]
]
]
}
```
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
from nltk.tokenize import NLTKWordTokenizer
def convert_instance(instance: dict[str, Any]) -> dict[str, Any]:
    """Convert a FGCR-format instance into an EEQA-format instance.

    This ignores the relationship and only annotates the causes and effects by
    building a list of (start, end, label) triplets.

    The spans are at the token level, so we tokenise the text using the
    NLTKWordTokenizer. A character-level label span is mapped to the range of
    tokens that lie entirely inside it; spans that cover no whole token are
    dropped.

    The output is a dictionary with the following keys:
    - sentence: the tokenised text
    - s_start: always zero because we're considering a few sentences only, not a
      document
    - ner: unsupported, so always an empty list
    - relation: unsupported, so always an empty list
    - id: copied from the input's 'tid' field
    - event: a list of lists of events, where each event is a triplet of
      [start, end, label]. I'm not sure why this is a 3-level list instead of
      just 2 levels (i.e. list of events).
    """
    tokeniser = NLTKWordTokenizer()
    text = instance["info"]
    tokens = list(tokeniser.tokenize(text))
    # Character offsets (start, end) of each token, parallel to `tokens`.
    spans = list(tokeniser.span_tokenize(text))
    label_map = {"reason": "Cause", "result": "Effect"}

    out = {
        "sentence": tokens,
        "s_start": 0,
        "ner": [],
        "relation": [],
        "id": instance["tid"],
    }

    events = []
    for label_data in instance["labelData"]:
        for ev_type in ["reason", "result"]:
            for ev_start, ev_end in label_data[ev_type]:
                # Find the first and last token indices fully contained in the
                # character span [ev_start, ev_end].
                start, end = -1, -1
                for sindex, (t_start, t_end) in enumerate(spans):
                    if t_start >= ev_end:
                        # Spans are in text order, so no later token can match.
                        break
                    if ev_start <= t_start and t_end <= ev_end:
                        if start == -1:
                            start = sindex
                        end = sindex
                # `end` is set whenever `start` is, so one check suffices.
                if start != -1:
                    events.append([start, end, label_map[ev_type]])
    out["event"] = [events]
    return out
def convert_file(infile: Path, outfile: Path) -> None:
    """Convert a FGCR JSON dataset file to an EEQA JSON-lines file.

    Reads a JSON array of FGCR instances from `infile`, converts each with
    `convert_instance`, and writes one JSON object per line to `outfile`,
    creating parent directories as needed.
    """
    # JSON is UTF-8 by specification; be explicit instead of relying on the
    # platform default encoding (which differs e.g. on Windows).
    with infile.open(encoding="utf-8") as f:
        dataset = json.load(f)
    instances = [convert_instance(instance) for instance in dataset]

    outfile.parent.mkdir(exist_ok=True, parents=True)
    with outfile.open("w", encoding="utf-8") as f:
        # One JSON object per line (JSON-lines format).
        f.writelines(json.dumps(i) + "\n" for i in instances)
def main() -> None:
    """Convert every FGCR split (dev/test/train) to the EEQA format.

    Reads `data/raw/event_dataset_<split>.json` and writes the converted
    output to `data/eeqa/<split>.json`.
    """
    source_dir = Path("data/raw")
    target_dir = Path("data/eeqa")

    for split in ("dev", "test", "train"):
        convert_file(
            source_dir / f"event_dataset_{split}.json",
            target_dir / f"{split}.json",
        )


if __name__ == "__main__":
    main()