-
Notifications
You must be signed in to change notification settings - Fork 1
/
reconstruct.py
121 lines (101 loc) · 3.6 KB
/
reconstruct.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""Convert the FGCR dataset json to the format that the GenQA model accepts. This is for
the reconstruction task.
See convert_instance.
"""
from __future__ import annotations
import argparse
from pathlib import Path
from typing import Any
from common import (
convert_file_qa,
extract_relation_span,
generate_answer_combined_tags,
hash_instance,
)
def convert_reconstruct(instance: dict[str, Any]) -> list[dict[str, str]]:
    """Turn one FGCR-format instance into reconstruction-format QA instances.

    For every labelled relation in ``instance["labelData"]``, build a tagged
    structured representation of its cause/effect spans (the ``context``) and
    pair it with the truncated-sentence span extracted from the original text
    (the ``answers``). One output dict is produced per relation.

    Each output dict has the keys ``context``, ``question``,
    ``question_type`` (the relation type), ``answers``, and ``id`` (a content
    hash — the raw dataset's own IDs contain duplicates).
    """
    text = instance["info"]
    tag_names = {"reason": "Cause", "result": "Effect"}
    label_entries = instance["labelData"]
    # Multiple relations in one sentence get an index prefix on the question
    # so the (context, question) pairs stay distinguishable.
    needs_index = len(label_entries) > 1

    converted: list[dict[str, str]] = []
    for idx, entry in enumerate(label_entries):
        # Pull the literal event text for each side of the relation out of
        # the source sentence via the annotated (start, end) offsets.
        spans: dict[str, list[str]] = {
            side: [text[start:end] for start, end in entry[side]]
            for side in ("reason", "result")
        }

        relation = entry["type"]
        structured = generate_answer_combined_tags(spans, tag_names, relation)
        answer = extract_relation_span(spans["reason"], spans["result"], text)

        question = (
            "What is the reconstructed sentence from the cause, relation and effect?"
        )
        if needs_index:
            question = f"{idx} {question}"

        item = {
            "context": structured,
            "question": question,
            "question_type": relation,
            "answers": answer,
        }
        # There are duplicate IDs in the dataset, so we hash instead.
        item["id"] = hash_instance(item)
        converted.append(item)

    return converted
def main() -> None:
    """Parse CLI arguments and convert every dataset split.

    Reads ``event_dataset_{split}.json`` files from ``--src`` and writes the
    reconstruction-format ``{split}.json`` files into ``--dst``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--src",
        type=Path,
        default="data/raw",
        help="Path to the folder containing the raw data",
    )
    parser.add_argument(
        "--dst", type=Path, default="data/reconstruct", help="Path to the output folder"
    )
    args = parser.parse_args()

    for split in ("dev", "test", "train"):
        convert_file_qa(
            args.src / f"event_dataset_{split}.json",
            args.dst / f"{split}.json",
            convert_instance=convert_reconstruct,
        )
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()