-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathi2b2_2012_preprocessing.py
129 lines (112 loc) · 4.58 KB
/
i2b2_2012_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python
# coding: utf-8
import csv
import json
import os
import pathlib
import random
import shutil
import subprocess
import sys
from collections import defaultdict
from pathlib import Path
from shutil import copyfile

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from spacy.lang.en import English

from utils import check_exists, get_paths, make_dir, copy_text, dataset_xml2brat, get_ann_files,\
    brat2bio_dict, make_if_nonexist, unzip_if_not_exists
# ---------------------------------------------------------------------------
# User-adjustable settings
# ---------------------------------------------------------------------------
# The tar.gz archives are assumed to sit in the current working directory.
cwd = os.getcwd()
gzs = dict(
    train="2012-07-15.original-annotation.release.tar.gz",
    test="2012-08-08.test-data.event-timex-groundtruth.tar.gz",
)

# Template for one BRAT annotation line (five positional slots).
BRAT_TEMP = "T{}\t{} {} {}\t{}"
EVENTS = {'CLINICAL_DEPT', 'EVIDENTIAL', 'OCCURRENCE', 'PROBLEM', 'TEST', 'TREATMENT'}
SPEC = {'&': 'AAMMPP'}

random_state = 13
# Fraction of the train archive kept for training; the remainder becomes val.
train_val_split_ratio = 0.7
data_dir = "dataset"
verbose = False

# ---------------------------------------------------------------------------
# Paths derived from the settings above
# ---------------------------------------------------------------------------
# Folder name = archive name with its last two dot-separated parts removed.
folders = {
    split_name: ".".join(archive.split(".")[:-2])
    for split_name, archive in gzs.items()
}
# test.gz has slightly different folder structure: its xml files live in "xml".
folders["test"] = os.path.join(folders["test"], "xml")

# folder for raw files
infiles = {split_name: os.path.join(cwd, folder) for split_name, folder in folders.items()}

# folder for brat files
brat_out = dict(train="brat-train", test="brat-test")

# folder for bio files
bio_out = dict(train="bio-train", test="bio-test")
print("========preprocessing starts!===========")

# ## Step 1: convert xml to BRAT
# Code from Xi Yang (University of Florida)
#
# Modified by Lavender Jiang
print("========xml to BRAT===========")

# Stop early, with a download hint, when check_exists reports an archive
# as missing from the working directory.
for key in gzs:
    if not check_exists(cwd, gzs[key]):
        raise RuntimeError(f"Please make sure you have downloaded {gzs[key]} from n2c2 portal!")

for key in folders:
    folder = folders[key]
    if key == "test":
        # for test folder, combine txt files and xml files to one folder
        # this ensures test/xml has similar structure as train folder
        folder_up = Path(folder).parent
        unzip_if_not_exists(cwd, folder_up, gzs[key])
        print(f"folder_up is {folder_up}")
        print(f"copying txt file from {folder_up}/i2b2/*.txt to {folder}")
        # Copy in-process rather than shelling out to `cp`: this works on
        # platforms without `cp`, is safe for paths containing spaces, and
        # raises on failure instead of silently ignoring the exit status.
        for txt_path in sorted((folder_up / "i2b2").glob("*.txt")):
            shutil.copy(txt_path, Path(folder) / txt_path.name)
    else:
        unzip_if_not_exists(cwd, folder, gzs[key])

# The raw data only comes in the two archive splits (train / test).
splits = gzs.keys()
in_paths = get_paths(infiles, splits)
out_paths = get_paths(brat_out, splits)
make_dir(out_paths)
copy_text(in_paths, out_paths)
dataset_xml2brat(in_paths, out_paths, BRAT_TEMP, EVENTS, verbose=verbose)
# ## Step 2: convert BRAT to BIO
# Code from Xi Yang (University of Florida)
#
# Modified by Lavender Jiang
print("========BRAT to BIO===========")

# Blank English pipeline whose only component is the sentencizer.
nlp = English()
# add_pipe() given a factory name creates the component, attaches it to the
# pipeline and returns it, so a separate create_pipe() call (whose result was
# never attached to `nlp`) is not needed.
sentencizer = nlp.add_pipe("sentencizer")

ann_files_d = get_ann_files(brat_out, ["train", "test"])
make_dir(bio_out)
brat2bio_dict(ann_files_d, brat_out, bio_out, nlp, EVENTS)
# ## Step 3: Combine files to dataset
print("========Building BIO dataset===========")

# Split the train archive's file ids into train / dev; the test archive is
# used whole, only shuffled with the same seed.
train_ids, dev_ids = train_test_split(
    list(ann_files_d['train']),
    train_size=train_val_split_ratio,
    random_state=random_state,
    shuffle=True,
)
test_ids = list(ann_files_d['test'])
random.Random(random_state).shuffle(test_ids)
print(f"train size {len(train_ids)}, val size {len(dev_ids)}, test size {len(test_ids)}")

i2b2_datasets = {"train": train_ids, "dev": dev_ids, "test": test_ids}
# Record the split assignment; the context manager guarantees the file is
# flushed and closed (the handle was previously left open).
with open("i2b2_2012_datasets.json", "w", encoding="utf-8") as split_file:
    json.dump(i2b2_datasets, split_file)

make_if_nonexist(data_dir)

# Merge BIO format train, validation and test datasets
for split in ["train", "dev", "test"]:
    # train and dev ids both originate from the train archive's BIO folder.
    outputpath = bio_out["test"] if split == "test" else bio_out["train"]
    split_dir = os.path.join(data_dir, split)
    make_if_nonexist(split_dir)
    with open(os.path.join(data_dir, f"{split}.txt"), "w", encoding="utf-8") as f:
        for fid in i2b2_datasets[split]:
            bio_path = f"{outputpath}/{fid}.bio.txt"
            # Keep a per-document copy alongside the merged split file.
            copyfile(bio_path, os.path.join(split_dir, f"{fid}.bio.txt"))
            with open(bio_path, "r", encoding="utf-8") as fr:
                txt = fr.read().strip()
            # Skip empty documents so the merged file has no stray separators.
            if txt:
                f.write(txt)
                f.write("\n\n")
# ## Step 4: Bio 2 Nemo
# Code from nVidia NeMo
print("========BIO to NeMo===========")

# Convert every merged split file under data_dir. Iterating the archive
# splits (train/test) here would leave dev.txt unconverted, so the three
# dataset splits are listed explicitly.
for split in ("train", "dev", "test"):
    # Run with the current interpreter and an argument list (no shell), and
    # fail loudly if the converter exits with a non-zero status.
    subprocess.run(
        [sys.executable, "bio2nemo.py", "--data_file", f"{data_dir}/{split}.txt"],
        check=True,
    )

print("========preprocessing finished!===========")