forked from c-amr/camr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare_traindir.sh
executable file
·211 lines (190 loc) · 4.87 KB
/
prepare_traindir.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#!/bin/bash
# Built-in defaults. Each one can be overridden from the command line:
# destdir by the positional argument, srcdir by --srcdir, errlog by --log.
destdir='data'
srcdir='golds'
errlog='prepare.log'
#######################################
# Print the command-line help text.
# Globals:
#   errlog  (read) shown as the default error log file
#   destdir (read) shown as the default destination directory
# Arguments:
#   None
# Outputs:
#   Writes the help text to stdout.
#######################################
function usage {
  # An unquoted here-doc delimiter keeps $0, $errlog and $destdir expanding,
  # exactly as they did in the former echo-based version.
  cat <<EOF
Prepare training data from gold training,dev,test fileset

usage: $0 [--srcdir <src dir>] [--train <src fn>] [--dev <src fn>] [--test <src fn>] [--platinum] [--log <error.log>] [dest dir]

--srcdir DIR source directory where to find training|train,dev,test files (default: golds)
--train FILE specify training source file individually (default: golds/train or golds/training)
--dev FILE specify dev source file individually (default: golds/dev)
--test FILE specify test source file individually (default: golds/test)
--platinum prepare the platinum training set (removes a fixed list of sentences from the training file)
--log FILE specify error log file (default: $errlog)
-h, --help show this help text and exit

Default destination directory: $destdir

NOTE: Requires Python 3.3+

EOF
}
# Scan all arguments for a help request before doing anything else, so that
# --help / -h works regardless of its position on the command line.
# "$@" must be quoted: unquoted, arguments containing whitespace or glob
# characters are re-split/expanded and could spuriously match "-h".
for arg in "$@"; do
  case "$arg" in
    --help | -h)
      usage
      exit 0
      ;;
  esac
done
# if [ "`which python3`" == "" ]; then
# echo "error: requires Python 3.3+"
# exit 1
# fi
#######################################
# Look for the conventional source files inside $srcdir and record the ones
# that exist. Variables for files that are not found are left untouched.
# Globals:
#   srcdir    (read)    directory to inspect
#   trainfile (written) $srcdir/training if present, otherwise $srcdir/train
#   devfile   (written) $srcdir/dev if present
#   testfile  (written) $srcdir/test if present
# Arguments:
#   None
# Returns:
#   0 always
#######################################
check_srcdir() {
  local name

  # "training" takes precedence over "train" when both exist.
  for name in training train; do
    if [[ -f "$srcdir/$name" ]]; then
      trainfile="$srcdir/$name"
      break
    fi
  done

  # dev -> devfile, test -> testfile
  for name in dev test; do
    if [[ -f "$srcdir/$name" ]]; then
      printf -v "${name}file" '%s' "$srcdir/$name"
    fi
  done

  return 0
}
# Pick up whatever source files exist in the default source directory; the
# options below may replace them.
check_srcdir

# Remember the default destination so the first positional argument can be
# recognised as the destination directory (any later positional argument is
# reported as unexpected).
default_destdir="$destdir"
platinum=0

# Command-line parsing. Invalid option values are reported and skipped (the
# previous/default value stays in effect); parsing then continues.
while [ $# -gt 0 ]; do
  case "$1" in
    --)
      # A bare "--" is simply skipped; it does not end option parsing.
      shift
      ;;
    --srcdir)
      shift
      if [ -d "$1" ]; then
        srcdir="$1"
        # Re-scan: files found in the new directory replace earlier choices.
        check_srcdir
      else
        echo "error: invalid source directory: $1"
      fi
      shift
      ;;
    --train)
      shift
      if [ -f "$1" ]; then
        trainfile="$1"
      else
        echo "error: invalid train file: $1"
      fi
      shift
      ;;
    --dev)
      shift
      if [ -f "$1" ]; then
        devfile="$1"
      else
        echo "error: invalid dev file: $1"
      fi
      shift
      ;;
    --test)
      shift
      if [ -f "$1" ]; then
        testfile="$1"
      else
        echo "error: invalid test file: $1"
      fi
      shift
      ;;
    --platinum)
      shift
      platinum=1
      ;;
    --log)
      shift
      # Keep the default log when the value is missing or empty: an empty
      # errlog would make every later '>> "$errlog"' redirection fail.
      if [ -n "$1" ]; then
        errlog="$1"
      else
        echo "error: invalid log file: $1"
      fi
      shift
      ;;
    *)
      if [ "$destdir" == "$default_destdir" ]; then
        destdir="$1"
      else
        echo "Warning: unexpected command line argument: $1"
      fi
      shift
      ;;
  esac
done
# Directory containing this script; the helper scripts are run from there.
DIR=$(dirname "$0")

# Collect only the source files that were actually found or specified.
# Building the array directly from the three variables would always yield
# three elements (possibly empty strings), so the emptiness check below
# could never trigger.
srcfiles=()
for _srcfile in "$trainfile" "$devfile" "$testfile"; do
  if [ -n "$_srcfile" ]; then
    srcfiles+=("$_srcfile")
  fi
done
unset _srcfile

if [ ${#srcfiles[@]} -eq 0 ]; then
  echo "error: no source files found"
  exit 1
fi

echo "Will prepare training data directory: $destdir"
# Quoted so that a destination path containing spaces stays one directory.
if ! mkdir -p -- "$destdir"; then
  echo "error, aborting"
  exit 1
fi
# Start from a fresh error log.
rm -f "$errlog" 2> /dev/null

#######################################
# Run preprocess.py on one input file and store its stdout in the output file.
# Globals:
#   DIR    (read) directory holding preprocess.py
#   errlog (read) file that receives a progress line and preprocess.py's stderr
# Arguments:
#   $1 - input file
#   $2 - output file
# Outputs:
#   Progress message followed by "ok" or an error notice on stdout.
# Returns:
#   0 on success; exits the whole script with status 1 on failure.
#######################################
preprocess_file() {
  local input=$1
  local output=$2
  echo -n "Preprocessing $input to $output ... "
  echo "Preprocessing $input to $output ... " >> "$errlog"
  if "$DIR/preprocess.py" "$input" > "$output" 2>> "$errlog"; then
    echo "ok"
  else
    echo "error, check $errlog for error messages"
    exit 1
  fi
}

#######################################
# Copy the unprocessed test file to <destdir>/test_gold.
# Globals:
#   testfile (read) file to copy
#   destdir  (read) destination directory
# Arguments:
#   None
# Outputs:
#   Progress message followed by "ok" or "error" on stdout.
# Returns:
#   0 on success; exits the whole script with status 1 on failure.
#######################################
write_gold_file() {
  local output="$destdir/test_gold"
  echo -n "Writing gold file $output ..."
  if cp "$testfile" "$output"; then
    echo "ok"
  else
    echo "error"
    exit 1
  fi
}

if [ "$platinum" -eq 0 ]; then
  # Standard mode: preprocess every collected source file into destdir,
  # keeping the file's base name.
  for input in "${srcfiles[@]}"; do
    preprocess_file "$input" "$destdir/$(basename "$input")"
  done
else
  # sentences to be removed from train+dev set for platinum trainset
  platinum_rm_snt="910 1025 1622 1838 1888 3023 4213 5067 5329 5330 5448 5671 5831 6427 7107 7136 7525 7626 7664 8131 8154 8457 8577 9094 9180 9786 10067 10078 10161 11131 11515 11598 11932 11933 12099 12201 12717 12849 13600 13738 13963 14240 14398 14491 14631 14745 15043 15423 15639"

  # dev + train => train
  # NOTE(review): the message mentions $devfile, but only $trainfile is passed
  # to remove_sentences.py here - confirm whether the dev set is meant to be
  # merged in by this script.
  output="$destdir/$(basename "$trainfile")"
  echo -n "Preprocessing $trainfile + $devfile - non-platinum to $output ... "
  echo "Preprocessing $trainfile + $devfile - non-platinum to $output ... " >> "$errlog"
  # Two stages: filter the sentences into "$output.full", then preprocess that
  # intermediate file into "$output". The intermediate file is removed only
  # when both stages succeed.
  if "$DIR/remove_sentences.py" "$platinum_rm_snt" "$trainfile" 2>> "$errlog" > "$output.full" \
    && "$DIR/preprocess.py" "$output.full" > "$output" 2>> "$errlog"; then
    echo "ok"
    rm -rf "$output.full"
  else
    echo "error, check $errlog for error messages"
    exit 1
  fi

  preprocess_file "$testfile" "$destdir/$(basename "$testfile")"
fi

# write gold file
write_gold_file

# remove error log if executed cleanly
# (every failure path above exits immediately, so reaching this line means
# all steps succeeded)
rm -f "$errlog" 2> /dev/null