-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathprepare_experiment.sh
executable file
·100 lines (90 loc) · 4.03 KB
/
prepare_experiment.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env bash
#
# prepare_experiment.sh -- prepare the data of one language pair.
#
# Usage: prepare_experiment.sh SOURCE_LANG TARGET_LANG      (e.g. "ro en")
#
# Stages (switch one off by setting its toggle below to "false"):
#   preprocess  append " <sink>" to the source-side BPE files and run
#               preprocess.py (once with and once without the sink token)
#   align       train fast_align on the training pairs, then force-align
#               the validation and test pairs
#   fertilize   run generate_fertilities.py and train/test the fertility
#               predictor
#
# Every stage reads files written by the previous one, so stop at the first
# failing command or unset variable instead of carrying on with bad data.
# (Only portable options are used: the script contains no pipelines.)
set -eu

if [ "$#" -lt 2 ]; then
  printf 'Usage: %s SOURCE_LANG TARGET_LANG\n' "${0##*/}" >&2
  exit 2
fi

SOURCE=$1 # source language code, e.g. ro
TARGET=$2 # target language code, e.g. en
LANGPAIR="${SOURCE}-${TARGET}"
DATA="/mnt/disk/afm/data/${LANGPAIR}"
ALIGNER=/mnt/disk/afm/fast_align/build
OPENNMT=/mnt/disk/afm/OpenNMT-py

# Absolute directory containing this script. It must be resolved before the
# "cd" below because $0 may be a relative path. CDPATH is cleared so that "cd"
# cannot print an extra line into the captured output.
SCRIPTS="$(CDPATH='' cd -- "$(dirname -- "$0")" && pwd)"

# Stage toggles: each value is executed as a command ("true" or "false").
preprocess=true
align=true
fertilize=true

# preprocess.py is invoked by relative path, so the working directory matters.
cd "${OPENNMT}" || {
  printf 'Cannot change directory to %s\n' "${OPENNMT}" >&2
  exit 1
}
# Stage "preprocess": build the sink-augmented source files and run
# preprocess.py (from the current directory) for both data variants.
# Reads:  ${DATA}/{corpus,newsdev2016,newstest2016}.bpe.${SOURCE}
#         ${DATA}/{corpus,newsdev2016}.bpe.${TARGET}
# Writes: ${DATA}/<prefix>.bpe.sink.${SOURCE},
#         ${DATA}/preprocessed.sink.align* and ${DATA}/preprocessed.align*
# Any failing step aborts the script: the later stages would otherwise work
# on truncated or stale files.
if $preprocess; then
  # Append a " <sink>" token to the end of every source-side line.
  for prefix in corpus newsdev2016 newstest2016; do
    sed 's/$/ <sink>/' "${DATA}/${prefix}.bpe.${SOURCE}" \
      > "${DATA}/${prefix}.bpe.sink.${SOURCE}" || exit 1
  done

  # Remove .pt files of an earlier run before regenerating them. The glob is
  # left outside the quotes on purpose; ":?" aborts if DATA is empty so the
  # pattern can never be anchored at "/".
  rm -rf -- "${DATA:?}"/preprocessed.sink.align*.pt

  # Data with the sink token. NOTE(review): -write_txt presumably makes
  # preprocess.py emit the *.pt.txt.src / *.pt.txt.tgt files read by the
  # alignment stage -- confirm against preprocess.py.
  python -u preprocess.py \
    -train_src "${DATA}/corpus.bpe.sink.${SOURCE}" \
    -train_tgt "${DATA}/corpus.bpe.${TARGET}" \
    -valid_src "${DATA}/newsdev2016.bpe.sink.${SOURCE}" \
    -valid_tgt "${DATA}/newsdev2016.bpe.${TARGET}" \
    -save_data "${DATA}/preprocessed.sink.align" \
    -write_txt || exit 1

  # Generate also data without the sink token (for baselines).
  python -u preprocess.py \
    -train_src "${DATA}/corpus.bpe.${SOURCE}" \
    -train_tgt "${DATA}/corpus.bpe.${TARGET}" \
    -valid_src "${DATA}/newsdev2016.bpe.${SOURCE}" \
    -valid_tgt "${DATA}/newsdev2016.bpe.${TARGET}" \
    -save_data "${DATA}/preprocessed.align" || exit 1
fi
# Stage "align": train fast_align on the training pairs, then align the
# validation and test pairs with the trained parameters.
# Reads:  ${DATA}/preprocessed.sink.align.{train,valid}.1.pt.txt.{src,tgt},
#         ${DATA}/newstest2016.bpe.sink.${SOURCE}, ${DATA}/newstest2016.bpe.${TARGET}
# Writes: the "src ||| tgt" pair files, their *.forward.align / *.reverse.align
#         files, and ${DATA}/a.{s2t,t2s}.{params,err}
# Every step is checked: output redirection truncates the target file, so an
# unchecked failure would leave an empty alignment for the next step to read.
if $align; then
  train_txt="${DATA}/preprocessed.sink.align.train.1.pt.txt"
  valid_txt="${DATA}/preprocessed.sink.align.valid.1.pt.txt"
  test_pairs="${DATA}/newstest2016.bpe.${SOURCE}-${TARGET}"

  echo "Training aligner..."
  # Join source and target line by line, then turn the tab separator into
  # " ||| " to obtain one "source ||| target" pair per line.
  paste -d '\t' "${train_txt}.src" "${train_txt}.tgt" \
    > "${train_txt}.src-tgt" || exit 1
  sed -i 's/\t/ ||| /g' "${train_txt}.src-tgt" || exit 1

  # Forward (source-to-target) model. stderr is saved on purpose: the .err
  # file is passed to force_align.py below together with the -p parameters.
  "${ALIGNER}/fast_align" -i "${train_txt}.src-tgt" -d -o -v \
    -p "${DATA}/a.s2t.params" \
    > "${train_txt}.src-tgt.forward.align" \
    2> "${DATA}/a.s2t.err" || {
    echo "fast_align (forward) failed, see ${DATA}/a.s2t.err" >&2
    exit 1
  }

  # Reverse model (-r), same settings.
  "${ALIGNER}/fast_align" -i "${train_txt}.src-tgt" -d -o -v -r \
    -p "${DATA}/a.t2s.params" \
    > "${train_txt}.src-tgt.reverse.align" \
    2> "${DATA}/a.t2s.err" || {
    echo "fast_align (reverse) failed, see ${DATA}/a.t2s.err" >&2
    exit 1
  }

  echo "Running aligner on validation data..."
  paste -d '\t' "${valid_txt}.src" "${valid_txt}.tgt" \
    > "${valid_txt}.src-tgt" || exit 1
  sed -i 's/\t/ ||| /g' "${valid_txt}.src-tgt" || exit 1
  # NOTE(review): "fwd" presumably selects the forward alignment only --
  # confirm against force_align.py.
  python "${SCRIPTS}/force_align.py" \
    "${DATA}/a.s2t.params" "${DATA}/a.s2t.err" \
    "${DATA}/a.t2s.params" "${DATA}/a.t2s.err" \
    fwd \
    < "${valid_txt}.src-tgt" \
    > "${valid_txt}.src-tgt.forward.align" || exit 1

  echo "Running aligner on test data..."
  # The source side is the sink-augmented file even though the name of the
  # pair file does not say so.
  paste -d '\t' \
    "${DATA}/newstest2016.bpe.sink.${SOURCE}" \
    "${DATA}/newstest2016.bpe.${TARGET}" \
    > "${test_pairs}" || exit 1
  sed -i 's/\t/ ||| /g' "${test_pairs}" || exit 1
  python "${SCRIPTS}/force_align.py" \
    "${DATA}/a.s2t.params" "${DATA}/a.s2t.err" \
    "${DATA}/a.t2s.params" "${DATA}/a.t2s.err" \
    fwd \
    < "${test_pairs}" \
    > "${test_pairs}.forward.align" || exit 1
fi
# Stage "fertilize": run generate_fertilities.py once per method on the
# train/dev/test sources and their forward alignments, then train and test
# the fertility predictor.
# A failing generation step aborts the script so that the predictor is never
# trained on missing or stale fertility files.
if $fertilize; then
  for method in guided actual; do
    python -u "${SCRIPTS}/generate_fertilities.py" \
      -method "${method}" \
      -train_source "${DATA}/preprocessed.sink.align.train.1.pt.txt.src" \
      -train_align "${DATA}/preprocessed.sink.align.train.1.pt.txt.src-tgt.forward.align" \
      -dev_source "${DATA}/preprocessed.sink.align.valid.1.pt.txt.src" \
      -dev_align "${DATA}/preprocessed.sink.align.valid.1.pt.txt.src-tgt.forward.align" \
      -test_source "${DATA}/newstest2016.bpe.sink.${SOURCE}" \
      -test_align "${DATA}/newstest2016.bpe.${SOURCE}-${TARGET}.forward.align" || exit 1
  done

  echo "Training and testing the fertility predictor..."
  # Last command of the script: its exit status becomes the script's status.
  "${SCRIPTS}/fertility/train_test_fertility_predictor.sh" "${SOURCE}" "${TARGET}" classification
fi