#!/bin/bash
# convert paragraphs to sentences for each language
for lang in as bd bn dg en gom gu hi kha kn ks mai ml mni mr ne or pa sa sat sd ta te ur
do
    python scripts/convert_para2sent.py --input ../monolingual_data/$lang.txt --output ../monolingual_sents/$lang.txt --lang $lang
    echo $lang
done
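# quick sanity check (paths assumed from the loop above): the converter is
# expected to emit one sentence per line, so line counts ~ sentence counts
wc -l ../monolingual_sents/*.txt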
# build a 250k-vocab WordPiece tokenizer from the tokenizer data
python tokenization/build_tokenizer.py --input ../tokenizer_data/ --output ../wordpiece_250k/ --vocab 250000
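# the output directory is expected to contain vocab.txt (see the --vocab_file
# flags below); its line count should be close to the requested 250000
wc -l ../wordpiece_250k/vocab.txt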
# Slurm job to create the MLM data: CPU-only partition, 128 cores, 7-day limit
sbatch --job-name mlm_data --gres gpu:0 -p cpup --cpus-per-task 128 --nodes 1 \
    --ntasks-per-node 1 --time=07-00:00:00 \
    --wrap 'srun --output mlm_data.log.node%t --error mlm_data.stderr.node%t.%j \
bash create_data.sh'
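# monitor the job with standard Slurm tooling
squeue --name mlm_data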
# create tfrecords for evaluation data
python /home/cs21d409_cse_iitm_ac_in/IndicBERT/process_data/create_mlm_data.py \
--input_file=as.txt,bd.txt,bn.txt,dg.txt,en.txt,gom.txt,gu.txt,hi.txt,kha.txt,kn.txt,ks.txt,mai.txt,ml.txt,mni.txt,mr.txt,ne.txt,or.txt,pa.txt,sa.txt,sat.txt,sd.txt,ta.txt,te.txt,ur.txt \
--output_file=/nlsasfs/home/ai4bharat/gramesh/bertteam/IndicXLM/data/gcp/tfrecords/eval.tfrecord \
--vocab_file=/home/cs21d409_cse_iitm_ac_in/IndicBERT/wordpiece_250k/vocab.txt \
--do_lower_case=True \
--max_seq_length=512 \
--max_predictions_per_seq=77 \
--do_whole_word_mask=True \
--masked_lm_prob=0.15 \
--random_seed=12345
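# sanity check (assumes TensorFlow is installed locally): count the examples
# written to the eval tfrecord
python -c "import tensorflow as tf; print(sum(1 for _ in tf.data.TFRecordDataset('/nlsasfs/home/ai4bharat/gramesh/bertteam/IndicXLM/data/gcp/tfrecords/eval.tfrecord')))"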
# create MLM tfrecord for training data
python /home/cs21d409_cse_iitm_ac_in/IndicBERT/process_data/create_mlm_data.py \
--input_file=/home/cs21d409_cse_iitm_ac_in/IndicBERT/sample_text.txt \
--input_file_type=monolingual \
--output_file=/home/cs21d409_cse_iitm_ac_in/IndicBERT/sample_mlm.tfrecord \
--vocab_file=/home/cs21d409_cse_iitm_ac_in/IndicBERT/wordpiece_250k/vocab.txt \
--do_lower_case=True \
--max_seq_length=512 \
--max_predictions_per_seq=77 \
--do_whole_word_mask=True \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=5 \
--num_workers=2
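# dupe_factor presumably repeats each input with different random masks (as in
# the original BERT data script); num_workers parallelizes across processes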
# create TLM tfrecord for training data
python /home/cs21d409_cse_iitm_ac_in/IndicBERT/process_data/create_mlm_data.py \
--input_file=/home/cs21d409_cse_iitm_ac_in/IndicBERT/sample_data/en-as,/home/cs21d409_cse_iitm_ac_in/IndicBERT/sample_data/en-bn,/home/cs21d409_cse_iitm_ac_in/IndicBERT/sample_data/en-hi \
--input_file_type=parallel \
--output_file=/home/cs21d409_cse_iitm_ac_in/IndicBERT/sample_tlm.tfrecord \
--vocab_file=/home/cs21d409_cse_iitm_ac_in/IndicBERT/wordpiece_250k/vocab.txt \
--do_lower_case=True \
--max_seq_length=512 \
--max_predictions_per_seq=77 \
--do_whole_word_mask=True \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=5 \
--num_workers=2
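# for TLM, each --input_file entry is assumed to hold parallel sentence pairs
# (e.g. en-as); pairs are concatenated into one sequence so masked tokens can
# be predicted from either language (XLM-style)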
# train with MLM and TLM objectives (small test run on TPU)
python /home/cs21d409_cse_iitm_ac_in/IndicBERT/train/run_pretraining.py \
--input_file=gs://indic-bert/test_tpu/input/sample_mlm.tfrecord,gs://indic-bert/test_tpu/input/sample_tlm.tfrecord \
--output_dir=gs://indic-bert/mlm_tlm_test/ \
--do_train=True \
--bert_config_file=/home/cs21d409_cse_iitm_ac_in/IndicBERT/config.json \
--train_batch_size=4096 \
--max_seq_length=512 \
--max_predictions_per_seq=77 \
--num_train_steps=100000 \
--num_warmup_steps=10000 \
--learning_rate=2e-5 \
--save_checkpoints_steps=10000 \
--use_tpu=True \
--tpu_name=indic-bert \
--tpu_zone=us-east1-d \
--num_tpu_cores=128
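# list the checkpoints written by the test run (assumes gsutil is configured)
gsutil ls gs://indic-bert/mlm_tlm_test/
# full MLM-only pretraining run (1M steps)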
python /home/cs21d409_cse_iitm_ac_in/IndicBERT/train/run_pretraining.py \
--input_file=gs://indic-bert/aug-24-tfrecords/* \
--output_dir=gs://indic-bert/sep-6-mlm-only-ckpts/ \
--do_train=True \
--bert_config_file=/home/cs21d409_cse_iitm_ac_in/IndicBERT/config.json \
--train_batch_size=4096 \
--max_seq_length=512 \
--max_predictions_per_seq=77 \
--num_train_steps=1000000 \
--num_warmup_steps=50000 \
--learning_rate=5e-4 \
--save_checkpoints_steps=50000 \
--use_tpu=True \
--tpu_name=indic-bert \
--tpu_zone=us-east1-d \
--num_tpu_cores=128
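# loss curves can be followed with TensorBoard pointed at the GCS output dir
# (assumes a TensorBoard install that can read gs:// paths)
tensorboard --logdir gs://indic-bert/sep-6-mlm-only-ckpts/
# full MLM + TLM pretraining run (1M steps)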
python /home/cs21d409_cse_iitm_ac_in/IndicBERT/train/run_pretraining.py \
--input_file=gs://indic-bert/aug-24-tfrecords/* \
--output_dir=gs://indic-bert/aug-26-ckpts-mlm-tlm/ \
--do_train=True \
--bert_config_file=/home/cs21d409_cse_iitm_ac_in/IndicBERT/config.json \
--train_batch_size=4096 \
--max_seq_length=512 \
--max_predictions_per_seq=77 \
--num_train_steps=1000000 \
--num_warmup_steps=50000 \
--learning_rate=5e-4 \
--save_checkpoints_steps=50000 \
--use_tpu=True \
--tpu_name=indic-bert \
--tpu_zone=us-east1-d \
--num_tpu_cores=128
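# create MLM tfrecords for the tagged monolingual data (this variant takes a
# tokenizer config via --tokenizer instead of --vocab_file)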
python /nlsasfs/home/ai4bharat/gramesh/fine-tuning/IndicBERT/process_data/create_mlm_data.py \
--input_file=/nlsasfs/home/ai4bharat/gramesh/fine-tuning/tagged-mlm/tagged-as.txt \
--input_file_type=monolingual \
--output_file=/nlsasfs/home/ai4bharat/gramesh/fine-tuning/tagged-mlm/as.tfrecord \
--tokenizer=/nlsasfs/home/ai4bharat/gramesh/fine-tuning/IndicBERT/tokenization/wp_land_id_250k/config.json \
--max_seq_length=512 \
--max_predictions_per_seq=77 \
--do_whole_word_mask=True \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=1 \
--num_workers=128
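# create tfrecords for the parallel sam splits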
python /nlsasfs/home/ai4bharat/gramesh/fine-tuning/IndicBERT/process_data/create_mlm_data.py \
--input_file=/nlsasfs/home/ai4bharat/gramesh/fine-tuning/sam-splits/shuf-0 \
--input_file_type=parallel \
--output_file=/nlsasfs/home/ai4bharat/gramesh/fine-tuning/sam-tfrecords/shuf-0.tfrecord \
--tokenizer=/nlsasfs/home/ai4bharat/gramesh/fine-tuning/IndicBERT/tokenization/wp_land_id_250k/config.json \
--max_seq_length=512 \
--max_predictions_per_seq=77 \
--do_whole_word_mask=True \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=1 \
--num_workers=128
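# hypothetical sketch: the same command looped over all sam splits, assuming
# the remaining shards sit next to shuf-0
for f in /nlsasfs/home/ai4bharat/gramesh/fine-tuning/sam-splits/shuf-*; do
    python /nlsasfs/home/ai4bharat/gramesh/fine-tuning/IndicBERT/process_data/create_mlm_data.py \
        --input_file=$f \
        --input_file_type=parallel \
        --output_file=/nlsasfs/home/ai4bharat/gramesh/fine-tuning/sam-tfrecords/$(basename $f).tfrecord \
        --tokenizer=/nlsasfs/home/ai4bharat/gramesh/fine-tuning/IndicBERT/tokenization/wp_land_id_250k/config.json \
        --max_seq_length=512 \
        --max_predictions_per_seq=77 \
        --do_whole_word_mask=True \
        --masked_lm_prob=0.15 \
        --random_seed=12345 \
        --dupe_factor=1 \
        --num_workers=128
done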
# pretrain with mlm + sam + wiki data
python /home/cs21d409_cse_iitm_ac_in/IndicBERT/train/run_pretraining.py \
--input_file=gs://indic-bert/aug-24-tfrecords/* \
--output_dir=gs://indic-bert/sep-28-mlm-wiki/ \
--do_train=True \
--bert_config_file=/home/cs21d409_cse_iitm_ac_in/IndicBERT/config.json \
--train_batch_size=4096 \
--max_seq_length=512 \
--max_predictions_per_seq=77 \
--num_train_steps=1000000 \
--num_warmup_steps=50000 \
--learning_rate=5e-4 \
--save_checkpoints_steps=50000 \
--use_tpu=True \
--tpu_name=indic-bert \
--tpu_zone=us-east1-d \
--num_tpu_cores=128
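# confirm checkpoints are being written (assumes gsutil is configured)
gsutil ls gs://indic-bert/sep-28-mlm-wiki/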