#!/bin/bash
##################################################
# File Name: imagenet-launch.sh
# Author: shwin
# Create Time: Tue 12 Nov 2019 09:56:32 AM PST
##################################################
debug=0 # 0 = full run; > 0 enables debug mode (value is passed as --debug-batch-size)
model='resnet' # or 'preresnet'
dataset="tiny-imagenet"
depth=18
grow=true
hooker='Lip'
# ------ grow setting -----------
mode='adapt' # 'adapt' or 'fixed'
maxdepth=66 # maximum depth, used when mode == 'adapt'
# ----- fixed setting ---------------
dupEpoch=()
if [ "$grow" = true ] && [ "$mode" = 'fixed' ]; then
dupEpoch=(60 110) # grow at xx epochs
fi
# ------ adapt setting --------------
thresh='1.3' # threshold that triggers a grow
reserve=20 # epochs reserved for the final model
window=10 # smoothing window
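# For documentation only: a hedged bash sketch of the windowed-threshold
# pattern behind $thresh/$window. The real grow criterion lives in train.py;
# the function name and the plain moving-average test here are assumptions.
#   usage: smooth_trigger 1.1 1.2 1.5   # prints "grow" if the mean exceeds $thresh
smooth_trigger() {
    local sum=0 v
    for v in "$@"; do sum=$(echo "$sum + $v" | bc -l); done
    # grow once the smoothed indicator exceeds the threshold
    if (( $(echo "($sum / $#) > $thresh" | bc -l) )); then echo grow; else echo wait; fi
}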
# ----- regular hypers -----------
epochs=90
lr='0.5' # initial learning rate
scheduler='adacosine' # learning rate scheduler: adacosine / cosine / constant / step
if [ "$grow" = true ]; then
# if grow, no need to set learning rate scheduler, because learning rate is always reset at grow
schedule=()
else
# otherwise, set learning rate scheduler (if using step scheduler)
schedule=(60 110)
fi
gamma='0.1' # lr decay factor (step scheduler only)
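# for reference, assuming the usual multi-step rule: with scheduler='step',
# schedule=(60 110), gamma=0.1 and lr=0.5, the lr would be 0.5 for epochs
# 0-59, 0.05 for 60-109, and 0.005 afterwards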
weight_decay='1e-4'
train_batch='128'
test_batch='100'
gpu_id='3' # For multiple gpu training, set like '1,2'
workers=4 # 4 * num gpus; or estimate by throughput
log_file="train.out"
suffix=""
prefix="Batch-Lip"
if (( debug > 0 )); then
    # debug mode - train only a few epochs
    epochs=10
    schedule=() # e.g. (2 7)
    dupEpoch=()
    thresh='1.0'
    reserve=2
    window=3
fi
# set dir name
# NOTE: $operation (the grow-operation tag) is referenced below but never set
# in this script; export it beforehand, or the dir name will contain '--'
if [ "$grow" = true ]; then
    if [ "$mode" = 'fixed' ]; then
        if [ "$scheduler" = 'constant' ]; then
            dir="$model-$depth-$mode-$(IFS='-'; printf '%s' "${dupEpoch[*]}")-$operation-$scheduler-lr=${lr//./-}"
        else
            dir="$model-$depth-$mode-$(IFS='-'; printf '%s' "${dupEpoch[*]}")-$operation-$scheduler-$(IFS='-'; printf '%s' "${schedule[*]}")-lr=${lr//./-}"
        fi
    else
        dir="$model-$depth-$mode-$maxdepth-$operation-$scheduler-lr=${lr//./-}-window=$window-reserve=$reserve-thresh=${thresh//./-}"
    fi
else
    if [ "$scheduler" = 'constant' ]; then
        dir="$model-$depth-$scheduler-lr=${lr//./-}"
    else
        dir="$model-$depth-$scheduler-$(IFS='-'; printf '%s' "${schedule[*]}")-lr=${lr//./-}"
    fi
fi
if [ "$scheduler" = 'step' ]; then
    dir="$dir-gamma=${gamma//./-}"
fi
if [ -n "$suffix" ]; then
    dir="${dir}_${suffix}"
fi
if [ -n "$prefix" ]; then
    dir="$prefix-$dir"
fi
if (( debug > 0 )); then
    dir="Debug-$dir"
fi
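# e.g. the defaults above (grow=true, mode='adapt', $operation unset) give:
#   dir = Batch-Lip-resnet-18-adapt-66--adacosine-lr=0-5-window=10-reserve=20-thresh=1-3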
checkpoint="checkpoints/$dataset/$dir"
[[ -f $checkpoint ]] && rm $checkpoint
i=1
while [ -d $checkpoint ]; do
echo '-----------------------------------------------------------------------------------------'
ls $checkpoint
tail -n 5 $checkpoint/train.out
read -p "Checkpoint path $checkpoint already exists. Delete[d], Rename[r], or Terminate[*]? " ans
case $ans in
d ) rm -rf $checkpoint; break;;
r ) checkpoint=${checkpoint%%_*}"_"$i;;
* ) exit;;
esac
(( i++ ))
done
if [ ! -f $checkpoint ];then
mkdir $checkpoint
fi
echo "Checkpoint path: "$checkpoint
echo 'Save main script to dir..'
cp launch.sh $checkpoint
cp train.py $checkpoint
cp -r utils $checkpoint
cp -r models $checkpoint
if [ "$grow" = true ]; then
if (( debug > 0 )); then
python train.py -d $dataset -a $model --grow --depth $depth --mode $mode --max-depth $maxdepth --epochs $epochs --grow-epoch "${dupEpoch[@]}" --threshold $thresh --window $window --reserve $reserve --hooker $hooker --scheduler $scheduler --schedule "${schedule[@]}" --gamma $gamma --wd $weight_decay --lr $lr --train-batch $train_batch --test-batch $test_batch --checkpoint "$checkpoint" --gpu-id "$gpu_id" --workers $workers --debug-batch-size $debug 2>&1 | tee "$checkpoint""/"$log_file
else
python train.py -d $dataset -a $model --grow --depth $depth --mode $mode --max-depth $maxdepth --epochs $epochs --grow-epoch "${dupEpoch[@]}" --threshold $thresh --window $window --reserve $reserve --hooker $hooker --scheduler $scheduler --schedule "${schedule[@]}" --gamma $gamma --wd $weight_decay --lr $lr --train-batch $train_batch --test-batch $test_batch --checkpoint "$checkpoint" --gpu-id "$gpu_id" --workers $workers --debug-batch-size $debug > "$checkpoint""/"$log_file 2>&1 &
fi
else
if (( debug > 0 )); then
python train.py -d $dataset -a $model --depth $depth --epochs $epochs --hooker $hooker --scheduler $scheduler --schedule "${schedule[@]}" --gamma $gamma --wd $weight_decay --lr $lr --train-batch $train_batch --test-batch $test_batch --checkpoint "$checkpoint" --gpu-id "$gpu_id" --workers $workers --debug-batch-size $debug | tee "$checkpoint""/"$log_file
else
python train.py -d $dataset -a $model --depth $depth --epochs $epochs --hooker $hooker --scheduler $scheduler --schedule "${schedule[@]}" --gamma $gamma --wd $weight_decay --lr $lr --train-batch $train_batch --test-batch $test_batch --checkpoint "$checkpoint" --gpu-id "$gpu_id" --workers $workers --debug-batch-size $debug > "$checkpoint""/"$log_file 2>&1 &
fi
fi
pid=$! # only meaningful for the backgrounded (non-debug) runs above
echo "[$pid] [$gpu_id] [Path]: $checkpoint"
if (( debug == 0 )); then
    echo "[$pid] [$gpu_id] $(date) [Path]: $checkpoint" >> log-cifar.txt
fi
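# To follow a background (non-debug) run, e.g.:
#   tail -f "$checkpoint/$log_file"
# and to stop it:
#   kill "$pid"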