From 22d8d7b6ad1ba926822e5751b41e1ad6249db5b1 Mon Sep 17 00:00:00 2001
From: ouyangyu <xuanjiuye@gmail.com>
Date: Mon, 28 Sep 2020 16:37:32 +0800
Subject: [PATCH 1/4] add scripts

---
 Classification/cnns/train_fp16.sh | 52 +++++++++++++++++++++++++++++++
 Classification/cnns/train_fp32.sh | 46 +++++++++++++++++++++++++++
 2 files changed, 98 insertions(+)
 create mode 100755 Classification/cnns/train_fp16.sh
 create mode 100755 Classification/cnns/train_fp32.sh

diff --git a/Classification/cnns/train_fp16.sh b/Classification/cnns/train_fp16.sh
new file mode 100755
index 0000000..cda1d15
--- /dev/null
+++ b/Classification/cnns/train_fp16.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Train ResNet-50 (of_cnn_train_val.py) with --use_fp16 on 1 node x 8 GPUs.
+# Usage: ./train_fp16.sh [NUM_EPOCH] [DATA_ROOT]
+#   NUM_EPOCH  value passed as --num_epoch (default: 50)
+#   DATA_ROOT  directory containing train/ and validation/ (default: /data/imagenet/ofrecord)
+
+# Remove core.* files and saved snapshots left in the working directory.
+rm -rf core.*
+rm -rf ./output/snapshots/*
+
+NUM_EPOCH=${1:-50}
+echo "NUM_EPOCH=$NUM_EPOCH"
+
+# training with imagenet
+DATA_ROOT=${2:-/data/imagenet/ofrecord}
+echo "DATA_ROOT=$DATA_ROOT"
+
+LOG_FOLDER=../logs
+mkdir -p "$LOG_FOLDER"
+LOGFILE=$LOG_FOLDER/resnet_training.log
+
+export PYTHONUNBUFFERED=1
+echo "PYTHONUNBUFFERED=$PYTHONUNBUFFERED"
+export NCCL_LAUNCH_MODE=PARALLEL
+echo "NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE"
+
+# stdout and stderr are shown on the console and copied to LOGFILE by tee.
+python3 of_cnn_train_val.py \
+     --train_data_dir="$DATA_ROOT/train" \
+     --train_data_part_num=256 \
+     --val_data_dir="$DATA_ROOT/validation" \
+     --val_data_part_num=256 \
+     --num_nodes=1 \
+     --gpu_num_per_node=8 \
+     --optimizer="sgd" \
+     --momentum=0.875 \
+     --label_smoothing=0.1 \
+     --learning_rate=1.024 \
+     --loss_print_every_n_iter=100 \
+     --batch_size_per_device=128 \
+     --val_batch_size_per_device=50 \
+     --use_fp16 \
+     --channel_last=True \
+     --pad_output \
+     --fuse_bn_relu=True \
+     --fuse_bn_add_relu=True \
+     --nccl_fusion_threshold_mb=16 \
+     --nccl_fusion_max_ops=24 \
+     --num_epoch="$NUM_EPOCH" \
+     --model="resnet50" 2>&1 | tee "${LOGFILE}"
+
+echo "Writing log to ${LOGFILE}"
diff --git a/Classification/cnns/train_fp32.sh b/Classification/cnns/train_fp32.sh
new file mode 100755
index 0000000..e8033a0
--- /dev/null
+++ b/Classification/cnns/train_fp32.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# Train ResNet-50 (of_cnn_train_val.py) without --use_fp16 on 1 node x 8 GPUs.
+# Usage: ./train_fp32.sh [NUM_EPOCH] [DATA_ROOT]
+#   NUM_EPOCH  value passed as --num_epoch (default: 50)
+#   DATA_ROOT  directory containing train/ and validation/ (default: /data/imagenet/ofrecord)
+
+# Remove core.* files and saved snapshots left in the working directory.
+rm -rf core.*
+rm -rf ./output/snapshots/*
+
+NUM_EPOCH=${1:-50}
+echo "NUM_EPOCH=$NUM_EPOCH"
+
+# training with imagenet
+# NOTE: DATA_ROOT is expanded unquoted below; avoid whitespace or glob characters in it.
+DATA_ROOT=${2:-/data/imagenet/ofrecord}
+echo "DATA_ROOT=$DATA_ROOT"
+
+LOG_FOLDER=../logs
+mkdir -p $LOG_FOLDER
+LOGFILE=$LOG_FOLDER/resnet_training.log
+
+python3 of_cnn_train_val.py \
+     --train_data_dir=$DATA_ROOT/train \
+     --train_data_part_num=256 \
+     --val_data_dir=$DATA_ROOT/validation \
+     --val_data_part_num=256 \
+     --num_nodes=1 \
+     --gpu_num_per_node=8 \
+     --optimizer="sgd" \
+     --momentum=0.875 \
+     --label_smoothing=0.1 \
+     --learning_rate=1.024 \
+     --loss_print_every_n_iter=100 \
+     --batch_size_per_device=128 \
+     --val_batch_size_per_device=50 \
+     --channel_last=True \
+     --pad_output \
+     --fuse_bn_relu=True \
+     --fuse_bn_add_relu=True \
+     --nccl_fusion_threshold_mb=16 \
+     --nccl_fusion_max_ops=24 \
+     --num_epoch="$NUM_EPOCH" \
+     --model="resnet50" 2>&1 | tee "${LOGFILE}"
+
+echo "Writing log to ${LOGFILE}"

From 7142e2f2d57f7a6defd9ed69d68d04e7b209162e Mon Sep 17 00:00:00 2001
From: ouyangyu <xuanjiuye@gmail.com>
Date: Mon, 28 Sep 2020 17:51:07 +0800
Subject: [PATCH 2/4] add env

---
 Classification/cnns/train_fp32.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Classification/cnns/train_fp32.sh b/Classification/cnns/train_fp32.sh
index e8033a0..a7e5882 100755
--- a/Classification/cnns/train_fp32.sh
+++ b/Classification/cnns/train_fp32.sh
@@ -20,6 +20,11 @@ LOG_FOLDER=../logs
 mkdir -p $LOG_FOLDER
 LOGFILE=$LOG_FOLDER/resnet_training.log
 
+export PYTHONUNBUFFERED=1  # exported so the python3 process launched below inherits it
+echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED  # echoed to the console only; this line is not piped to tee
+export NCCL_LAUNCH_MODE=PARALLEL  # NCCL setting inherited by the training process; TODO confirm why PARALLEL is needed
+echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE  # echoed to the console only; this line is not piped to tee
+
 python3 of_cnn_train_val.py \
      --train_data_dir=$DATA_ROOT/train \
      --train_data_part_num=256 \

From d45ad7f7799fc57b4a2a16d8e5dc4acd5dcbd9e6 Mon Sep 17 00:00:00 2001
From: ouyangyu <xuanjiuye@gmail.com>
Date: Tue, 29 Sep 2020 11:39:22 +0800
Subject: [PATCH 3/4] fp32 channel_last false

---
 Classification/cnns/train_fp32.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Classification/cnns/train_fp32.sh b/Classification/cnns/train_fp32.sh
index a7e5882..045467f 100755
--- a/Classification/cnns/train_fp32.sh
+++ b/Classification/cnns/train_fp32.sh
@@ -39,8 +39,7 @@ python3 of_cnn_train_val.py \
      --loss_print_every_n_iter=100 \
      --batch_size_per_device=128 \
      --val_batch_size_per_device=50 \
-     --channel_last=True \
-     --pad_output \
+     --channel_last=False \
      --fuse_bn_relu=True \
      --fuse_bn_add_relu=True \
      --nccl_fusion_threshold_mb=16 \

From 5e0054ee7a10e75d2ad3100311084158c22fc6cf Mon Sep 17 00:00:00 2001
From: ouyangyu <xuanjiuye@gmail.com>
Date: Tue, 29 Sep 2020 15:18:33 +0800
Subject: [PATCH 4/4] fp32 batch size

---
 Classification/cnns/train_fp32.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Classification/cnns/train_fp32.sh b/Classification/cnns/train_fp32.sh
index 045467f..ce06884 100755
--- a/Classification/cnns/train_fp32.sh
+++ b/Classification/cnns/train_fp32.sh
@@ -35,9 +35,9 @@ python3 of_cnn_train_val.py \
      --optimizer="sgd" \
      --momentum=0.875 \
      --label_smoothing=0.1 \
-     --learning_rate=1.024 \
+     --learning_rate=0.512 \
      --loss_print_every_n_iter=100 \
-     --batch_size_per_device=128 \
+     --batch_size_per_device=64 \
      --val_batch_size_per_device=50 \
      --channel_last=False \
      --fuse_bn_relu=True \