Skip to content

Commit f6e4f48

Browse files
committed
+ 2 machine script
1 parent 985119b commit f6e4f48

File tree

2 files changed

+218
-3
lines changed

2 files changed

+218
-3
lines changed

configs/vit_imagenet.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,9 @@
8888

8989
# Distributed Settings
9090
train.dist.pipeline_num_layers = model.cfg.depth
91-
train.dist.data_parallel_size = 2
92-
train.dist.tensor_parallel_size = 2
93-
train.dist.pipeline_parallel_size = 2
91+
train.dist.data_parallel_size = 16
92+
train.dist.tensor_parallel_size = 1
93+
train.dist.pipeline_parallel_size = 1
9494

9595
# train.num_accumulation_steps = train.dist.pipeline_parallel_size
9696
# global_batch_size = micro_batch_size * num_grad_acc * data_parallel_groups

tools/train_27_25.sh

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
#!/usr/bin/env bash
2+
3+
clear
4+
5+
FILE=$1
6+
CONFIG=$2
7+
GPUS=$3
8+
9+
# NODE=${NODE:-1}
10+
# NODE_RANK=${NODE_RANK:-0}
11+
# ADDR=${ADDR:-127.0.0.1}
12+
# PORT=${PORT:-12345}
13+
14+
NODE=2
15+
16+
# bash sets HOSTNAME automatically but not HOST; fall back to it so the
# node-rank selection also works when HOST was not exported by the caller.
HOST=${HOST:-$HOSTNAME}
if [[ $HOST = "oneflow-27" ]]; then
17+
NODE_RANK=0
18+
elif [[ $HOST = "oneflow-25" ]]; then
19+
NODE_RANK=1
20+
fi
21+
echo $NODE_RANK
22+
23+
ADDR=11.11.1.27
24+
PORT=12345
25+
26+
export GLOG_logtostderr=1
27+
export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=0 # disable the lightweight actor
28+
29+
export NCCL_PROTO=Simple
30+
export NCCL_ALGO=Ring
31+
# export NCCL_MAX_NCHANNELS=1
32+
# export NCCL_MIN_NCHANNELS=1
33+
# export NCCL_NTHREADS=64
34+
35+
# Fall back to the default run type only when the caller did not set RUN_TYPE
# (quoted so an empty or whitespace-containing value is tested correctly).
if [ -z "$RUN_TYPE" ]; then
36+
RUN_TYPE="PURE"
37+
# RUN_TYPE="GDB"
38+
# RUN_TYPE="NSYS"
39+
fi
40+
41+
export ONEFLOW_ENABLE_OFCCL=1
42+
export DISABLE_NCCL_COMPUTE_STREAM=1
43+
# export ONEFLOW_TIME_SHAPE=1
44+
export ONEFLOW_DEBUG_MODE=1
45+
export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1
46+
47+
export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1,of_collective_boxing_sub_task_graph_builder*=1,collective_boxing_sub_task_graph_builder*=1
48+
# nn_graph*=1,
49+
# export GLOG_v=1
50+
51+
export SHOW_ALL_PREPARED_COLL=1
52+
53+
export DEV_TRY_ROUND=10
54+
export CHECK_REMAINING_SQE_INTERVAL=10000
55+
export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_"
56+
57+
export NUM_ITER_ENV=20
58+
echo NUM_ITER_ENV=$NUM_ITER_ENV
59+
60+
if [ $GPUS = 2 ]; then
61+
export CUDA_VISIBLE_DEVICES=4,5
62+
63+
#pure dp
64+
# export RECV_SUCCESS_FACTOR=5
65+
# export RECV_SUCCESS_THRESHOLD=10000
66+
# export BASE_CTX_SWITCH_THRESHOLD=100
67+
# export TOLERANT_UNPROGRESSED_CNT=2000
68+
# export NUM_TRY_TASKQ_HEAD=40
69+
70+
#pure tp
71+
export RECV_SUCCESS_FACTOR=20
72+
export RECV_SUCCESS_THRESHOLD=10000
73+
export BASE_CTX_SWITCH_THRESHOLD=120
74+
export TOLERANT_UNPROGRESSED_CNT=10000
75+
export NUM_TRY_TASKQ_HEAD=100
76+
elif [ $GPUS = 4 ]; then
77+
export CUDA_VISIBLE_DEVICES=0,1,4,5
78+
export ONEFLOW_OFCCL_SKIP_NEGO=0
79+
80+
#pure dp
81+
# export ONEFLOW_OFCCL_SKIP_NEGO=0
82+
# export RECV_SUCCESS_FACTOR=40
83+
# export RECV_SUCCESS_THRESHOLD=10000
84+
# export BASE_CTX_SWITCH_THRESHOLD=30000
85+
# export TOLERANT_UNPROGRESSED_CNT=30000
86+
# export NUM_TRY_TASKQ_HEAD=200
87+
88+
#pure tp
89+
export ONEFLOW_OFCCL_SKIP_NEGO=0
90+
export RECV_SUCCESS_FACTOR=40
91+
export RECV_SUCCESS_THRESHOLD=1000000000
92+
export BASE_CTX_SWITCH_THRESHOLD=100000
93+
export TOLERANT_UNPROGRESSED_CNT=16000
94+
export NUM_TRY_TASKQ_HEAD=200
95+
96+
elif [ $GPUS = 8 ]; then
97+
98+
#pure dp
99+
# export ONEFLOW_OFCCL_SKIP_NEGO=0
100+
# export RECV_SUCCESS_FACTOR=30
101+
# export RECV_SUCCESS_THRESHOLD=100000000
102+
# export BASE_CTX_SWITCH_THRESHOLD=120000
103+
# export TOLERANT_UNPROGRESSED_CNT=180000
104+
# export NUM_TRY_TASKQ_HEAD=240
105+
106+
#pure tp
107+
# export ONEFLOW_OFCCL_SKIP_NEGO=0
108+
# export RECV_SUCCESS_FACTOR=10
109+
# export RECV_SUCCESS_THRESHOLD=1000000
110+
# export BASE_CTX_SWITCH_THRESHOLD=6000
111+
# export TOLERANT_UNPROGRESSED_CNT=8000
112+
# export NUM_TRY_TASKQ_HEAD=10
113+
114+
#4tp2dp
115+
# export ONEFLOW_OFCCL_SKIP_NEGO=0
116+
# export RECV_SUCCESS_FACTOR=10
117+
# export RECV_SUCCESS_THRESHOLD=10000000
118+
# export BASE_CTX_SWITCH_THRESHOLD=20000
119+
# export TOLERANT_UNPROGRESSED_CNT=9000
120+
# export NUM_TRY_TASKQ_HEAD=10
121+
122+
#3d
123+
export ONEFLOW_OFCCL_SKIP_NEGO=0
124+
export RECV_SUCCESS_FACTOR=5
125+
export RECV_SUCCESS_THRESHOLD=10000000
126+
export BASE_CTX_SWITCH_THRESHOLD=20000
127+
export TOLERANT_UNPROGRESSED_CNT=80000
128+
export NUM_TRY_TASKQ_HEAD=10
129+
130+
#2dp4pp
131+
# export ONEFLOW_OFCCL_SKIP_NEGO=0
132+
# export RECV_SUCCESS_FACTOR=5
133+
# export RECV_SUCCESS_THRESHOLD=10000
134+
# export BASE_CTX_SWITCH_THRESHOLD=8000
135+
# export TOLERANT_UNPROGRESSED_CNT=80000
136+
# export NUM_TRY_TASKQ_HEAD=10
137+
138+
#2tp4pp
139+
# export ONEFLOW_OFCCL_SKIP_NEGO=1
140+
# export RECV_SUCCESS_FACTOR=10
141+
# export RECV_SUCCESS_THRESHOLD=10000
142+
# export BASE_CTX_SWITCH_THRESHOLD=12000
143+
# export TOLERANT_UNPROGRESSED_CNT=8000
144+
# export NUM_TRY_TASKQ_HEAD=10
145+
146+
#4tp2pp
147+
# export ONEFLOW_OFCCL_SKIP_NEGO=1
148+
# export RECV_SUCCESS_FACTOR=10
149+
# export RECV_SUCCESS_THRESHOLD=10000
150+
# export BASE_CTX_SWITCH_THRESHOLD=14000
151+
# export TOLERANT_UNPROGRESSED_CNT=8000
152+
# export NUM_TRY_TASKQ_HEAD=10
153+
154+
fi
155+
156+
echo GPUS=$GPUS
157+
echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL
158+
echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_OFCCL_SKIP_NEGO
159+
echo ONEFLOW_DEBUG_MODE=$ONEFLOW_DEBUG_MODE
160+
echo ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=$ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE
161+
echo NCCL_PROTO=$NCCL_PROTO
162+
echo NCCL_ALGO=$NCCL_ALGO
163+
echo NCCL_MAX_NCHANNELS=$NCCL_MAX_NCHANNELS
164+
echo NCCL_NTHREADS=$NCCL_NTHREADS
165+
echo ONEFLOW_OFCCL_CHAIN=$ONEFLOW_OFCCL_CHAIN
166+
echo GLOG_vmodule=$GLOG_vmodule
167+
echo GLOG_v=$GLOG_v
168+
echo GLOG_logtostderr=$GLOG_logtostderr
169+
170+
echo RECV_SUCCESS_FACTOR=$RECV_SUCCESS_FACTOR
171+
echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT
172+
echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD
173+
echo NUM_TRY_TASKQ_HEAD=$NUM_TRY_TASKQ_HEAD
174+
echo DEV_TRY_ROUND=$DEV_TRY_ROUND
175+
echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL
176+
echo DEBUG_FILE=$DEBUG_FILE
177+
178+
export PYTHONUNBUFFERED=1
179+
echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
180+
export NCCL_LAUNCH_MODE=PARALLEL
181+
echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
182+
# export NCCL_DEBUG=INFO
183+
184+
rm -rf /home/panlichen/work/libai/log
185+
mkdir -p /home/panlichen/work/libai/log
186+
187+
rm -rf /home/panlichen/work/oneflow/log
188+
mkdir -p /home/panlichen/work/oneflow/log
189+
190+
export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true
191+
192+
if [ "$ONEFLOW_ENABLE_OFCCL" == "1" ]; then
193+
NSYS_FILE="ofccl_vit"_${HOST}_${GPUS}_card
194+
else
195+
NSYS_FILE="nccl_vit"_${HOST}_${GPUS}_card
196+
fi
197+
198+
if [ "$RUN_TYPE" == "PURE" ];then
199+
cmd="python3 -m oneflow.distributed.launch"
200+
elif [ "$RUN_TYPE" == "GDB" ];then
201+
cmd="gdb -ex r --args python3 -m oneflow.distributed.launch"
202+
elif [ "$RUN_TYPE" == "NSYS" ];then
203+
if [ ! -d "/home/panlichen/work/oneflow/log/nsys" ];then
204+
mkdir -p /home/panlichen/work/oneflow/log/nsys
205+
fi
206+
# cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch"
207+
cmd="nsys profile -f true -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch"
208+
fi
209+
echo cmd=$cmd
210+
211+
$cmd \
212+
--nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \
213+
"$FILE" --config-file "$CONFIG" "${@:4}" \
214+
> /home/panlichen/work/oneflow/log/oneflow.log 2>&1
215+

0 commit comments

Comments
 (0)