Skip to content

Commit 8b10188

Browse files
committed
scripts
1 parent f6e4f48 commit 8b10188

1 file changed

Lines changed: 7 additions & 95 deletions

File tree

tools/train.sh

Lines changed: 7 additions & 95 deletions
Original file line number | Diff line number | Diff line change
@@ -41,103 +41,15 @@ export DEV_TRY_ROUND=10
41 41
export CHECK_REMAINING_SQE_INTERVAL=10000
42 42
export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_"
43 43

44-
export NUM_ITER_ENV=20
44+
export NUM_ITER_ENV=200
45 45
echo NUM_ITER_ENV=$NUM_ITER_ENV
46 46

47-
if [ $GPUS = 2 ]; then
48-
export CUDA_VISIBLE_DEVICES=4,5
49-
50-
#pure dp
51-
# export RECV_SUCCESS_FACTOR=5
52-
# export RECV_SUCCESS_THRESHOLD=10000
53-
# export BASE_CTX_SWITCH_THRESHOLD=100
54-
# export TOLERANT_UNPROGRESSED_CNT=2000
55-
# export NUM_TRY_TASKQ_HEAD=40
56-
57-
#pure tp
58-
export RECV_SUCCESS_FACTOR=20
59-
export RECV_SUCCESS_THRESHOLD=10000
60-
export BASE_CTX_SWITCH_THRESHOLD=120
61-
export TOLERANT_UNPROGRESSED_CNT=10000
62-
export NUM_TRY_TASKQ_HEAD=100
63-
elif [ $GPUS = 4 ]; then
64-
export CUDA_VISIBLE_DEVICES=0,1,4,5
65-
export ONEFLOW_OFCCL_SKIP_NEGO=0
66-
67-
#pure dp
68-
# export ONEFLOW_OFCCL_SKIP_NEGO=0
69-
# export RECV_SUCCESS_FACTOR=5
70-
# export RECV_SUCCESS_THRESHOLD=10000
71-
# export BASE_CTX_SWITCH_THRESHOLD=80
72-
# export TOLERANT_UNPROGRESSED_CNT=10000
73-
# export NUM_TRY_TASKQ_HEAD=50
74-
75-
#pure tp
76-
export ONEFLOW_OFCCL_SKIP_NEGO=0
77-
export RECV_SUCCESS_FACTOR=40
78-
export RECV_SUCCESS_THRESHOLD=10000
79-
export BASE_CTX_SWITCH_THRESHOLD=3000
80-
export TOLERANT_UNPROGRESSED_CNT=16000
81-
export NUM_TRY_TASKQ_HEAD=200
82-
83-
elif [ $GPUS = 8 ]; then
84-
85-
#pure dp
86-
export ONEFLOW_OFCCL_SKIP_NEGO=0
87-
export RECV_SUCCESS_FACTOR=10
88-
export RECV_SUCCESS_THRESHOLD=10000
89-
export BASE_CTX_SWITCH_THRESHOLD=100000
90-
export TOLERANT_UNPROGRESSED_CNT=88000
91-
export NUM_TRY_TASKQ_HEAD=240
92-
93-
#pure tp
94-
# export ONEFLOW_OFCCL_SKIP_NEGO=1
95-
# export RECV_SUCCESS_FACTOR=5
96-
# export RECV_SUCCESS_THRESHOLD=10000
97-
# export BASE_CTX_SWITCH_THRESHOLD=4000
98-
# export TOLERANT_UNPROGRESSED_CNT=8000
99-
# export NUM_TRY_TASKQ_HEAD=10
100-
101-
#3d
102-
# export ONEFLOW_OFCCL_SKIP_NEGO=0
103-
# export RECV_SUCCESS_FACTOR=5
104-
# export RECV_SUCCESS_THRESHOLD=10000
105-
# export BASE_CTX_SWITCH_THRESHOLD=8000
106-
# export TOLERANT_UNPROGRESSED_CNT=80000
107-
# export NUM_TRY_TASKQ_HEAD=10
108-
109-
#2dp4pp
110-
# export ONEFLOW_OFCCL_SKIP_NEGO=0
111-
# export RECV_SUCCESS_FACTOR=5
112-
# export RECV_SUCCESS_THRESHOLD=10000
113-
# export BASE_CTX_SWITCH_THRESHOLD=8000
114-
# export TOLERANT_UNPROGRESSED_CNT=80000
115-
# export NUM_TRY_TASKQ_HEAD=10
116-
117-
#2tp4pp
118-
# export ONEFLOW_OFCCL_SKIP_NEGO=1
119-
# export RECV_SUCCESS_FACTOR=10
120-
# export RECV_SUCCESS_THRESHOLD=10000
121-
# export BASE_CTX_SWITCH_THRESHOLD=12000
122-
# export TOLERANT_UNPROGRESSED_CNT=8000
123-
# export NUM_TRY_TASKQ_HEAD=10
124-
125-
#4tp2pp
126-
# export ONEFLOW_OFCCL_SKIP_NEGO=1
127-
# export RECV_SUCCESS_FACTOR=10
128-
# export RECV_SUCCESS_THRESHOLD=10000
129-
# export BASE_CTX_SWITCH_THRESHOLD=14000
130-
# export TOLERANT_UNPROGRESSED_CNT=8000
131-
# export NUM_TRY_TASKQ_HEAD=10
132-
133-
#4tp2dp
134-
# export ONEFLOW_OFCCL_SKIP_NEGO=0
135-
# export RECV_SUCCESS_FACTOR=5
136-
# export RECV_SUCCESS_THRESHOLD=10000
137-
# export BASE_CTX_SWITCH_THRESHOLD=8000
138-
# export TOLERANT_UNPROGRESSED_CNT=9000
139-
# export NUM_TRY_TASKQ_HEAD=10
140-
fi
47+
export ONEFLOW_OFCCL_SKIP_NEGO=0
48+
export RECV_SUCCESS_FACTOR=5
49+
export RECV_SUCCESS_THRESHOLD=10000000
50+
export BASE_CTX_SWITCH_THRESHOLD=20000
51+
export TOLERANT_UNPROGRESSED_CNT=80000
52+
export NUM_TRY_TASKQ_HEAD=10
141 53

142 54
echo GPUS=$GPUS
143 55
echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL

0 commit comments

Comments
 (0)