@@ -41,103 +41,15 @@ export DEV_TRY_ROUND=10
 export CHECK_REMAINING_SQE_INTERVAL=10000
 export DEBUG_FILE=" /home/panlichen/work/oneflow/log/oneflow_cpu_rank_"
 
-export NUM_ITER_ENV=20
+export NUM_ITER_ENV=200
 echo NUM_ITER_ENV=$NUM_ITER_ENV
 
-if [ $GPUS = 2 ]; then
-export CUDA_VISIBLE_DEVICES=4,5
-
-# pure dp
-# export RECV_SUCCESS_FACTOR=5
-# export RECV_SUCCESS_THRESHOLD=10000
-# export BASE_CTX_SWITCH_THRESHOLD=100
-# export TOLERANT_UNPROGRESSED_CNT=2000
-# export NUM_TRY_TASKQ_HEAD=40
-
-# pure tp
-export RECV_SUCCESS_FACTOR=20
-export RECV_SUCCESS_THRESHOLD=10000
-export BASE_CTX_SWITCH_THRESHOLD=120
-export TOLERANT_UNPROGRESSED_CNT=10000
-export NUM_TRY_TASKQ_HEAD=100
-elif [ $GPUS = 4 ]; then
-export CUDA_VISIBLE_DEVICES=0,1,4,5
-export ONEFLOW_OFCCL_SKIP_NEGO=0
-
-# pure dp
-# export ONEFLOW_OFCCL_SKIP_NEGO=0
-# export RECV_SUCCESS_FACTOR=5
-# export RECV_SUCCESS_THRESHOLD=10000
-# export BASE_CTX_SWITCH_THRESHOLD=80
-# export TOLERANT_UNPROGRESSED_CNT=10000
-# export NUM_TRY_TASKQ_HEAD=50
-
-# pure tp
-export ONEFLOW_OFCCL_SKIP_NEGO=0
-export RECV_SUCCESS_FACTOR=40
-export RECV_SUCCESS_THRESHOLD=10000
-export BASE_CTX_SWITCH_THRESHOLD=3000
-export TOLERANT_UNPROGRESSED_CNT=16000
-export NUM_TRY_TASKQ_HEAD=200
-
-elif [ $GPUS = 8 ]; then
-
-# pure dp
-export ONEFLOW_OFCCL_SKIP_NEGO=0
-export RECV_SUCCESS_FACTOR=10
-export RECV_SUCCESS_THRESHOLD=10000
-export BASE_CTX_SWITCH_THRESHOLD=100000
-export TOLERANT_UNPROGRESSED_CNT=88000
-export NUM_TRY_TASKQ_HEAD=240
-
-# pure tp
-# export ONEFLOW_OFCCL_SKIP_NEGO=1
-# export RECV_SUCCESS_FACTOR=5
-# export RECV_SUCCESS_THRESHOLD=10000
-# export BASE_CTX_SWITCH_THRESHOLD=4000
-# export TOLERANT_UNPROGRESSED_CNT=8000
-# export NUM_TRY_TASKQ_HEAD=10
-
-# 3d
-# export ONEFLOW_OFCCL_SKIP_NEGO=0
-# export RECV_SUCCESS_FACTOR=5
-# export RECV_SUCCESS_THRESHOLD=10000
-# export BASE_CTX_SWITCH_THRESHOLD=8000
-# export TOLERANT_UNPROGRESSED_CNT=80000
-# export NUM_TRY_TASKQ_HEAD=10
-
-# 2dp4pp
-# export ONEFLOW_OFCCL_SKIP_NEGO=0
-# export RECV_SUCCESS_FACTOR=5
-# export RECV_SUCCESS_THRESHOLD=10000
-# export BASE_CTX_SWITCH_THRESHOLD=8000
-# export TOLERANT_UNPROGRESSED_CNT=80000
-# export NUM_TRY_TASKQ_HEAD=10
-
-# 2tp4pp
-# export ONEFLOW_OFCCL_SKIP_NEGO=1
-# export RECV_SUCCESS_FACTOR=10
-# export RECV_SUCCESS_THRESHOLD=10000
-# export BASE_CTX_SWITCH_THRESHOLD=12000
-# export TOLERANT_UNPROGRESSED_CNT=8000
-# export NUM_TRY_TASKQ_HEAD=10
-
-# 4tp2pp
-# export ONEFLOW_OFCCL_SKIP_NEGO=1
-# export RECV_SUCCESS_FACTOR=10
-# export RECV_SUCCESS_THRESHOLD=10000
-# export BASE_CTX_SWITCH_THRESHOLD=14000
-# export TOLERANT_UNPROGRESSED_CNT=8000
-# export NUM_TRY_TASKQ_HEAD=10
-
-# 4tp2dp
-# export ONEFLOW_OFCCL_SKIP_NEGO=0
-# export RECV_SUCCESS_FACTOR=5
-# export RECV_SUCCESS_THRESHOLD=10000
-# export BASE_CTX_SWITCH_THRESHOLD=8000
-# export TOLERANT_UNPROGRESSED_CNT=9000
-# export NUM_TRY_TASKQ_HEAD=10
-fi
+export ONEFLOW_OFCCL_SKIP_NEGO=0
+export RECV_SUCCESS_FACTOR=5
+export RECV_SUCCESS_THRESHOLD=10000000
+export BASE_CTX_SWITCH_THRESHOLD=20000
+export TOLERANT_UNPROGRESSED_CNT=80000
+export NUM_TRY_TASKQ_HEAD=10
 
 echo GPUS=$GPUS
 echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL
0 commit comments