#!/usr/bin/env bash
#
# Launches a OneFlow distributed job on the two-node test cluster.
#
# Usage: <this script> FILE CONFIG GPUS [extra args...]
#   FILE        script handed to `python3 -m oneflow.distributed.launch`
#   CONFIG      value passed to the script's --config-file option
#   GPUS        processes per node; 2, 4 and 8 have tuning presets
#   extra args  everything from the 4th argument on is appended verbatim
#
# Optional environment:
#   RUN_TYPE    PURE (default), GDB or NSYS

# Only wipe the screen when stdout is a terminal; when output is piped or
# redirected, `clear` would just inject escape sequences into the capture.
if [[ -t 1 ]]; then
  clear
fi

FILE=${1:-}
CONFIG=${2:-}
GPUS=${3:-}

# Cluster topology: node count, this node's rank, and the rendezvous address
# and port that are later passed to the launcher as --master_addr/--master_port.
#
# Environment-overridable defaults, kept for reference:
# NODE=${NODE:-1}
# NODE_RANK=${NODE_RANK:-0}
# ADDR=${ADDR:-127.0.0.1}
# PORT=${PORT:-12345}

NODE=2

#######################################
# Maps a cluster host name to its node rank.
# Arguments: $1 - host name
# Outputs:   the rank on stdout
# Returns:   0 when the host is known, 1 otherwise (nothing is printed)
#######################################
node_rank_for_host() {
  case "$1" in
    oneflow-27) printf '0\n' ;;
    oneflow-25) printf '1\n' ;;
    *) return 1 ;;
  esac
}

# bash itself sets HOSTNAME, not HOST, so fall back to the short host name
# when HOST was not provided through the environment.
current_host=${HOST:-${HOSTNAME%%.*}}
if resolved_rank=$(node_rank_for_host "$current_host"); then
  NODE_RANK=$resolved_rank
else
  # Keep any NODE_RANK inherited from the environment, but say so loudly.
  printf 'warning: no node rank mapping for host "%s"; NODE_RANK stays "%s"\n' \
    "$current_host" "${NODE_RANK:-}" >&2
fi
printf '%s\n' "${NODE_RANK:-}"

# NOTE(review): presumably the address of the rank-0 node (oneflow-27) - confirm.
ADDR=11.11.1.27
PORT=12345

# Environment shared by every run, independent of the GPU count.

export GLOG_logtostderr=1
export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=0 # disable the lightweight actor

export NCCL_PROTO=Simple
export NCCL_ALGO=Ring
# export NCCL_MAX_NCHANNELS=1
# export NCCL_MIN_NCHANNELS=1
# export NCCL_NTHREADS=64

# How the launcher is wrapped: PURE (default), GDB or NSYS.
# A value already present in the environment wins.
RUN_TYPE=${RUN_TYPE:-PURE}

export ONEFLOW_ENABLE_OFCCL=1
export DISABLE_NCCL_COMPUTE_STREAM=1
# export ONEFLOW_TIME_SHAPE=1
export ONEFLOW_DEBUG_MODE=1
export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1

# glog per-module verbosity: each source-file pattern below is raised to
# verbose level 1. The list is joined into "<module>*=1,<module>*=1,...".
glog_verbose_modules=(
  plan_util
  of_collective_actor
  of_collective_boxing_kernels
  collective_backend_ofccl
  hierarchical_sub_task_graph_builder_impl
  of_request_store
  request_store
  runtime
  scheduler
  collective_manager
  of_collective_boxing_sub_task_graph_builder
  collective_boxing_sub_task_graph_builder
  # nn_graph
)
glog_vmodule_spec=$(printf '%s*=1,' "${glog_verbose_modules[@]}")
export GLOG_vmodule="${glog_vmodule_spec%,}" # drop the trailing comma
# export GLOG_v=1

export SHOW_ALL_PREPARED_COLL=1

export DEV_TRY_ROUND=10
export CHECK_REMAINING_SQE_INTERVAL=10000
# Path prefix only; NOTE(review): presumably a per-rank suffix is appended by
# the consumer - confirm.
export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_"

export NUM_ITER_ENV=20
printf 'NUM_ITER_ENV=%s\n' "$NUM_ITER_ENV"

#######################################
# Exports the five OFCCL tuning knobs in one call.
# Arguments:
#   $1 - RECV_SUCCESS_FACTOR
#   $2 - RECV_SUCCESS_THRESHOLD
#   $3 - BASE_CTX_SWITCH_THRESHOLD
#   $4 - TOLERANT_UNPROGRESSED_CNT
#   $5 - NUM_TRY_TASKQ_HEAD
#######################################
export_ofccl_tuning() {
  export RECV_SUCCESS_FACTOR=$1
  export RECV_SUCCESS_THRESHOLD=$2
  export BASE_CTX_SWITCH_THRESHOLD=$3
  export TOLERANT_UNPROGRESSED_CNT=$4
  export NUM_TRY_TASKQ_HEAD=$5
}

#######################################
# Selects the visible devices and the tuning preset for a per-node GPU count.
#
# Preset tables list, in order:
#   SKIP_NEGO FACTOR THRESHOLD CTX_SWITCH TOLERANT TASKQ_HEAD
# where SKIP_NEGO is ONEFLOW_OFCCL_SKIP_NEGO ("-" = not set by the preset) and
# the rest are the export_ofccl_tuning arguments. To switch workload, copy
# the wanted row into the branch.
#
# Arguments: $1 - GPU count (2, 4 or 8)
# Returns:   0 when a preset was applied; 1 for any other value, in which
#            case nothing is exported
#######################################
apply_gpu_tuning() {
  case "$1" in
    2)
      export CUDA_VISIBLE_DEVICES=4,5
      #   pure dp : - 5  10000 100 2000  40
      #   pure tp : - 20 10000 120 10000 100   <- active
      export_ofccl_tuning 20 10000 120 10000 100
      ;;
    4)
      export CUDA_VISIBLE_DEVICES=0,1,4,5
      export ONEFLOW_OFCCL_SKIP_NEGO=0
      #   pure dp : 0 40 10000      30000  30000 200
      #   pure tp : 0 40 1000000000 100000 16000 200   <- active
      export_ofccl_tuning 40 1000000000 100000 16000 200
      ;;
    8)
      # No CUDA_VISIBLE_DEVICES override here: whatever the caller exported
      # (or nothing) is inherited.
      export ONEFLOW_OFCCL_SKIP_NEGO=0
      #   pure dp : 0 30 100000000 120000 180000 240
      #   pure tp : 0 10 1000000   6000   8000   10
      #   4tp2dp  : 0 10 10000000  20000  9000   10
      #   3d      : 0 5  10000000  20000  80000  10   <- active
      #   2dp4pp  : 0 5  10000     8000   80000  10
      #   2tp4pp  : 1 10 10000     12000  8000   10
      #   4tp2pp  : 1 10 10000     14000  8000   10
      export_ofccl_tuning 5 10000000 20000 80000 10
      ;;
    *)
      return 1
      ;;
  esac
}

if ! apply_gpu_tuning "${GPUS:-}"; then
  printf 'warning: no tuning preset for GPUS="%s" (expected 2, 4 or 8); continuing without one\n' \
    "${GPUS:-}" >&2
fi

export PYTHONUNBUFFERED=1
export NCCL_LAUNCH_MODE=PARALLEL
# export NCCL_DEBUG=INFO

#######################################
# Prints the effective launch configuration, one NAME=value line per
# variable. Unset variables print as "NAME=".
#
# The label is derived from the variable name itself, so a label can never
# drift from the value it reports.
#
# Globals:   reads every variable named in the list below
# Outputs:   NAME=value lines on stdout
#######################################
print_launch_config() {
  local reported_var
  for reported_var in \
    GPUS \
    ONEFLOW_ENABLE_OFCCL \
    ONEFLOW_OFCCL_SKIP_NEGO \
    ONEFLOW_DEBUG_MODE \
    ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE \
    NCCL_PROTO \
    NCCL_ALGO \
    NCCL_MAX_NCHANNELS \
    NCCL_NTHREADS \
    ONEFLOW_OFCCL_CHAIN \
    GLOG_vmodule \
    GLOG_v \
    GLOG_logtostderr \
    RECV_SUCCESS_FACTOR \
    RECV_SUCCESS_THRESHOLD \
    TOLERANT_UNPROGRESSED_CNT \
    BASE_CTX_SWITCH_THRESHOLD \
    NUM_TRY_TASKQ_HEAD \
    DEV_TRY_ROUND \
    CHECK_REMAINING_SQE_INTERVAL \
    DEBUG_FILE \
    PYTHONUNBUFFERED \
    NCCL_LAUNCH_MODE; do
    # ${!name-} is indirect expansion with an empty default; printf with a
    # quoted argument keeps values containing '*' or spaces intact.
    printf '%s=%s\n' "$reported_var" "${!reported_var-}"
  done
}

print_launch_config

# Start every run from empty log directories: each one is removed together
# with its previous contents and then recreated.
for log_dir in \
  /home/panlichen/work/libai/log \
  /home/panlichen/work/oneflow/log; do
  rm -rf -- "$log_dir"
  if ! mkdir -p -- "$log_dir"; then
    # Without the directory the later log redirection cannot succeed, so stop
    # here with a clear message instead of failing obscurely at launch time.
    printf 'error: cannot create log directory %s\n' "$log_dir" >&2
    exit 1
  fi
done

export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true

#######################################
# Builds the command prefix that starts oneflow.distributed.launch.
# Arguments:
#   $1 - run type: PURE (plain), GDB (under gdb) or NSYS (under nsys profile)
#   $2 - nsys report path given to -o; only used for NSYS
# Outputs:   the command prefix as a single line on stdout
# Returns:   0 for a known run type, 1 otherwise (nothing is printed)
#######################################
launcher_for() {
  local run_type=$1
  local nsys_report=$2
  local launch_module="python3 -m oneflow.distributed.launch"
  case "$run_type" in
    PURE)
      printf '%s\n' "$launch_module"
      ;;
    GDB)
      printf '%s\n' "gdb -ex r --args $launch_module"
      ;;
    NSYS)
      # Variant with explicit trace selection, kept for reference:
      # nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o <report> ...
      printf '%s\n' "nsys profile -f true -o $nsys_report $launch_module"
      ;;
    *)
      return 1
      ;;
  esac
}

# Report name encodes the collective backend, the host and the GPU count.
# bash sets HOSTNAME rather than HOST, so use it when HOST is not provided.
report_host=${HOST:-${HOSTNAME%%.*}}
if [[ "${ONEFLOW_ENABLE_OFCCL:-}" == "1" ]]; then
  NSYS_FILE="ofccl_vit_${report_host}_${GPUS:-}_card"
else
  NSYS_FILE="nccl_vit_${report_host}_${GPUS:-}_card"
fi

selected_run_type=${RUN_TYPE:-PURE}
nsys_dir=/home/panlichen/work/oneflow/log/nsys

# cmd stays a plain string because the launch line expands it unquoted.
if ! cmd=$(launcher_for "$selected_run_type" "$nsys_dir/$NSYS_FILE"); then
  # Previously an unknown RUN_TYPE left cmd empty and the launch line tried to
  # execute "--nproc_per_node" as a program.
  printf 'error: unsupported RUN_TYPE "%s" (expected PURE, GDB or NSYS)\n' \
    "$selected_run_type" >&2
  exit 1
fi

if [[ "$selected_run_type" == "NSYS" ]]; then
  mkdir -p -- "$nsys_dir" # -p already tolerates an existing directory
fi
printf 'cmd=%s\n' "$cmd"

# Launch the job. The launcher prefix arrives as a plain string in $cmd, so it
# is split into words exactly once, on purpose, and every other argument is
# then passed quoted.
read -r -a launcher <<< "${cmd:-}"
if ((${#launcher[@]} == 0)); then
  printf 'error: launcher command is empty; check RUN_TYPE\n' >&2
  exit 1
fi

# Quoted empty values would reach the launcher as empty arguments, so refuse
# to start when anything required is missing.
for required_var in FILE CONFIG GPUS NODE NODE_RANK ADDR PORT; do
  if [[ -z "${!required_var-}" ]]; then
    printf 'error: %s is empty\nusage: %s FILE CONFIG GPUS [extra args...]\n' \
      "$required_var" "${0##*/}" >&2
    exit 1
  fi
done

# Arguments from the 4th on are forwarded verbatim after the config file.
# stdout and stderr of the whole run are captured in oneflow.log; the script's
# exit status is the launcher's.
"${launcher[@]}" \
  --nproc_per_node "$GPUS" --nnodes "$NODE" --node_rank "$NODE_RANK" \
  --master_addr "$ADDR" --master_port "$PORT" \
  "$FILE" --config-file "$CONFIG" "${@:4}" \
  > /home/panlichen/work/oneflow/log/oneflow.log 2>&1