Commit e44e507

test-maxtext.sh: support user-defined XLA flags (#763)
1. Added support for user-defined XLA flags.
2. Changed the source of the PGO converter script to JAX main.
1 parent 46ab5a1 commit e44e507
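
With this change, the script's default XLA flag set can be replaced or extended from the caller's environment instead of being edited in place. A minimal usage sketch (the output path is illustrative; both example flags appear in the script's default flag set):

```bash
# Replace the built-in default flag set entirely: the script only falls back to
# its defaults when BASE_XLA_FLAGS is unset (via ${BASE_XLA_FLAGS:-...}).
BASE_XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=false" \
  test-maxtext.sh --output /tmp/maxtext-test --steps 10

# Append extra flags on top of the defaults: the script exports
# XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}", so a pre-set XLA_FLAGS is kept.
XLA_FLAGS="--xla_gpu_enable_triton_gemm=true" \
  test-maxtext.sh --output /tmp/maxtext-test --steps 10
```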

3 files changed: 41 additions & 11 deletions

.github/container/test-maxtext.sh

Lines changed: 34 additions & 10 deletions
@@ -13,10 +13,11 @@ usage() {
 echo ""
 echo " OPTIONS DESCRIPTION"
 echo " -a, --additional-args Additional fiddle args to pass to MaxText/train.py"
+echo " --mem-fraction Specify the percentage of memory to preallocate for XLA. Example: 0.90, 0.85, 0.65"
+echo " --model-name Specify model to run. Example: llama2-7b, default"
+echo " --attn-type Specify the attention type. Example: dot_product, cudnn_flash_te"
 echo " -b, --batch-per-gpu Batch size per GPU, defaults to 2."
 echo " --dtype Batch size, defaults to bfloat16."
-echo " --enable-te If set, will run with env var ENABLE_TE=1."
-echo " --enable-fused-attn If set, will run with env var NVTE_FUSED_ATTN=1."
 echo " -s, --steps Number of steps to run, defaults to 500."
 echo " --multiprocess Enable the multiprocess GPU mode."
 echo " -o, --output NAME Name for the output folder, a temporary folder will be created if none specified."
@@ -29,15 +30,18 @@ usage() {
 exit $1
 }

-args=$(getopt -o a:b:s:o:n:h --long additional-args:,batch-per-gpu:,dtype:,enable-te,enable-fused-attn,steps:,help,multiprocess,output:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@")
+args=$(getopt -o a:b:s:o:n:h --long additional-args:,mem-fraction:,model-name:,attn-type:,batch-per-gpu:,dtype:,steps:,help,multiprocess,output:,data-parallel:,fsdp:,tensor-parallel:,pipeline-parallel:,nodes: -- "$@")
 if [[ $? -ne 0 ]]; then
 exit $1
 fi

 # Default arguments
 HARDWARE='gpu'
 OUTPUT=$(mktemp -d)
+MEM_FRACTION=0.65

+MODEL_NAME='llama2-7b'
+ATTN_TYPE='dot_product'
 BATCH_PER_GPU=2
 DTYPE="bfloat16"
 STEPS=10
@@ -46,7 +50,6 @@ FSDP=1
 TP=1
 PP=1
 NODES=1
-ENABLE_TE=0
 ENABLE_FUSED_ATTN=0
 ADDITIONAL_ARGS=""

@@ -57,6 +60,18 @@ while [ : ]; do
 ADDITIONAL_ARGS="$2"
 shift 2
 ;;
+--mem-fraction)
+MEM_FRACTION="$2"
+shift 2
+;;
+--model-name)
+MODEL_NAME="$2"
+shift 2
+;;
+--attn-type)
+ATTN_TYPE="$2"
+shift 2
+;;
 -b | --batch-per-gpu)
 BATCH_PER_GPU="$2"
 shift 2
@@ -130,13 +145,20 @@ else
 ici_DP=$DP
 fi

+if [ $ATTN_TYPE -eq 'cudnn_flash_te' ]
+then
+ENABLE_FUSED_ATTN=1
+fi
+
+print_var MEM_FRACTION
+print_var MODEL_NAME
+print_var ATTN_TYPE
 print_var BATCH_PER_GPU
 print_var DTYPE
 print_var STEPS
 print_var NGPUS
 print_var HARDWARE
 print_var OUTPUT
-print_var ENABLE_TE
 print_var ENABLE_FUSED_ATTN
 print_var DP
 print_var ici_DP
@@ -152,10 +174,10 @@ pushd ${MAXTEXT_DIR}
 set -ex

 export NVTE_FUSED_ATTN=${ENABLE_FUSED_ATTN}
-export XLA_PYTHON_CLIENT_MEM_FRACTION=0.65
+export XLA_PYTHON_CLIENT_MEM_FRACTION=${MEM_FRACTION}
 export CUDA_DEVICE_MAX_CONNECTIONS=1

-export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
+export BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_async_all_gather=true
 --xla_gpu_enable_async_reduce_scatter=true
 --xla_gpu_enable_triton_gemm=false
@@ -173,12 +195,14 @@ export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_softmax_fusion=false
 --xla_gpu_enable_all_gather_combine_by_dim=false
 --xla_gpu_enable_reduce_scatter_combine_by_dim=false
---xla_disable_hlo_passes=rematerialization"
+--xla_disable_hlo_passes=rematerialization}
+
+export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"

 RUN_NAME="logdir" ## the RUN_NAME cannot be changed

-RUN_SETTINGS="MaxText/train.py MaxText/configs/base.yml run_name=${RUN_NAME} logits_via_embedding=true decoder_block=default \
-steps=$STEPS per_device_batch_size=2 base_emb_dim=2560 base_mlp_dim=8192 remat_policy=minimal attention=dot_product\
+RUN_SETTINGS="MaxText/train.py MaxText/configs/base.yml run_name=${RUN_NAME} logits_via_embedding=true decoder_block=${MODEL_NAME} \
+steps=$STEPS per_device_batch_size=${BATCH_PER_GPU} base_emb_dim=2560 base_mlp_dim=8192 remat_policy=minimal attention=${ATTN_TYPE}\
 base_num_query_heads=8 base_num_kv_heads=8 base_num_decoder_layers=8 head_dim=128 enable_checkpointing=false\
 base_output_directory=$OUTPUT dataset_path=local dataset_type=synthetic hardware=$HARDWARE\
 dcn_fsdp_parallelism=1 ici_fsdp_parallelism=$FSDP\
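
For reference, a local invocation that exercises the new options might look like the sketch below (the output path is illustrative; option names and example values come from the help text above):

```bash
# Run the llama2-7b config with the cuDNN flash attention path and a larger
# preallocation fraction; per the diff above, cudnn_flash_te is intended to
# enable fused attention (NVTE_FUSED_ATTN=1).
test-maxtext.sh \
  --output /tmp/maxtext-test \
  --model-name llama2-7b \
  --attn-type cudnn_flash_te \
  --mem-fraction 0.90 \
  --batch-per-gpu 2 \
  --steps 10
```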

.github/workflows/_test_maxtext.yaml

Lines changed: 6 additions & 0 deletions
@@ -117,6 +117,9 @@ jobs:
 test-maxtext.sh \
 --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \
 --dtype bfloat16 \
+--mem-fraction 0.65 \
+--model-name default \
+--attn-type dot_product \
 --batch-per-gpu 2 \
 --steps 10 \
 --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \
@@ -267,6 +270,9 @@ jobs:
 test-maxtext.sh \
 --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \
 --dtype bfloat16 \
+--mem-fraction 0.65 \
+--model-name default \
+--attn-type dot_product \
 --batch-per-gpu 2 \
 --steps 10 \
 --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \

rosetta/docs/PGLE.md

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=false
 ```
 The main reason to do this is to not have any overlaps so that we can get exact costs for different ops.

-2. **Generate protobuf**: Once we have the nsys profile generated, we pass it to the python script provided [here [pgo_nsys_converter.py]](https://github.com/abhinavgoel95/jax/blob/patch-1/jax/tools/pgo_nsys_converter.py) to generate the pbtxt file. A sample pbtxt file would look like this:
+2. **Generate protobuf**: Once we have the nsys profile generated, we pass it to the python script provided [here [pgo_nsys_converter.py]](https://github.com/google/jax/blob/main/jax/tools/pgo_nsys_converter.py) to generate the pbtxt file. A sample pbtxt file would look like this:
 ```
 ...
 costs { name: "all-gather-start.1" cost_us: 7040.5215 }
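
With the converter now referenced from JAX main, it can be pulled straight from the upstream repository. A hedged sketch of an invocation (the argument names are assumptions, not taken from this diff; check the script's --help for the exact interface):

```bash
# Illustrative: clone upstream JAX and run the converter on an nsys profile.
git clone --depth 1 https://github.com/google/jax.git
python jax/jax/tools/pgo_nsys_converter.py \
  --profile_path profile.nsys-rep \
  --post_process \
  --pbtxt_file_name profile.pbtxt
```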
