Commit e6c4d5f

Merge branch 'main' into anisha-mistral-fixit

A9isha authored Jan 17, 2025
2 parents f7d7e76 + 6d8a42b commit e6c4d5f

Showing 10 changed files with 12 additions and 13 deletions.
1 change: 0 additions & 1 deletion MaxText/configs/README.md
@@ -103,7 +103,6 @@ Here are some of the most common XLA compiler flags used by MaxText.
| xla_gpu_enable_pipelined_reduce_scatter | Boolean (true/false) | Enable pipelining of reduce-scatter instructions. <br> **Usage:** [a3/Llama2-7B 1vm](/MaxText/configs/a3/llama_2_7B/1vm.sh) [a3/Llama2-7B 2vm](/MaxText/configs/a3/llama_2_7B/2vm.sh) [a3/Llama2-7B 4vm](/MaxText/configs/a3/llama_2_7B/4vm.sh) [a3/Llama2-7B 8vm](/MaxText/configs/a3/llama_2_7B/8vm.sh) [a3/Llama2-7B 16vm](/MaxText/configs/a3/llama_2_7B/16vm.sh) |
| xla_gpu_enable_pipelined_all_reduce | Boolean (true/false) | Enable pipelining of all-reduce instructions. <br> **Usage:** [a3/Llama2-7B 1vm](/MaxText/configs/a3/llama_2_7B/1vm.sh) [a3/Llama2-7B 2vm](/MaxText/configs/a3/llama_2_7B/2vm.sh) [a3/Llama2-7B 4vm](/MaxText/configs/a3/llama_2_7B/4vm.sh) [a3/Llama2-7B 8vm](/MaxText/configs/a3/llama_2_7B/8vm.sh) [a3/Llama2-7B 16vm](/MaxText/configs/a3/llama_2_7B/16vm.sh) |
| xla_gpu_enable_while_loop_double_buffering | Boolean (true/false) | Enable double-buffering for while loops. <br> **Usage:** [a3/Llama2-7B 1vm](/MaxText/configs/a3/llama_2_7B/1vm.sh) [a3/Llama2-7B 2vm](/MaxText/configs/a3/llama_2_7B/2vm.sh) [a3/Llama2-7B 4vm](/MaxText/configs/a3/llama_2_7B/4vm.sh) [a3/Llama2-7B 8vm](/MaxText/configs/a3/llama_2_7B/8vm.sh) [a3/Llama2-7B 16vm](/MaxText/configs/a3/llama_2_7B/16vm.sh) |
| xla_gpu_enable_triton_softmax_fusion | Boolean (true/false) | Use Triton-based Softmax fusion. <br> **Usage:** [a3/Llama2-7B 1vm](/MaxText/configs/a3/llama_2_7B/1vm.sh) [a3/Llama2-7B 2vm](/MaxText/configs/a3/llama_2_7B/2vm.sh) [a3/Llama2-7B 4vm](/MaxText/configs/a3/llama_2_7B/4vm.sh) [a3/Llama2-7B 8vm](/MaxText/configs/a3/llama_2_7B/8vm.sh) [a3/Llama2-7B 16vm](/MaxText/configs/a3/llama_2_7B/16vm.sh) |
| xla_gpu_enable_all_gather_combine_by_dim | Boolean (true/false) | If true, combine only all-gather ops that share the same gather dimension; if false, combine them irrespective of dimension. <br> **Usage:** [a3/Llama2-7B 1vm](/MaxText/configs/a3/llama_2_7B/1vm.sh) [a3/Llama2-7B 2vm](/MaxText/configs/a3/llama_2_7B/2vm.sh) [a3/Llama2-7B 4vm](/MaxText/configs/a3/llama_2_7B/4vm.sh) [a3/Llama2-7B 8vm](/MaxText/configs/a3/llama_2_7B/8vm.sh) [a3/Llama2-7B 16vm](/MaxText/configs/a3/llama_2_7B/16vm.sh) |
| xla_gpu_enable_reduce_scatter_combine_by_dim | Boolean (true/false) | If true, combine only reduce-scatter ops that share the same dimension; if false, combine them irrespective of dimension. <br> **Usage:** [a3/Llama2-7B 1vm](/MaxText/configs/a3/llama_2_7B/1vm.sh) [a3/Llama2-7B 2vm](/MaxText/configs/a3/llama_2_7B/2vm.sh) [a3/Llama2-7B 4vm](/MaxText/configs/a3/llama_2_7B/4vm.sh) [a3/Llama2-7B 8vm](/MaxText/configs/a3/llama_2_7B/8vm.sh) [a3/Llama2-7B 16vm](/MaxText/configs/a3/llama_2_7B/16vm.sh) |
| xla_disable_hlo_passes | String (comma-separated list of pass names) | Comma-separated list of HLO passes to be disabled. These names must exactly match the pass name (no whitespace around commas). <br> **Usage:** [a3/Llama2-7B 1vm](/MaxText/configs/a3/llama_2_7B/1vm.sh) [a3/Llama2-7B 2vm](/MaxText/configs/a3/llama_2_7B/2vm.sh) [a3/Llama2-7B 4vm](/MaxText/configs/a3/llama_2_7B/4vm.sh) [a3/Llama2-7B 8vm](/MaxText/configs/a3/llama_2_7B/8vm.sh) [a3/Llama2-7B 16vm](/MaxText/configs/a3/llama_2_7B/16vm.sh) |
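Flags like those in the table above are typically concatenated into a single `XLA_FLAGS` environment variable before launching a run, which is what the `a3` launch scripts in this commit do. A minimal sketch (the particular flag subset and values here are illustrative, mirroring the a3/llama_2_7b scripts):

```shell
# Sketch: combine a few boolean xla_gpu_* flags and one pass-disable list
# into XLA_FLAGS; XLA reads this variable at process startup.
export XLA_FLAGS="--xla_gpu_enable_pipelined_reduce_scatter=true \
--xla_gpu_enable_pipelined_all_reduce=true \
--xla_gpu_enable_while_loop_double_buffering=true \
--xla_disable_hlo_passes=rematerialization"
echo "$XLA_FLAGS"
```

Note that `--xla_disable_hlo_passes` takes a comma-separated list with no whitespace around commas, per the table above.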
2 changes: 1 addition & 1 deletion MaxText/configs/a3/llama_2_7b/16vm.sh
@@ -25,7 +25,7 @@ export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824 --xla_gpu_all_gather_combine_threshold_bytes=134217728
--xla_gpu_reduce_scatter_combine_threshold_bytes=134217728 --xla_gpu_enable_pipelined_all_gather=true
--xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
--xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_enable_triton_softmax_fusion=false
--xla_gpu_enable_while_loop_double_buffering=true
--xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_reduce_scatter_combine_by_dim=false
--xla_disable_hlo_passes=rematerialization"
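The combine-threshold byte counts that vary across these scripts are all powers of two; a quick check of the three values used in the a3 configs:

```python
# The xla_gpu_*_combine_threshold_bytes values used across the a3 scripts,
# expressed in binary units for readability.
thresholds = {
    "1 GiB": 1073741824,    # 2**30
    "128 MiB": 134217728,   # 128 * 2**20
    "64 MiB": 67108864,     # 64 * 2**20
}
for name, nbytes in thresholds.items():
    print(f"{name}: {nbytes} bytes = {nbytes / 2**20:g} MiB")
```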

2 changes: 1 addition & 1 deletion MaxText/configs/a3/llama_2_7b/1vm.sh
@@ -24,7 +24,7 @@ export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
--xla_gpu_all_reduce_combine_threshold_bytes=134217728 --xla_gpu_all_gather_combine_threshold_bytes=134217728
--xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true
--xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
--xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_enable_triton_softmax_fusion=false
--xla_gpu_enable_while_loop_double_buffering=true
--xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_reduce_scatter_combine_by_dim=false
--xla_disable_hlo_passes=rematerialization"

2 changes: 1 addition & 1 deletion MaxText/configs/a3/llama_2_7b/2vm.sh
@@ -25,7 +25,7 @@ export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
--xla_gpu_all_reduce_combine_threshold_bytes=67108864 --xla_gpu_all_gather_combine_threshold_bytes=134217728
--xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true
--xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
--xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_enable_triton_softmax_fusion=false
--xla_gpu_enable_while_loop_double_buffering=true
--xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_reduce_scatter_combine_by_dim=false
--xla_disable_hlo_passes=rematerialization"

2 changes: 1 addition & 1 deletion MaxText/configs/a3/llama_2_7b/4vm.sh
@@ -23,7 +23,7 @@ export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
--xla_gpu_all_reduce_combine_threshold_bytes=536870912 --xla_gpu_all_gather_combine_threshold_bytes=134217728
--xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true
--xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
--xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_enable_triton_softmax_fusion=false
--xla_gpu_enable_while_loop_double_buffering=true
--xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_reduce_scatter_combine_by_dim=false
--xla_disable_hlo_passes=rematerialization"

2 changes: 1 addition & 1 deletion MaxText/configs/a3/llama_2_7b/8vm.sh
@@ -24,7 +24,7 @@ export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824 --xla_gpu_all_gather_combine_threshold_bytes=134217728
--xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true
--xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
--xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_enable_triton_softmax_fusion=false
--xla_gpu_enable_while_loop_double_buffering=true
--xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_reduce_scatter_combine_by_dim=false
--xla_disable_hlo_passes=rematerialization"

8 changes: 4 additions & 4 deletions MaxText/configs/a3/llama_3.1_405b/128vm.sh
@@ -1,8 +1,8 @@
echo "Running 128vm.sh"
# Example command to invoke this script via XPK, assume you've installed xpk
# COMMAND="bash MaxText/configs/a3/llama_3.1_405b/128vm.sh"
# COMMAND='export LD_LIBRARY_PATH=/usr/local/cuda-12.6/compat:$LD_LIBRARY_PATH;'"${COMMAND}";
#
# xpk workload create --project=${PROJECT} --cluster=${CLUSTER_NAME} --zone=${ZONE} \
# --workload=${WORKLOAD_NAME} --docker-image=gcr.io/supercomputer-testing/${LOCAL_IMAGE_NAME} \
# --device-type=${DEVICE_TYPE} --num-nodes=2 --priority=high \
@@ -28,7 +28,7 @@ export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824 --xla_gpu_all_gather_combine_threshold_bytes=134217728
--xla_gpu_reduce_scatter_combine_threshold_bytes=134217728 --xla_gpu_enable_pipelined_all_gather=true
--xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
--xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_enable_triton_softmax_fusion=false
--xla_gpu_enable_while_loop_double_buffering=true
--xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_reduce_scatter_combine_by_dim=false
--xla_disable_hlo_passes=rematerialization"

@@ -52,5 +52,5 @@ python MaxText/$EXECUTABLE MaxText/configs/models/llama3.1_405b.yml run_name=$RU
dcn_fsdp_parallelism=128 \
ici_fsdp_parallelism=8 \
base_output_directory=$OUTPUT_PATH \
profiler=xplane

2 changes: 1 addition & 1 deletion end_to_end/gpu/a3/test_convergence_125m_params.sh
@@ -58,7 -58,7 @@ TRAIN_CMD="python MaxText/train.py MaxText/configs/base.yml run_name=$RUN_NAME h
TRAIN_CMD+=$CMD_DATA

# Train
export XLA_ARGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false --xla_gpu_graph_level=0 --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_all_reduce_combine_threshold_bytes=134217728 --xla_gpu_all_gather_combine_threshold_bytes=134217728 --xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_disable_hlo_passes=rematerialization"
export XLA_ARGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false --xla_gpu_graph_level=0 --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_all_reduce_combine_threshold_bytes=134217728 --xla_gpu_all_gather_combine_threshold_bytes=134217728 --xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_disable_hlo_passes=rematerialization"
$TRAIN_CMD

# Assert training loss is smaller than input LOSS_THRESHOLD
2 changes: 1 addition & 1 deletion end_to_end/gpu/a3/test_convergence_1b_params.sh
@@ -57,7 +57,7 @@ TRAIN_CMD="python MaxText/train.py MaxText/configs/base.yml run_name=$RUN_NAME h
TRAIN_CMD+=$CMD_DATA

# Train
export XLA_ARGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false --xla_gpu_graph_level=0 --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_all_reduce_combine_threshold_bytes=134217728 --xla_gpu_all_gather_combine_threshold_bytes=134217728 --xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_disable_hlo_passes=rematerialization"
export XLA_ARGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false --xla_gpu_graph_level=0 --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_all_reduce_combine_threshold_bytes=134217728 --xla_gpu_all_gather_combine_threshold_bytes=134217728 --xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_disable_hlo_passes=rematerialization"
$TRAIN_CMD

# Assert training loss is smaller than input LOSS_THRESHOLD
2 changes: 1 addition & 1 deletion end_to_end/gpu/a3/test_llama2_7b.sh
@@ -55,7 +55,7 @@ export XLA_FLAGS="--xla_dump_to=$BASE_OUTPUT_PATH/$RUN_NAME/HLO_dumps/
--xla_gpu_all_reduce_combine_threshold_bytes=134217728 --xla_gpu_all_gather_combine_threshold_bytes=134217728
--xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true
--xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
--xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_enable_triton_softmax_fusion=false
--xla_gpu_enable_while_loop_double_buffering=true
--xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_reduce_scatter_combine_by_dim=false
--xla_disable_hlo_passes=rematerialization"

