Skip to content

Commit b04b58b

Browse files
[XLA:benchmarks] Add a workflow file for presubmit benchmarks and util shell script files
PiperOrigin-RevId: 758353414
1 parent 7ad71ef commit b04b58b

File tree

4 files changed

+518
-0
lines changed

4 files changed

+518
-0
lines changed
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
#!/bin/bash
# Copyright 2025 The OpenXLA Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# .github/workflows/benchmarks/build_binaries.sh
# TODO(juliagmt): convert this to a python script.
#
# Configures the backend and builds the HLO runner and XSpace stats binaries
# with Bazel, then publishes their paths as step outputs.
#
# Environment read:
#   HARDWARE_CATEGORY  must start with "CPU" or "GPU"
#   GITHUB_OUTPUT      file that receives runner_binary, stats_binary and
#                      device_type_flag
#
# NOTE: the shebang has to be the very first line of the file to have any
# effect; placed after the license block it is just another comment.
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error when substituting.
# set -o pipefail # Causes pipelines to fail if any command fails (see Run script)
21+
22+
# Announce the target hardware, then run the configure step that matches it.
printf '%s\n' "--- Configuring and Building Binaries ---"
printf '%s\n' "Building binaries for $HARDWARE_CATEGORY..."

# --- Configure ---
printf '%s\n' "Configuring backend..."
case "$HARDWARE_CATEGORY" in
  CPU*)
    # A configure failure is reported but deliberately does not stop the script.
    if ! ./configure.py --backend=CPU; then
      printf '%s\n' "INFO: CPU Configure script failed or is not applicable."
    fi
    ;;
  GPU*)
    if ! ./configure.py --backend=CUDA --cuda_compiler=nvcc; then
      printf '%s\n' "INFO: GPU Configure script failed or is not applicable."
    fi
    ;;
  *)
    printf '%s\n' "::error::Unsupported hardware category for configuration: $HARDWARE_CATEGORY"
    exit 1
    ;;
esac
printf '%s\n' "Configuration step finished."
36+
37+
# --- Determine Paths and Build ---
# Selects the Bazel targets and expected output paths for the requested
# hardware category, runs the build, and reports a failure as a GitHub
# "::error::" annotation before exiting with Bazel's own exit code.
declare BAZEL_BIN_DIR="bazel-bin"
declare runner_binary_path=""
declare stats_binary_path=""
declare device_type_flag_value=""
declare bazel_exit_code=0

# TODO(juliagmt): use build.py to build binaries.
if [[ "$HARDWARE_CATEGORY" == CPU* ]]; then
  runner_binary_path="./$BAZEL_BIN_DIR/xla/tools/multihost_hlo_runner/hlo_runner_main"
  stats_binary_path="./$BAZEL_BIN_DIR/xla/tools/compute_xspace_stats_main"
  device_type_flag_value="host"

  echo "Building CPU binaries with RBE..."
  # `|| bazel_exit_code=$?` keeps `set -e` from aborting the script before the
  # failure can be reported below; a bare `bazel_exit_code=$?` on the next
  # line would never run when the build fails.
  bazel build \
    --build_tag_filters=-no_oss,-gpu,-requires-gpu-nvidia,-requires-gpu-amd \
    --test_tag_filters=-no_oss,-gpu,-requires-gpu-nvidia,-requires-gpu-amd \
    --config=warnings \
    --config=nonccl \
    --config=rbe_linux_cpu \
    --color=yes \
    --test_output=errors \
    --verbose_failures \
    --keep_going \
    --nobuild_tests_only \
    --profile=profile.json.gz \
    --flaky_test_attempts=3 \
    --jobs=150 \
    --bes_upload_mode=fully_async \
    //xla/tools/multihost_hlo_runner:hlo_runner_main \
    //xla/tools:compute_xspace_stats_main \
    || bazel_exit_code=$?

elif [[ "$HARDWARE_CATEGORY" == GPU* ]]; then
  runner_binary_path="./$BAZEL_BIN_DIR/xla/tools/multihost_hlo_runner/hlo_runner_main_gpu"
  stats_binary_path="./$BAZEL_BIN_DIR/xla/tools/compute_xspace_stats_main_gpu"
  device_type_flag_value="gpu"

  echo "Building GPU binaries with RBE..."
  bazel build \
    --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only \
    --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd \
    --config=warnings --config=rbe_linux_cuda_nvcc \
    --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 \
    --run_under=//build_tools/ci:parallel_gpu_execute \
    --@cuda_driver//:enable_forward_compatibility=false --color=yes \
    --test_output=errors --verbose_failures --keep_going --nobuild_tests_only \
    --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 \
    --bes_upload_mode=fully_async \
    -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu \
    || bazel_exit_code=$?
else
  echo "::error::Unsupported hardware category for building binaries: $HARDWARE_CATEGORY"
  exit 1
fi

# Check build result
if [[ "$bazel_exit_code" -ne 0 ]]; then
  echo "::error::Bazel build failed with exit code $bazel_exit_code!"
  exit "$bazel_exit_code"
fi
echo "Bazel build completed successfully."
98+
99+
# --- Verify and Output ---
# Confirm both built binaries exist, then publish their paths and the device
# type flag through the step-output file named by $GITHUB_OUTPUT.
echo "Verifying binary existence..."
if [[ ! -f "$runner_binary_path" ]]; then
  echo "::error::Runner binary '$runner_binary_path' not found after build!"
  exit 1
fi
if [[ ! -f "$stats_binary_path" ]]; then
  echo "::error::Stats binary '$stats_binary_path' not found after build!"
  exit 1
fi
echo "Binaries verified."

echo "Setting step outputs..."
{
  printf '%s\n' "runner_binary=$runner_binary_path"
  printf '%s\n' "stats_binary=$stats_binary_path"
  printf '%s\n' "device_type_flag=$device_type_flag_value"
} >> "$GITHUB_OUTPUT"

# Echo the same values to the log for easier debugging.
printf '%s\n' \
  " runner_binary=$runner_binary_path" \
  " stats_binary=$stats_binary_path" \
  " device_type_flag=$device_type_flag_value"
echo "--- Build Script Finished ---"
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#!/bin/bash
# Copyright 2025 The OpenXLA Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# .github/workflows/benchmarks/prepare_artifact.sh
# TODO(juliagmt): convert this to a python script.
#
# Places the benchmark artifact into $OUTPUT_DIR, either by downloading it
# with wget or by copying it out of the checked-out repository, and publishes
# the resulting path as the `artifact_local_path` step output.
#
# Environment read:
#   GITHUB_WORKSPACE   root used to resolve a repository-relative artifact
#   OUTPUT_DIR         destination directory (created if missing)
#   ARTIFACT_LOCATION  download URL, or path relative to GITHUB_WORKSPACE
#   IS_GCS_ARTIFACT    "true" selects the wget download path
#   GITHUB_OUTPUT      file that receives the step output
#
# NOTE: the shebang has to be the very first line of the file to have any
# effect; placed after the license block it is just another comment.
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error when substituting.
20+
21+
# Log the execution context, create $OUTPUT_DIR, and double-check that the
# directory is really there before anything is written into it.
printf '%s\n' "--- prepare_artifact.sh (Self-creating directory version) ---"
printf '%s\n' "SCRIPT: Current PWD: $(pwd)"
printf '%s\n' "SCRIPT: GITHUB_WORKSPACE is: $GITHUB_WORKSPACE"
printf '%s\n' "SCRIPT: Intended OUTPUT_DIR is: $OUTPUT_DIR"

# Create the directory inside this script, immediately before it is used.
printf '%s\n' "SCRIPT: Ensuring directory '$OUTPUT_DIR' exists by creating it with mkdir -p."
mkdir -p "$OUTPUT_DIR"

# Show the directory entry right away; a listing failure is only logged.
printf '%s\n' "SCRIPT: Verifying directory '$OUTPUT_DIR' after mkdir with 'ls -ld':"
if ! ls -ld "$OUTPUT_DIR"; then
  printf '%s\n' "SCRIPT: 'ls -ld $OUTPUT_DIR' FAILED even after mkdir in script!"
fi

# Final check with a directory test; dump the parent listing if it fails.
if [[ -d "$OUTPUT_DIR" ]]; then
  printf '%s\n' "SCRIPT: Output directory '$OUTPUT_DIR' IS now found with [ -d ... ]."
else
  printf '%s\n' "::error::SCRIPT: Output directory '$OUTPUT_DIR' STILL NOT found with [ -d ... ] even after mkdir in this script."
  printf '%s\n' "SCRIPT: Listing parent directory '$(dirname "$OUTPUT_DIR")' using 'ls -la':"
  if ! ls -la "$(dirname "$OUTPUT_DIR")"; then
    printf '%s\n' "SCRIPT: Failed to list parent directory."
  fi
  exit 1
fi
43+
44+
# --- Original script logic from here ---
# Materialises the artifact at $OUTPUT_DIR/<basename of ARTIFACT_LOCATION>:
# downloaded with wget when IS_GCS_ARTIFACT is "true", otherwise copied from
# $GITHUB_WORKSPACE/$ARTIFACT_LOCATION.
echo "--- Preparing Artifact (main logic) ---"

ARTIFACT_FILE_NAME=$(basename "$ARTIFACT_LOCATION")
LOCAL_ARTIFACT_PATH="$OUTPUT_DIR/$ARTIFACT_FILE_NAME"

echo "Target local path: ${LOCAL_ARTIFACT_PATH}"

if [[ "$IS_GCS_ARTIFACT" == "true" ]]; then
  echo "Downloading GCS artifact from: $ARTIFACT_LOCATION"
  if ! command -v wget &> /dev/null; then
    echo "::error::wget command not found in container. Cannot download GCS artifact."
    exit 1
  fi

  # Capture the status with `||` so that `set -e` does not abort the script
  # before the error annotation and the partial-file cleanup below can run.
  WGET_EXIT_CODE=0
  wget -q -nv -O "$LOCAL_ARTIFACT_PATH" "$ARTIFACT_LOCATION" || WGET_EXIT_CODE=$?
  if [[ "$WGET_EXIT_CODE" -ne 0 ]]; then
    echo "::error::wget failed to download GCS artifact from $ARTIFACT_LOCATION (Exit code: $WGET_EXIT_CODE)"
    rm -f "$LOCAL_ARTIFACT_PATH" # Clean up partial file
    exit "$WGET_EXIT_CODE"
  fi
  echo "GCS artifact downloaded."
else
  # ARTIFACT_LOCATION is the relative repo path here
  REPO_ARTIFACT_PATH="$GITHUB_WORKSPACE/$ARTIFACT_LOCATION"
  echo "Copying local artifact from workspace path: $REPO_ARTIFACT_PATH (IS_GCS_ARTIFACT was false)"
  if [[ ! -f "$REPO_ARTIFACT_PATH" ]]; then
    echo "::error::Local artifact not found at repository path: $REPO_ARTIFACT_PATH"
    exit 1
  fi
  cp -v "$REPO_ARTIFACT_PATH" "$LOCAL_ARTIFACT_PATH" || exit 1 # Exit if copy fails
  echo "Local artifact copied successfully."
fi
77+
78+
# Make sure the artifact really landed at its destination, then expose the
# path to later workflow steps through the step-output file.
if [[ -f "$LOCAL_ARTIFACT_PATH" ]]; then
  printf '%s\n' "Artifact successfully prepared at $LOCAL_ARTIFACT_PATH."
else
  printf '%s\n' "::error::Final artifact file not found at destination: $LOCAL_ARTIFACT_PATH"
  exit 1
fi

printf '%s\n' "artifact_local_path=$LOCAL_ARTIFACT_PATH" >> "$GITHUB_OUTPUT"
printf '%s\n' "--- Artifact Prep Finished ---"
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
#!/bin/bash
# Copyright 2025 The OpenXLA Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# .github/workflows/benchmarks/run_benchmark.sh
# NOTE(review): this path comment previously named prepare_artifact.sh, which
# looks like a copy-paste slip; confirm the file name above.
# TODO(juliagmt): convert this to a python script.
#
# Runs the HLO runner on the prepared artifact, then either computes stats
# from the dumped XSpace file or writes a fallback results JSON describing
# why stats are unavailable.
#
# NOTE: the shebang has to be the very first line of the file to have any
# effect; placed after the license block it is just another comment.
set -u # Treat unset variables as an error when substituting.
# IMPORTANT: pipefail is handled specifically around the runner command.
set -e # Exit on errors, EXCEPT where explicitly handled.
21+
22+
printf '%s\n' "--- Running Benchmark ---"

# Environment consumed by this script.
#   Provided by the step:
#     RUNNER_BINARY, STATS_BINARY, DEVICE_TYPE_FLAG, LOCAL_ARTIFACT_PATH
#   Provided by the job:
#     BENCHMARK_NAME, CONFIG_ID, HARDWARE_CATEGORY, OUTPUT_DIR,
#     XLA_FLAGS_JSON, RUNTIME_FLAGS_JSON,
#     COMMIT_SHA, WORKFLOW_RUN_ID

# --- Validate Inputs ---
# Each check emits a GitHub error annotation and stops the script on failure.
if [[ -z "$LOCAL_ARTIFACT_PATH" || ! -f "$LOCAL_ARTIFACT_PATH" ]]; then
  printf '%s\n' "::error::LOCAL_ARTIFACT_PATH path is invalid or file not found: '$LOCAL_ARTIFACT_PATH'"
  exit 1
fi
if [[ -z "$RUNNER_BINARY" || ! -x "$RUNNER_BINARY" ]]; then
  printf '%s\n' "::error::RUNNER_BINARY path is invalid or file not executable: '$RUNNER_BINARY'"
  exit 1
fi
if [[ -z "$DEVICE_TYPE_FLAG" ]]; then
  printf '%s\n' "::error::DEVICE_TYPE_FLAG is empty"
  exit 1
fi
if [[ -z "$STATS_BINARY" || ! -x "$STATS_BINARY" ]]; then
  printf '%s\n' "::error::STATS_BINARY path is invalid or file not executable: '$STATS_BINARY'"
  exit 1
fi
if ! command -v jq > /dev/null 2>&1; then
  printf '%s\n' "::error::jq command not found."
  exit 1
fi

# Files produced by this script, all under $OUTPUT_DIR.
RUNNER_STDOUT_FILE="$OUTPUT_DIR/runner_stdout.txt"
XSPACE_FILE_PATH="$OUTPUT_DIR/xspace.pb"
RESULTS_JSON_FILE="$OUTPUT_DIR/results.json"
41+
42+
# --- Prepare flags ---
# Turns the JSON arrays in XLA_FLAGS_JSON and RUNTIME_FLAGS_JSON into bash
# arrays (one element per JSON array entry) and makes sure profiling is
# switched on so that stats can be generated afterwards.
declare -a xla_flags_array=()
declare -a runtime_flags_array=()

# Use jq to safely parse JSON and populate bash arrays. The `jq -e` guard only
# succeeds for a non-empty JSON array; anything else leaves the array empty.
# Here-strings avoid `echo` reinterpreting values that look like echo options.
if jq -e '. | arrays and length > 0' <<< "$XLA_FLAGS_JSON" > /dev/null; then
  mapfile -t xla_flags_array < <(jq -r '.[]' <<< "$XLA_FLAGS_JSON")
fi
if jq -e '. | arrays and length > 0' <<< "$RUNTIME_FLAGS_JSON" > /dev/null; then
  mapfile -t runtime_flags_array < <(jq -r '.[]' <<< "$RUNTIME_FLAGS_JSON")
fi

# Conditionally add profile flag if needed for stats
needs_profile_flag=true
# Only iterate when the array has elements: under `set -u`, expanding an empty
# array with "${arr[@]}" is an "unbound variable" error on bash older than 4.4.
if [[ ${#runtime_flags_array[@]} -gt 0 ]]; then
  for flag in "${runtime_flags_array[@]}"; do
    if [[ "$flag" == "--profile_execution"* ]]; then
      needs_profile_flag=false
      break
    fi
  done
fi
needs_xspace_dump_flag=true # Assume we always want stats if possible
if [[ "$needs_profile_flag" == true && "$needs_xspace_dump_flag" == true ]]; then
  runtime_flags_array+=("--profile_execution=True")
  echo "INFO: Added --profile_execution=True for stats generation."
fi
66+
67+
# --- Build Runner Command ---
# Argument order: binary, device type, runtime flags, XLA flags, optional
# XSpace dump flag, and finally the artifact to run.
declare -a runner_command_array=("$RUNNER_BINARY" "--device_type=$DEVICE_TYPE_FLAG")
if (( ${#runtime_flags_array[@]} > 0 )); then
  runner_command_array+=("${runtime_flags_array[@]}")
fi
if (( ${#xla_flags_array[@]} > 0 )); then
  runner_command_array+=("${xla_flags_array[@]}")
fi
if [[ "$needs_xspace_dump_flag" == true ]]; then
  runner_command_array+=("--xla_gpu_dump_xspace_to=$XSPACE_FILE_PATH")
fi
runner_command_array+=("$LOCAL_ARTIFACT_PATH")

# --- Execute Runner ---
printf '%s\n' "Executing HLO Runner command:"
# Log the command shell-quoted so it can be copy-pasted.
printf "%q " "${runner_command_array[@]}"
printf '\n'

# Run with errexit off so a runner failure can be recorded instead of aborting,
# and with pipefail on so tee cannot mask the runner's exit status.
set +e -o pipefail
"${runner_command_array[@]}" 2>&1 | tee "$RUNNER_STDOUT_FILE"
RUNNER_EXIT_CODE=${PIPESTATUS[0]}
set +o pipefail -e

printf '%s\n' "Runner stdout/stderr saved to $RUNNER_STDOUT_FILE"
printf '%s\n' "Runner exited with code: $RUNNER_EXIT_CODE"
89+
90+
# --- Execute Stats or Generate Fallback JSON ---

#######################################
# Writes a minimal results JSON describing the run outcome.
# Globals:   BENCHMARK_NAME, CONFIG_ID, HARDWARE_CATEGORY, COMMIT_SHA,
#            WORKFLOW_RUN_ID, RESULTS_JSON_FILE (all read)
# Arguments: $1 - run status string
#            $2 - error message
# Returns:   jq's exit status
#######################################
write_status_json() {
  jq -n \
    --arg bn "$BENCHMARK_NAME" --arg cid "$CONFIG_ID" --arg hc "$HARDWARE_CATEGORY" \
    --arg rs "$1" --arg em "$2" \
    --arg cs "$COMMIT_SHA" --arg wrid "$WORKFLOW_RUN_ID" \
    '{ benchmark_name: $bn, config_id: $cid, hardware_category: $hc, run_status: $rs, error_message: $em, commit_sha: $cs, workflow_run_id: $wrid }' \
    > "$RESULTS_JSON_FILE"
}

STATS_EXIT_CODE=0
if [[ -f "$XSPACE_FILE_PATH" && "$RUNNER_EXIT_CODE" -eq 0 ]]; then
  echo "Running compute_xspace_stats_main..."
  if [[ "$HARDWARE_CATEGORY" == GPU* ]]; then
    STATS_PLATFORM_TYPE="GPU"
  else
    STATS_PLATFORM_TYPE="CPU"
  fi
  declare -a stats_command_array=("$STATS_BINARY" "--input=$XSPACE_FILE_PATH" "--device_type=$STATS_PLATFORM_TYPE" "--output_json=$RESULTS_JSON_FILE")

  echo "Executing Stats command:"
  printf "%q " "${stats_command_array[@]}"
  echo

  # Append stats stdout to the runner log; `||` captures the status without
  # letting `set -e` abort the script.
  "${stats_command_array[@]}" >> "$RUNNER_STDOUT_FILE" || STATS_EXIT_CODE=$?

  if [[ "$STATS_EXIT_CODE" -ne 0 ]]; then
    echo "::warning::compute_xspace_stats_main failed with code $STATS_EXIT_CODE."
    # Fallback to creating JSON with run status and error message for stats failure
    if ! write_status_json "STATS_FAILURE" \
      "compute_xspace_stats_main failed with code $STATS_EXIT_CODE. Runner was successful."; then
      echo "::error::FATAL: Failed to create basic results JSON using jq."
      exit 1
    fi
    echo "Fallback results JSON created at $RESULTS_JSON_FILE due to stats failure."
  else
    echo "Stats computed and saved to $RESULTS_JSON_FILE"
  fi
else
  # Create fallback JSON if Runner failed OR if Runner succeeded but produced no XSpace file
  if [[ "$RUNNER_EXIT_CODE" -ne 0 ]]; then
    echo "::warning::Runner failed (Exit Code: $RUNNER_EXIT_CODE), skipping stats."
    RUN_STATUS="FAILURE"
    ERROR_MSG="Runner failed with code $RUNNER_EXIT_CODE"
  else
    echo "::warning::XSpace file missing at $XSPACE_FILE_PATH, skipping stats."
    RUN_STATUS="SUCCESS_NO_PROFILE"
    ERROR_MSG="XSpace file not generated by successful run."
  fi

  # Test jq directly: under `set -e` a separate `[ $? -eq 0 ]` check is never
  # reached on failure, which would make the safety net below dead code.
  if write_status_json "$RUN_STATUS" "$ERROR_MSG"; then
    echo "Basic results JSON created at $RESULTS_JSON_FILE."
  else
    # Should not happen if jq is present, but a safety-net
    echo "::error::FATAL: Failed to create basic results JSON using jq."
    echo "Fallback error: Benchmark Name: $BENCHMARK_NAME, Run Status: $RUN_STATUS, Error: $ERROR_MSG" > "$RESULTS_JSON_FILE.txt"
    exit 1 # Make sure this failure is noted
  fi
fi
145+
146+
# --- Final Exit Status ---
# A failed runner makes the whole script fail with the runner's own exit code;
# otherwise report success.
if [[ "$RUNNER_EXIT_CODE" -ne 0 ]]; then
  printf '%s\n' "::error::Benchmark run failed (Runner Exit Code: $RUNNER_EXIT_CODE)."
  exit "$RUNNER_EXIT_CODE" # Propagate the runner's failure code
fi

printf '%s\n' "--- Run Benchmark Script Finished Successfully ---"

0 commit comments

Comments
 (0)