Skip to content

Commit 8de5059

Browse files
Merge pull request #2571 from AI-Hypercomputer:hs2
PiperOrigin-RevId: 834334206
2 parents 32380ea + 84ef1b6 commit 8de5059

File tree

3 files changed

+21
-4
lines changed

3 files changed

+21
-4
lines changed

benchmarks/maxtest/getting_started.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,21 @@ EXIT_CODE=0
4444

4545
- maxtest.sh generates a YAML file (`maxtest.yaml`) in the current directory and passes it to kubectl. This file can be modified and reused by running `kubectl apply -f maxtest.yaml`
4646

47+
### Passing custom libtpu or XLA flags ###
48+
49+
Custom libtpu or XLA flags can be passed to the job by specifying
50+
`--libtpu_args`, followed by the flags as a single quoted string.
51+
52+
53+
#### Setting flags for SDC checking ####
54+
55+
This is useful for checking for silent data corruption (SDC) on TPU hardware.
56+
57+
```
58+
bash maxtest.sh --project $TPU_PROJECT --cluster $CLUSTER --region $REGION --nodepool $NODEPOOL_NAME --num_workers $NUM_WORKERS --libtpu_args '--xla_tpu_enable_sdc_checker'
59+
```
60+
61+
4762
### Debugging common job errors ###
4863

4964
If the job does not exit with `EXIT_CODE=0`, there is a failure among one of

benchmarks/maxtest/maxtest.sh

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/bin/bash
1+
#!/bin/bash
22

33
function usage() {
44
echo "error: $1"
@@ -15,6 +15,7 @@ while [[ "$#" > 0 ]]; do case $1 in
1515
-r|--region) GKE_REGION="$2";shift;shift;;
1616
--nodepool) NODEPOOL="$2";shift;shift;;
1717
--num_workers) NUM_WORKERS="$2";shift;shift;;
18+
--libtpu_args) LIBTPU_ARGS="$2";shift;shift;;
1819
*) usage "Unknown parameter passed: $1"; shift; shift;;
1920
esac; done
2021

@@ -32,19 +33,20 @@ if [ -z "$TPU_ACCELERATOR" ]; then exit; fi;
3233

3334
UUID=$(uuidgen)
3435
export JOB_NAME="${UUID:0:5}-maxtest"
35-
export DOCKER_IMAGE="gcr.io/cloud-tpu-images-public/tpu/healthscan"
36+
export DOCKER_IMAGE="us-docker.pkg.dev/cloud-tpu-images-public/tpu/healthscan:latest"
3637
export NODEPOOL
3738
export TPU_TOPOLOGY
3839
export TPU_ACCELERATOR
3940
export GKE_PROJECT
4041
export GKE_REGION
4142
export GKE_CLUSTER
43+
export LIBTPU_ARGS
4244

4345
export MEMORY_PER_HOST="407Gi"
4446
export TPU_CHIPS_PER_HOST=4
4547
export COMPLETIONS=$NUM_WORKERS # Number of VMs in the nodepool (v6e -> 2 VMs for v6e-8, v5p -> 1 VM for a v5p-8)
4648

47-
YAML_VARS='$JOB_NAME $DOCKER_IMAGE $NODEPOOL $TPU_TOPOLOGY $TPU_ACCELERATOR $COMPLETIONS $MEMORY_PER_HOST $TPU_CHIPS_PER_HOST $GKE_PROJECT $GKE_REGION $GKE_CLUSTER'
49+
YAML_VARS='$JOB_NAME $DOCKER_IMAGE $NODEPOOL $TPU_TOPOLOGY $TPU_ACCELERATOR $COMPLETIONS $MEMORY_PER_HOST $TPU_CHIPS_PER_HOST $GKE_PROJECT $GKE_REGION $GKE_CLUSTER $LIBTPU_ARGS'
4850

4951
envsubst "${YAML_VARS}" < maxtest.yaml.template > maxtest.yaml
5052

benchmarks/maxtest/maxtest.yaml.template

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ spec:
4242
_sigterm() (kill -SIGTERM $! 2>/dev/null;);
4343
trap _sigterm SIGTERM;
4444

45-
(export TPU_STDERR_LOG_LEVEL=0 && export TPU_MIN_LOG_LEVEL=0 && export TF_CPP_MIN_LOG_LEVEL=0 && python3 -m benchmarks.benchmark_runner healthscan --device_type=$TPU_ACCELERATOR_TYPE --base_output_directory=gke-healthscan-output --num_steps=5) & PID=$1;
45+
(export TPU_STDERR_LOG_LEVEL=0 && export TPU_MIN_LOG_LEVEL=0 && export TF_CPP_MIN_LOG_LEVEL=0 && echo LIBTPU_INIT_ARGS='$LIBTPU_ARGS' && export LIBTPU_INIT_ARGS='$LIBTPU_ARGS' && python3 -m benchmarks.benchmark_runner healthscan --device_type=$TPU_ACCELERATOR_TYPE --base_output_directory=gke-healthscan-output --num_steps=5) & PID=$!;
4646

4747
while kill -0 $PID 2>/dev/null;
4848
do sleep 5;

0 commit comments

Comments
 (0)