|
# Reusable workflow (invoked through `workflow_call`) that runs a MaxText
# container as a Kubernetes job from a runner labelled `eks`.
#
# Flow: log in to the container registries, store the resulting Docker
# credentials as a Kubernetes image-pull secret, patch the job manifest with
# per-run names, submit it, wait for it to be un-suspended, stream its logs,
# turn the job's failed/succeeded counters into the step's exit code, and
# finally clean up the job and the secret.
name: ~test MaxText functionality on Kubernetes

on:
  workflow_call:
    inputs:
      MAXTEXT_IMAGE:
        type: string
        description: MaxText container to test
        required: true

permissions:
  contents: read # to fetch code

jobs:
  maxtext:
    runs-on: eks
    env:
      CONTAINER_IMAGE: "${{ inputs.MAXTEXT_IMAGE }}"
      # Unique per workflow run *and* per re-run attempt, so names never collide.
      JOB_NAME: "maxtext-${{ github.run_id }}-${{ github.run_attempt }}"
    steps:
      - name: Check out the repository
        uses: actions/checkout@v4
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Login to NVIDIA Container Registry
        uses: docker/login-action@v3
        with:
          registry: nvcr.io
          username: $oauthtoken
          password: ${{ secrets.NVCR_TOKEN }}
      - name: Store GitHub Container Registry token as Kubernetes secret
        run: |
          # Make this available to later steps
          TOKEN_NAME="${JOB_NAME}-token"
          echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV"
          # The Docker config written by the login steps above becomes the
          # image-pull secret referenced by the job manifest.
          kubectl create secret generic \
            ${TOKEN_NAME} \
            --from-file=.dockerconfigjson=$HOME/.docker/config.json \
            --type=kubernetes.io/dockerconfigjson
      - name: Configure Kubernetes job
        run: |
          export SERVICE_NAME="${JOB_NAME}-svc"
          # The manifest holds two YAML documents: document 0 (presumably the
          # Service -- confirm against maxtext-job.yaml) and document 1 (the job).
          # Every per-run name is patched in place with values from the environment.
          yq -i ea 'select(di == 0).metadata.name = strenv(SERVICE_NAME)
            | select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
            | select(di == 1).metadata.name = strenv(JOB_NAME)
            | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME)
            | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
            | select(di == 1).spec.template.spec.containers[0].image = strenv(CONTAINER_IMAGE)
            | select(di == 1).spec.template.spec.containers[0].command[3] = strenv(SERVICE_NAME)
            | select(di == 1).spec.template.spec.containers[0].command[4] = strenv(JOB_NAME)
            | select(di == 1).spec.template.spec.containers[1].command[3] = strenv(JOB_NAME)' \
            .github/eks-workflow-files/maxtext-job.yaml
          # Show the effective changes in the workflow log.
          git diff .github/eks-workflow-files/maxtext-job.yaml
      - name: Submit Kubernetes job
        run: kubectl apply -f .github/eks-workflow-files/maxtext-job.yaml
      - name: Wait for Kubernetes job to start
        run: |
          # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
          # resources are available, but that is where there can be a long wait if the
          # cluster is busy executing other jobs.
          kubectl wait --for=create job/${JOB_NAME}
          kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${JOB_NAME} --timeout=3600s
      - name: Stream Kubernetes job output
        run: |
          # Streaming logs will fail if the container/pod is still pending
          while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
            sleep 1
          done
          kubectl logs --all-containers=true --all-pods=true --follow job/${JOB_NAME}
      - name: Retrieve Kubernetes job status
        shell: bash -exo pipefail {0}
        run: |
          # Poll the job's failed/succeeded counters until they add up to 2, the
          # value this step treats as "all pods finished" (presumably the number
          # of pods defined in maxtext-job.yaml -- confirm). Unset counters are
          # printed as empty strings by jsonpath, hence the :-0 defaults.
          while readarray -d : -t status < <(kubectl get job/${JOB_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
            failure=${status[0]:-0}
            success=${status[1]:-0}
            total=$((failure+success))
            # Use arithmetic operators: inside [[ ]], `<` and `==` compare
            # strings, so e.g. "10" would sort before "2" and loop forever.
            if [[ ${total} -lt 2 ]]; then
              sleep 1
            elif [[ ${total} -eq 2 ]]; then
              break
            else
              # More finished pods than expected: report it instead of failing silently.
              echo "Unexpected job status: failed=${failure} succeeded=${success}" >&2
              exit 255
            fi
          done
          # A non-zero number of failed pods fails the step.
          exit ${failure}
      # Provide more debug output in case of failure; note that some kinds of launch
      # failure do not produce any log output.
      - name: Debug failed Kubernetes job
        if: failure()
        run: |
          # Provide better debug in case of launch failures that will not produce log output
          pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} -o name)
          if [[ -n "${pods}" ]]; then
            # Intentionally unquoted: one argument per pod name.
            kubectl describe ${pods}
          fi
      # Clean up in case of errors as well as success
      - name: Delete Kubernetes job
        if: always()
        # --ignore-not-found: this also runs when an earlier step failed before
        # the manifest was applied, in which case there is nothing to delete.
        run: kubectl delete --ignore-not-found -f .github/eks-workflow-files/maxtext-job.yaml
      - name: Delete GitHub Container Registry token
        if: always()
        run: |
          # TOKEN_NAME is only exported once the secret-creation step has run;
          # skip the deletion if the workflow failed before that point.
          if [[ -n "${TOKEN_NAME:-}" ]]; then
            kubectl delete secret --ignore-not-found "${TOKEN_NAME}"
          fi
0 commit comments