Merge pull request #708 from Azure/cc_slurm_nhc_autoscaling_bug
Fix Cc slurm nhc autoscaling bug
garvct authored Feb 20, 2023
2 parents 90c1319 + 9bdbdd1 commit 4564233
Showing 3 changed files with 15 additions and 4 deletions.
@@ -19,6 +19,7 @@ SLURM_HEALTH_CHECK_NODE_STATE=IDLE
NHC_PROLOG=1
NHC_EPILOG=0
AUTOSCALING=0
+PROLOG_NOHOLD_REQUEUE=0
NHC_EXTRA_TEST_FILES="csc_nvidia_smi.nhc azure_cuda_bandwidth.nhc azure_gpu_app_clocks.nhc azure_gpu_ecc.nhc azure_gpu_persistence.nhc azure_ib_write_bw_gdr.nhc azure_nccl_allreduce_ib_loopback.nhc azure_ib_link_flapping.nhc azure_gpu_clock_throttling.nhc azure_cpu_drop_cache_mem.nhc azure_gpu_xid.nhc azure_nccl_allreduce.nhc azure_raid_health.nhc"

source $CYCLECLOUD_SPEC_PATH/files/common_functions.sh
@@ -97,7 +98,9 @@ function update_slurm_prolog_epilog() {
echo '#!/bin/bash' > /sched/scripts/prolog.sh
chmod +x /sched/scripts/prolog.sh
echo "Prolog=/sched/scripts/prolog.sh" >> $SLURM_CONF
-        echo "PrologFlags=Alloc" >> $SLURM_CONF
+        if [[ $AUTOSCALING == 0 ]]; then
+            echo "PrologFlags=Alloc" >> $SLURM_CONF
+        fi
elif [[ $prolog_epilog == "epilog" ]]; then
echo '#!/bin/bash' > /sched/scripts/epilog.sh
echo 'TIMESTAMP=$(/bin/date "+%Y%m%d %H:%M:%S")' >> /sched/scripts/epilog.sh
@@ -123,6 +126,9 @@ function slurm_config() {
if [[ $NHC_PROLOG == 1 ]]; then
if [[ $AUTOSCALING == 1 ]]; then
update_slurm_prolog_epilog prolog wait_for_nhc.sh
+            if [[ $PROLOG_NOHOLD_REQUEUE == 1 ]]; then
+                sed -i 's/SchedulerParameter.*$/&,nohold_on_prolog_fail/' $SLURM_CONF
+            fi
else
update_slurm_prolog_epilog prolog kill_nhc.sh
fi
@@ -16,4 +16,9 @@ done

TIMESTAMP=$(/bin/date '+%Y%m%d %H:%M:%S')
echo "${TIMESTAMP} [prolog] NHC processes finished and job can start" >> /var/log/nhc.log
-exit 0
+
+if [ -f /var/run/nhc/nhc.status ]; then
+    exit 1
+else
+    exit 0
+fi
4 changes: 2 additions & 2 deletions experimental/cc_slurm_nhc/readme.md
@@ -67,11 +67,11 @@ You just add your custom health check to /etc/nhc/scripts and modify your nhc.co
## Kill NHC via SLURM Prolog
To prevent NHC from running while a job is running, we provide a script to kill NHC processes (kill_nhc.sh). You can run this script before a job starts via the SLURM Prolog: set NHC_PROLOG=1 in the configure_nhc.sh script to enable this prolog (the default), or set it to 0 to disable it.

->Note: If you have autoscaling enabled, then set AUTOSCALING=1 in the configure_nhc.sh script, this will replace kill_nhc.sh with wait_for_nhc.sh in the prolog.sh (To allow the NHC checks to complete (by waiting) when a node is autoscaled before starting your job)
+>Note: If you have autoscaling enabled, set AUTOSCALING=1 in the configure_nhc.sh script. This replaces kill_nhc.sh with wait_for_nhc.sh in prolog.sh, allowing the NHC checks to complete (by waiting) on an autoscaled node before your job starts. When AUTOSCALING=1 there is an additional prolog option: if PROLOG_NOHOLD_REQUEUE=1 and NHC fails, the Slurm job is requeued without a hold (i.e. Slurm will attempt to allocate new nodes for the job); the default behavior is to requeue with a hold.

## Run NHC via SLURM Epilog
-If you need to run NHC checks after a job completes (SLURM Epilog), then set NHC_EPILOG=1 in the configure_nhc.sh script.
+If you need to run NHC checks after a job completes (SLURM Epilog), set NHC_EPILOG=1 in the configure_nhc.sh script. NHC will run via the Epilog (after the job) only if the job is exclusive (i.e. no other jobs are running on the node).

>Note: If you run NHC via Epilog, then set HealthCheckInterval to a large value so it effectively only runs when a new node is provisioned in the cluster.
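The note above recommends a large HealthCheckInterval when NHC runs via the Epilog. As an illustrative slurm.conf fragment (the interval value and NHC path are assumptions, not taken from this commit):

```
# Large interval so periodic NHC effectively runs only on newly
# provisioned nodes; rely on the Prolog/Epilog hooks otherwise.
HealthCheckProgram=/usr/sbin/nhc
HealthCheckInterval=604800
HealthCheckNodeState=IDLE
```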
