@@ -186,12 +186,45 @@ create_aks_cluster() {
186186 export USER_IDENTITY
187187
188188 echo " assigning user-assigned managed identity to the AKS cluster"
189- az aks update --resource-group " ${AKS_RESOURCE_GROUP} " \
189+
190+ # Wait for any ongoing cluster operations to complete before proceeding
191+ echo " waiting for cluster to be in a ready state"
192+ az aks wait --resource-group " ${AKS_RESOURCE_GROUP} " --name " ${MGMT_CLUSTER_NAME} " --created --timeout 600 --only-show-errors
193+
194+ # Temporarily mitigate PDB issues by scaling up metrics-server before the update
195+ echo " temporarily scaling up metrics-server to avoid PDB drain issues"
196+ kubectl scale deployment metrics-server --replicas=3 -n kube-system || true
197+
198+ # Wait a moment for the pods to be scheduled
199+ sleep 15
200+
201+ # Retry the managed identity assignment with exponential backoff
202+ retry_count=0
203+ max_retries=5
204+ base_sleep=30
205+ until az aks update --resource-group " ${AKS_RESOURCE_GROUP} " \
190206 --name " ${MGMT_CLUSTER_NAME} " \
191207 --enable-managed-identity \
192208 --assign-identity " ${AKS_MI_RESOURCE_ID} " \
193209 --assign-kubelet-identity " ${AKS_MI_RESOURCE_ID} " \
194- --output none --only-show-errors --yes
210+ --output none --only-show-errors --yes; do
211+ retry_count=$(( retry_count + 1 ))
212+ if [ $retry_count -ge $max_retries ]; then
213+ echo " Failed to assign managed identity after $max_retries attempts"
214+ # Restore original metrics-server replicas before failing
215+ kubectl scale deployment metrics-server --replicas=2 -n kube-system || true
216+ exit 1
217+ fi
218+
219+ # Exponential backoff with jitter: base_sleep * (2^retry_count) + random(0-10)
220+ sleep_time=$(( base_sleep * (1 << retry_count) + RANDOM % 11 ))
221+ echo " Attempt $retry_count failed, retrying in $sleep_time seconds..."
222+ sleep $sleep_time
223+ done
224+
225+ # Restore original metrics-server replica count
226+ echo " restoring metrics-server to original replica count"
227+ kubectl scale deployment metrics-server --replicas=2 -n kube-system || true
195228
196229 else
197230 # echo "fetching Client ID for ${MGMT_CLUSTER_NAME}"
0 commit comments