
Commit d631485

Fix pdb flake and delete flow for aks mgmt cluster
1 parent ed22c64 commit d631485


2 files changed: 53 additions & 8 deletions


e2e.mk

Lines changed: 18 additions & 6 deletions
@@ -4,8 +4,8 @@
 # long-running E2E jobs every time that file changes
 
 ##@ E2E Testing:
-.PHONY: test-e2e-run
-test-e2e-run: generate-e2e-templates install-tools create-bootstrap ## Run e2e tests.
+.PHONY: test-e2e-run-steps
+test-e2e-run-steps: generate-e2e-templates install-tools create-bootstrap ## Run e2e test steps without cleanup.
 	if [ "$(MGMT_CLUSTER_TYPE)" == "aks" ]; then \
 		source ./scripts/peer-vnets.sh && source_tilt_settings tilt-settings.yaml; \
 	fi; \
@@ -17,14 +17,26 @@ test-e2e-run: generate-e2e-templates install-tools create-bootstrap ## Run e2e t
 	-e2e.artifacts-folder="$(ARTIFACTS)" \
 	-e2e.config="$(E2E_CONF_FILE_ENVSUBST)" \
 	-e2e.skip-log-collection="$(SKIP_LOG_COLLECTION)" \
-	-e2e.skip-resource-cleanup=$(SKIP_CLEANUP) -e2e.use-existing-cluster=$(SKIP_CREATE_MGMT_CLUSTER) $(E2E_ARGS) \
-	$(MAKE) cleanup-workload-identity
-	$(MAKE) clean-release-git
+	-e2e.skip-resource-cleanup=$(SKIP_CLEANUP) -e2e.use-existing-cluster=$(SKIP_CREATE_MGMT_CLUSTER) $(E2E_ARGS)
+
+.PHONY: test-e2e-cleanup
+test-e2e-cleanup: ## Clean up e2e test resources.
+	$(MAKE) cleanup-workload-identity || true
+	$(MAKE) clean-release-git || true
 	if [ "$(MGMT_CLUSTER_TYPE)" == "aks" ] && [ "$(SKIP_CLEANUP)" != "true" ]; then \
 		echo "Cleaning up AKS management cluster..."; \
-		$(MAKE) aks-delete; \
+		$(MAKE) aks-delete || true; \
 	fi
 
+.PHONY: test-e2e-run
+test-e2e-run: ## Run e2e tests.
+	@set +e; \
+	$(MAKE) test-e2e-run-steps; \
+	EXIT_CODE=$$?; \
+	set -e; \
+	$(MAKE) test-e2e-cleanup; \
+	exit $$EXIT_CODE
+
 .PHONY: test-e2e
 test-e2e: ## Run "docker-build" and "docker-push" rules then run e2e tests.
 	PULL_POLICY=IfNotPresent MANAGER_IMAGE=$(CONTROLLER_IMG)-$(ARCH):$(TAG) \
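The reworked test-e2e-run target splits the old monolithic recipe into test-e2e-run-steps (the actual test run) and test-e2e-cleanup, and wraps them so cleanup always executes while the test phase's exit code is still what the caller sees. A minimal standalone shell sketch of that pattern, with run_steps and cleanup as placeholders standing in for the two make targets:

#!/usr/bin/env bash
set -e

run_steps() { make test-e2e-run-steps; }   # stand-in for the test phase
cleanup()   { make test-e2e-cleanup; }     # stand-in for the cleanup phase

set +e                # let the test phase fail without aborting the script
run_steps
EXIT_CODE=$?          # remember how the tests ended
set -e                # fail fast again from here on
cleanup               # always runs, even after a test failure
exit $EXIT_CODE       # surface the original test result to the caller

Because the cleanup sub-steps each append "|| true", a partial cleanup failure does not mask the test result either.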

scripts/aks-as-mgmt.sh

Lines changed: 35 additions & 2 deletions
@@ -186,12 +186,45 @@ create_aks_cluster() {
   export USER_IDENTITY
 
   echo "assigning user-assigned managed identity to the AKS cluster"
-  az aks update --resource-group "${AKS_RESOURCE_GROUP}" \
+
+  # Wait for any ongoing cluster operations to complete before proceeding
+  echo "waiting for cluster to be in a ready state"
+  az aks wait --resource-group "${AKS_RESOURCE_GROUP}" --name "${MGMT_CLUSTER_NAME}" --created --timeout 600 --only-show-errors
+
+  # Temporarily mitigate PDB issues by scaling up metrics-server before the update
+  echo "temporarily scaling up metrics-server to avoid PDB drain issues"
+  kubectl scale deployment metrics-server --replicas=3 -n kube-system || true
+
+  # Wait a moment for the pods to be scheduled
+  sleep 15
+
+  # Retry the managed identity assignment with exponential backoff
+  retry_count=0
+  max_retries=5
+  base_sleep=30
+  until az aks update --resource-group "${AKS_RESOURCE_GROUP}" \
     --name "${MGMT_CLUSTER_NAME}" \
     --enable-managed-identity \
     --assign-identity "${AKS_MI_RESOURCE_ID}" \
     --assign-kubelet-identity "${AKS_MI_RESOURCE_ID}" \
-    --output none --only-show-errors --yes
+    --output none --only-show-errors --yes; do
+    retry_count=$((retry_count + 1))
+    if [ $retry_count -ge $max_retries ]; then
+      echo "Failed to assign managed identity after $max_retries attempts"
+      # Restore original metrics-server replicas before failing
+      kubectl scale deployment metrics-server --replicas=2 -n kube-system || true
+      exit 1
+    fi
+
+    # Exponential backoff with jitter: base_sleep * (2^retry_count) + random(0-10)
+    sleep_time=$((base_sleep * (1 << retry_count) + RANDOM % 11))
+    echo "Attempt $retry_count failed, retrying in $sleep_time seconds..."
+    sleep $sleep_time
+  done
+
+  # Restore original metrics-server replica count
+  echo "restoring metrics-server to original replica count"
+  kubectl scale deployment metrics-server --replicas=2 -n kube-system || true
 
 else
   # echo "fetching Client ID for ${MGMT_CLUSTER_NAME}"
