Skip to content

Commit

Permalink
fix(test): check restart separately from ready pods. (#951)
Browse files Browse the repository at this point in the history
# Description

When assessing the stability of components within a cluster, container
restarts can sometimes be ignored if they are expected or non-critical.
This pull request adds an attribute to allow the option to ignore
container restarts during end-to-end (E2E) cluster confidence tests.

## Related Issue

If this pull request is related to any issue, please mention it here.
Additionally, make sure that the issue is assigned to you before
submitting this pull request.

## Checklist

- [x] I have read the [contributing
documentation](https://retina.sh/docs/contributing).
- [x] I signed and signed-off the commits (`git commit -S -s ...`). See
[this
documentation](https://docs.github.com/en/authentication/managing-commit-signature-verification/about-commit-signature-verification)
on signing commits.
- [x] I have correctly attributed the author(s) of the code.
- [x] I have tested the changes locally.
- [x] I have followed the project's style guidelines.
- [x] I have updated the documentation, if necessary.
- [x] I have added tests, if applicable.

## Screenshots (if applicable) or Testing Completed

Please add any relevant screenshots or GIFs to showcase the changes
made.

## Additional Notes

Add any additional notes or context about the pull request here.

---

Please refer to the [CONTRIBUTING.md](../CONTRIBUTING.md) file for more
information on how to contribute to this project.
  • Loading branch information
jimassa authored Nov 6, 2024
1 parent ed639b0 commit 2b109a3
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,6 @@ func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, names
return false, fmt.Errorf("error getting Pod: %w", err)
}

for istatus := range pod.Status.ContainerStatuses {
status := &pod.Status.ContainerStatuses[istatus]
if status.RestartCount > 0 {
return false, fmt.Errorf("pod %s has %d restarts: status: %+v: %w", pod.Name, status.RestartCount, status, ErrPodCrashed)
}
}

// Check the Pod phase
if pod.Status.Phase != corev1.PodRunning {
if printIterator%printInterval == 0 {
Expand Down Expand Up @@ -85,3 +78,21 @@ func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, names
}
return nil
}

// CheckContainerRestart lists the Pods in namespace that match labelSelector
// and returns an error wrapping ErrPodCrashed for the first container found
// with a RestartCount greater than zero. A nil return means no matching Pod
// had any container restarts.
func CheckContainerRestart(ctx context.Context, clientset *kubernetes.Clientset, namespace, labelSelector string) error {
	// The redundant `var podList *corev1.PodList` pre-declaration was removed;
	// := declares podList directly.
	podList, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
	if err != nil {
		return fmt.Errorf("error listing Pods: %w", err)
	}

	for _, pod := range podList.Items {
		// Index into ContainerStatuses rather than ranging by value to avoid
		// copying each status struct.
		for istatus := range pod.Status.ContainerStatuses {
			status := &pod.Status.ContainerStatuses[istatus]
			if status.RestartCount > 0 {
				return fmt.Errorf("pod %s has %d container restarts: status: %+v: %w", pod.Name, status.RestartCount, status, ErrPodCrashed)
			}
		}
	}
	return nil
}
20 changes: 16 additions & 4 deletions test/e2e/framework/kubernetes/no-crashes.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,18 @@ import (

var ErrPodCrashed = fmt.Errorf("pod has crashes")

type EnsureStableCluster struct {
type EnsureStableComponent struct {
LabelSelector string
PodNamespace string
KubeConfigFilePath string

// Container restarts can occur for various reasons; they do not necessarily mean the entire cluster
// is unstable or needs to be recreated. In some cases, container restarts are expected and acceptable.
// This flag should be set to true only in those cases, together with additional context explaining why the restarts are acceptable.
IgnoreContainerRestart bool
}

func (n *EnsureStableCluster) Run() error {
func (n *EnsureStableComponent) Run() error {
config, err := clientcmd.BuildConfigFromFlags("", n.KubeConfigFilePath)
if err != nil {
return fmt.Errorf("error building kubeconfig: %w", err)
Expand All @@ -31,13 +36,20 @@ func (n *EnsureStableCluster) Run() error {
if err != nil {
return fmt.Errorf("error waiting for retina pods to be ready: %w", err)
}

if !n.IgnoreContainerRestart {
err = CheckContainerRestart(context.TODO(), clientset, n.PodNamespace, n.LabelSelector)
if err != nil {
return fmt.Errorf("error checking pod restarts: %w", err)
}
}
return nil
}

func (n *EnsureStableCluster) Prevalidate() error {
// Prevalidate is a no-op: this step requires no validation before Run is called.
func (n *EnsureStableComponent) Prevalidate() error {
	return nil
}

func (n *EnsureStableCluster) Stop() error {
func (n *EnsureStableComponent) Stop() error {
return nil
}
14 changes: 8 additions & 6 deletions test/e2e/jobs/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,10 @@ func InstallAndTestRetinaBasicMetrics(kubeConfigFilePath, chartPath string, test
job.AddScenario(dns.ValidateBasicDNSMetrics(scenario.name, scenario.req, scenario.resp, testPodNamespace))
}

job.AddStep(&kubernetes.EnsureStableCluster{
PodNamespace: common.KubeSystemNamespace,
LabelSelector: "k8s-app=retina",
job.AddStep(&kubernetes.EnsureStableComponent{
PodNamespace: common.KubeSystemNamespace,
LabelSelector: "k8s-app=retina",
IgnoreContainerRestart: false,
}, nil)

return job
Expand Down Expand Up @@ -204,9 +205,10 @@ func UpgradeAndTestRetinaAdvancedMetrics(kubeConfigFilePath, chartPath, valuesFi

job.AddScenario(latency.ValidateLatencyMetric(testPodNamespace))

job.AddStep(&kubernetes.EnsureStableCluster{
PodNamespace: common.KubeSystemNamespace,
LabelSelector: "k8s-app=retina",
job.AddStep(&kubernetes.EnsureStableComponent{
PodNamespace: common.KubeSystemNamespace,
LabelSelector: "k8s-app=retina",
IgnoreContainerRestart: false,
}, nil)

return job
Expand Down

0 comments on commit 2b109a3

Please sign in to comment.