Skip to content

Commit

Permalink
fix(test): check restart separately from ready pods. (#951)
Browse files Browse the repository at this point in the history
# Description

When assessing the stability of components within a cluster, container
restarts can sometimes be ignored if they are expected or non-critical.
This pull request adds an attribute to allow the option to ignore
container restarts during end-to-end (E2E) cluster confidence tests.

## Related Issue

If this pull request is related to any issue, please mention it here.
Additionally, make sure that the issue is assigned to you before
submitting this pull request.

## Checklist

- [x] I have read the [contributing
documentation](https://retina.sh/docs/contributing).
- [x] I signed and signed-off the commits (`git commit -S -s ...`). See
[this
documentation](https://docs.github.com/en/authentication/managing-commit-signature-verification/about-commit-signature-verification)
on signing commits.
- [x] I have correctly attributed the author(s) of the code.
- [x] I have tested the changes locally.
- [x] I have followed the project's style guidelines.
- [x] I have updated the documentation, if necessary.
- [x] I have added tests, if applicable.

## Screenshots (if applicable) or Testing Completed

Please add any relevant screenshots or GIFs to showcase the changes
made.

## Additional Notes

Add any additional notes or context about the pull request here.

---

Please refer to the [CONTRIBUTING.md](../CONTRIBUTING.md) file for more
information on how to contribute to this project.
  • Loading branch information
jimassa authored Nov 6, 2024
1 parent ed639b0 commit 2b109a3
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,6 @@ func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, names
return false, fmt.Errorf("error getting Pod: %w", err)
}

for istatus := range pod.Status.ContainerStatuses {
status := &pod.Status.ContainerStatuses[istatus]
if status.RestartCount > 0 {
return false, fmt.Errorf("pod %s has %d restarts: status: %+v: %w", pod.Name, status.RestartCount, status, ErrPodCrashed)
}
}

// Check the Pod phase
if pod.Status.Phase != corev1.PodRunning {
if printIterator%printInterval == 0 {
Expand Down Expand Up @@ -85,3 +78,21 @@ func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, names
}
return nil
}

// CheckContainerRestart lists the Pods in namespace that match labelSelector
// and returns an error wrapping ErrPodCrashed for the first container found
// with a RestartCount greater than zero. A nil return means no matching Pod
// had any container restarts.
func CheckContainerRestart(ctx context.Context, clientset *kubernetes.Clientset, namespace, labelSelector string) error {
	// The redundant `var podList *corev1.PodList` pre-declaration was removed;
	// := declares podList directly.
	podList, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
	if err != nil {
		return fmt.Errorf("error listing Pods: %w", err)
	}

	for _, pod := range podList.Items {
		// Index into ContainerStatuses rather than ranging by value to avoid
		// copying each status struct.
		for istatus := range pod.Status.ContainerStatuses {
			status := &pod.Status.ContainerStatuses[istatus]
			if status.RestartCount > 0 {
				return fmt.Errorf("pod %s has %d container restarts: status: %+v: %w", pod.Name, status.RestartCount, status, ErrPodCrashed)
			}
		}
	}
	return nil
}
20 changes: 16 additions & 4 deletions test/e2e/framework/kubernetes/no-crashes.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,18 @@ import (

var ErrPodCrashed = fmt.Errorf("pod has crashes")

type EnsureStableCluster struct {
type EnsureStableComponent struct {
LabelSelector string
PodNamespace string
KubeConfigFilePath string

// Container restarts can occur for various reasons; they do not necessarily mean the entire cluster
// is unstable or needs to be recreated. In some cases, container restarts are expected and acceptable.
// This flag should be set to true only in those cases, together with additional context explaining why the restarts are acceptable.
IgnoreContainerRestart bool
}

func (n *EnsureStableCluster) Run() error {
func (n *EnsureStableComponent) Run() error {
config, err := clientcmd.BuildConfigFromFlags("", n.KubeConfigFilePath)
if err != nil {
return fmt.Errorf("error building kubeconfig: %w", err)
Expand All @@ -31,13 +36,20 @@ func (n *EnsureStableCluster) Run() error {
if err != nil {
return fmt.Errorf("error waiting for retina pods to be ready: %w", err)
}

if !n.IgnoreContainerRestart {
err = CheckContainerRestart(context.TODO(), clientset, n.PodNamespace, n.LabelSelector)
if err != nil {
return fmt.Errorf("error checking pod restarts: %w", err)
}
}
return nil
}

func (n *EnsureStableCluster) Prevalidate() error {
// Prevalidate is a no-op: this step requires no validation before Run is called.
func (n *EnsureStableComponent) Prevalidate() error {
	return nil
}

func (n *EnsureStableCluster) Stop() error {
func (n *EnsureStableComponent) Stop() error {
return nil
}
14 changes: 8 additions & 6 deletions test/e2e/jobs/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,10 @@ func InstallAndTestRetinaBasicMetrics(kubeConfigFilePath, chartPath string, test
job.AddScenario(dns.ValidateBasicDNSMetrics(scenario.name, scenario.req, scenario.resp, testPodNamespace))
}

job.AddStep(&kubernetes.EnsureStableCluster{
PodNamespace: common.KubeSystemNamespace,
LabelSelector: "k8s-app=retina",
job.AddStep(&kubernetes.EnsureStableComponent{
PodNamespace: common.KubeSystemNamespace,
LabelSelector: "k8s-app=retina",
IgnoreContainerRestart: false,
}, nil)

return job
Expand Down Expand Up @@ -204,9 +205,10 @@ func UpgradeAndTestRetinaAdvancedMetrics(kubeConfigFilePath, chartPath, valuesFi

job.AddScenario(latency.ValidateLatencyMetric(testPodNamespace))

job.AddStep(&kubernetes.EnsureStableCluster{
PodNamespace: common.KubeSystemNamespace,
LabelSelector: "k8s-app=retina",
job.AddStep(&kubernetes.EnsureStableComponent{
PodNamespace: common.KubeSystemNamespace,
LabelSelector: "k8s-app=retina",
IgnoreContainerRestart: false,
}, nil)

return job
Expand Down

0 comments on commit 2b109a3

Please sign in to comment.