

Merge pull request #736 from cybozu-go/eviction-dry-run
perform eviction dry-run during node reboot feature
umezawatakeshi authored May 24, 2024
2 parents bb70ba5 + e64f850 commit 14ef042
Showing 6 changed files with 213 additions and 14 deletions.
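For orientation before the diff: the feature relies on the Kubernetes Eviction API's dry-run mode, the same client-go call with DeleteOptions.DryRun that this commit adds to doEvictOrDeleteNodePod. Below is a minimal, self-contained sketch of that call — it is not code from this commit; the default kubeconfig path, the Pod name "example-pod", and the main() scaffolding are placeholders — showing that an eviction blocked by a PodDisruptionBudget is reported as an error (HTTP 429 / TooManyRequests) even though nothing is actually evicted.

package main

import (
	"context"
	"fmt"

	policyv1 "k8s.io/api/policy/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// dryRunEvict asks the API server whether evicting the given Pod would succeed,
// without actually evicting it.
func dryRunEvict(ctx context.Context, cs *kubernetes.Clientset, namespace, name string) error {
	return cs.CoreV1().Pods(namespace).EvictV1(ctx, &policyv1.Eviction{
		ObjectMeta:    metav1.ObjectMeta{Name: name, Namespace: namespace},
		DeleteOptions: &metav1.DeleteOptions{DryRun: []string{metav1.DryRunAll}},
	})
}

func main() {
	// Placeholder kubeconfig location; adjust for your environment.
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	cs, err := kubernetes.NewForConfig(config)
	if err != nil {
		panic(err)
	}
	// "example-pod" is a hypothetical Pod name used only for illustration.
	if err := dryRunEvict(context.Background(), cs, "reboot-test", "example-pod"); err != nil {
		fmt.Println("eviction would fail:", err)
		return
	}
	fmt.Println("eviction would succeed")
}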
3 changes: 3 additions & 0 deletions mtest/assets_test.go
@@ -14,6 +14,9 @@ var rebootJobCompletedYAML []byte
//go:embed reboot-job-running.yaml
var rebootJobRunningYAML []byte

//go:embed reboot-eviction-dry-run.yaml
var rebootEvictionDryRunYAML []byte

//go:embed reboot-slow-eviction-deployment.yaml
var rebootSlowEvictionDeploymentYAML []byte

81 changes: 81 additions & 0 deletions mtest/reboot-eviction-dry-run.yaml
@@ -0,0 +1,81 @@
apiVersion: apps/v1
kind: Deployment
metadata:
namespace: reboot-test
name: 1-not-evictable
spec:
replicas: 1
selector:
matchLabels:
reboot-app: 1-not-evictable
template:
metadata:
labels:
reboot-app: 1-not-evictable
spec:
containers:
- name: httpd
image: ghcr.io/cybozu/testhttpd:0
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
namespace: reboot-test
name: 1-not-evictable
spec:
maxUnavailable: 0
selector:
matchLabels:
reboot-app: 1-not-evictable
---
apiVersion: apps/v1
kind: Deployment
metadata:
namespace: reboot-test
name: 0-evictable
spec:
replicas: 1
selector:
matchLabels:
reboot-app: 0-evictable
template:
metadata:
labels:
reboot-app: 0-evictable
spec:
containers:
- name: httpd
image: ghcr.io/cybozu/testhttpd:0
affinity:
podAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchLabels:
reboot-app: 1-not-evictable
topologyKey: kubernetes.io/hostname
---
apiVersion: apps/v1
kind: Deployment
metadata:
namespace: reboot-test
name: 2-evictable
spec:
replicas: 1
selector:
matchLabels:
reboot-app: 2-evictable
template:
metadata:
labels:
reboot-app: 2-evictable
spec:
containers:
- name: httpd
image: ghcr.io/cybozu/testhttpd:0
affinity:
podAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchLabels:
reboot-app: 1-not-evictable
topologyKey: kubernetes.io/hostname
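The fixture above pins all three Deployments to one node: the two evictable Deployments use podAffinity on the 1-not-evictable label, and the PodDisruptionBudget with maxUnavailable: 0 guarantees that evicting 1-not-evictable is always rejected. The numeric prefixes also leave an evictable Pod both before and after the blocked one in name order. As a side illustration — a hypothetical helper, not part of this commit; the package and function names are invented — the sketch below shows how maxUnavailable: 0 surfaces in PDB status as zero allowed disruptions, which is what makes the eviction (and its dry run) fail.

package pdbcheck

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// reportBlockedPDBs prints every PodDisruptionBudget in the namespace that currently
// allows zero voluntary disruptions. With maxUnavailable: 0, as for 1-not-evictable
// above, DisruptionsAllowed is always 0, so any Pod it selects cannot be evicted.
func reportBlockedPDBs(ctx context.Context, cs *kubernetes.Clientset, namespace string) error {
	pdbs, err := cs.PolicyV1().PodDisruptionBudgets(namespace).List(ctx, metav1.ListOptions{})
	if err != nil {
		return err
	}
	for _, pdb := range pdbs.Items {
		if pdb.Status.DisruptionsAllowed == 0 {
			fmt.Printf("PDB %s/%s allows no disruptions\n", pdb.Namespace, pdb.Name)
		}
	}
	return nil
}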
44 changes: 44 additions & 0 deletions mtest/reboot_test.go
@@ -329,6 +329,50 @@ func testRebootOperations() {
Expect(err).ShouldNot(HaveOccurred(), "stderr: %s", stderr)
})

It("checks eviction dry run behavior", func() {
deploymentNames := []string{"0-evictable", "1-not-evictable", "2-evictable"}

By("Deploying Pods")
_, stderr, err := kubectlWithInput(rebootEvictionDryRunYAML, "apply", "-f", "-")
Expect(err).ShouldNot(HaveOccurred(), "stderr: %s", stderr)

creationTime := map[string]time.Time{}
var nodeName string
for _, name := range deploymentNames {
checkDeploymentEventually("reboot-test", name)
pods := getPodListGomega(Default, "reboot-test", "-l=reboot-app="+name)
Expect(pods.Items).Should(HaveLen(1), "pod is not created")
pod := &pods.Items[0]
Expect(pod.Status.Phase).Should(Equal(corev1.PodRunning), "pod is not running")
creationTime[name] = pod.ObjectMeta.CreationTimestamp.Time
nodeName = pod.Spec.NodeName // all pods run on the same node
}

By("Reboot operation will not delete evictable pods if non-evictable pods exist on the same node")
rebootQueueAdd([]string{nodeName})
rebootShouldNotProceed()

for _, name := range deploymentNames {
pods := getPodListGomega(Default, "reboot-test", "-l=reboot-app="+name)
Expect(pods.Items).Should(HaveLen(1), "pod does not exist (probably evicted)")
pod := pods.Items[0]
Expect(pod.ObjectMeta.CreationTimestamp.Time).Should(Equal(creationTime[name]), "pod by deployment %s is recreated (probably evicted)", name)
}

By("Cleaning up")
rebootQueueCancelAllAndWait(cluster)

_, stderr, err = kubectlWithInput(rebootEvictionDryRunYAML, "delete", "-f", "-")
Expect(err).ShouldNot(HaveOccurred(), "stderr: %s", stderr)

for _, name := range deploymentNames {
Eventually(func(g Gomega) {
deploymentPods := getPodListGomega(g, "reboot-test", "-l=reboot-app="+name)
g.Expect(deploymentPods.Items).Should(HaveLen(0), "Pod does not terminate")
}).Should(Succeed())
}
})

It("checks drain timeout behavior", func() {
By("Deploying a slow eviction pod")
_, stderr, err := kubectlWithInput(rebootSlowEvictionDeploymentYAML, "apply", "-f", "-")
23 changes: 21 additions & 2 deletions op/reboot.go
@@ -138,11 +138,20 @@ func (c rebootDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure
// This "check and cordon" order may overlook Job pods just started.
// On the other hand, "cordon and check" may cause excessive cordon.
// The overlook is acceptable because it is rare case and detected by the following evictOrDeleteNodePod().

err = checkJobPodNotExist(ctx, cs, entry.Node)
log.Info("start eviction dry-run", map[string]interface{}{
"name": entry.Node,
})
err = dryRunEvictOrDeleteNodePod(ctx, cs, entry.Node, protected)
if err != nil {
log.Warn("eviction dry-run failed", map[string]interface{}{
"name": entry.Node,
log.FnError: err,
})
return err
}
log.Info("eviction dry-run succeeded", map[string]interface{}{
"name": entry.Node,
})

_, err = nodesAPI.Patch(ctx, entry.Node, types.StrategicMergePatchType, []byte(`
{
@@ -169,13 +178,23 @@ func (c rebootDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure

// next, evict pods on each node
for _, entry := range evictNodes {
log.Info("start eviction", map[string]interface{}{
"name": entry.Node,
})
err := evictOrDeleteNodePod(ctx, cs, entry.Node, protected, c.evictAttempts, c.evictInterval)
if err != nil {
log.Warn("eviction failed", map[string]interface{}{
"name": entry.Node,
log.FnError: err,
})
c.notifyFailedNode(entry.Node)
err = drainBackOff(ctx, inf, entry, err)
if err != nil {
return err
}
log.Info("eviction succeeded", map[string]interface{}{
"name": entry.Node,
})
}
}

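Control-flow summary for op/reboot.go: the change keeps the existing two-phase structure. The first loop over entries now performs the eviction dry-run (replacing the old checkJobPodNotExist call) and then cordons the node; only the second loop ("next, evict pods on each node") performs real evictions. An un-evictable Pod is therefore reported before its node is cordoned and before any real eviction starts.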
53 changes: 42 additions & 11 deletions op/reboot_decide.go
@@ -61,46 +61,74 @@ func enumeratePods(ctx context.Context, cs *kubernetes.Clientset, node string,
return nil
}

// checkJobPodNotExist checks running Pods on the specified Node.
// It returns an error if a running Pod exists.
func checkJobPodNotExist(ctx context.Context, cs *kubernetes.Clientset, node string) error {
return enumeratePods(ctx, cs, node, func(_ *corev1.Pod) error {
return nil
}, func(pod *corev1.Pod) error {
return fmt.Errorf("job-managed pod exists: %s/%s, phase=%s", pod.Namespace, pod.Name, pod.Status.Phase)
})
// dryRunEvictOrDeleteNodePod checks whether eviction or deletion of Pods on the specified Node can proceed.
// It returns an error if a running Job Pod exists or an eviction of a Pod in a protected namespace would fail.
func dryRunEvictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool) error {
return doEvictOrDeleteNodePod(ctx, cs, node, protected, 0, 0, true)
}

// evictOrDeleteNodePod evicts or deletes Pods on the specified Node.
// It first tries eviction. If the eviction fails and the Pod's namespace is not protected, it deletes the Pod.
// If a running Job Pod exists, this function returns an error.
func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool, attempts int, interval time.Duration) error {
return doEvictOrDeleteNodePod(ctx, cs, node, protected, attempts, interval, false)
}

// doEvictOrDeleteNodePod evicts or deletes Pods on the specified Node.
// It first tries eviction.
// If the eviction fails and the Pod's namespace is not protected, it deletes the Pod.
// If the eviction fails and the Pod's namespace is protected, it retries after `interval` at most `attempts` times.
// If a running Job Pod exists, this function returns an error.
// If `dry` is true, it performs a dry run, and `attempts` and `interval` are ignored.
func doEvictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool, attempts int, interval time.Duration, dry bool) error {
var deleteOptions *metav1.DeleteOptions
if dry {
deleteOptions = &metav1.DeleteOptions{
DryRun: []string{"All"},
}
}

return enumeratePods(ctx, cs, node, func(pod *corev1.Pod) error {
if dry && !protected[pod.Namespace] {
// in the case of a dry run for a Pod in a non-protected namespace,
// return immediately because its "eviction or deletion" never fails
log.Info("skip evicting pod because its namespace is not protected", map[string]interface{}{
"namespace": pod.Namespace,
"name": pod.Name,
"dry": dry, // for consistency
})
return nil
}

evictCount := 0
EVICT:
log.Info("start evicting pod", map[string]interface{}{
"namespace": pod.Namespace,
"name": pod.Name,
"dry": dry,
})
err := cs.CoreV1().Pods(pod.Namespace).EvictV1(ctx, &policyv1.Eviction{
ObjectMeta: metav1.ObjectMeta{Name: pod.Name, Namespace: pod.Namespace},
ObjectMeta: metav1.ObjectMeta{Name: pod.Name, Namespace: pod.Namespace},
DeleteOptions: deleteOptions,
})
evictCount++
switch {
case err == nil:
log.Info("evicted pod", map[string]interface{}{
"namespace": pod.Namespace,
"name": pod.Name,
"dry": dry,
})
case apierrors.IsNotFound(err):
// already evicted or deleted.
case !apierrors.IsTooManyRequests(err):
// not a PDB related error
return fmt.Errorf("failed to evict pod %s/%s: %w", pod.Namespace, pod.Name, err)
case !protected[pod.Namespace]:
// dry is always false here: the dry run returns early above for non-protected namespaces
log.Warn("failed to evict non-protected pod due to PDB", map[string]interface{}{
"namespace": pod.Namespace,
"name": pod.Name,
"dry": dry, // for consistency (same for below)
log.FnError: err,
})
err := cs.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
@@ -110,9 +138,11 @@ func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node st
log.Warn("deleted non-protected pod", map[string]interface{}{
"namespace": pod.Namespace,
"name": pod.Name,
"dry": dry,
})
default:
if evictCount < attempts {
// in case of dry-run, do not retry.
if !dry && evictCount < attempts {
select {
case <-ctx.Done():
return ctx.Err()
Expand All @@ -121,6 +151,7 @@ func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node st
log.Info("retry eviction of pod", map[string]interface{}{
"namespace": pod.Namespace,
"name": pod.Name,
"dry": dry,
})
goto EVICT
}
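Summary of the dry-run path in op/reboot_decide.go: with dry=true, doEvictOrDeleteNodePod (a) skips Pods in non-protected namespaces, since their eviction-or-deletion fallback cannot fail; (b) sends the Eviction with DeleteOptions{DryRun: ["All"]}, so the API server validates the request, including the PodDisruptionBudget check, without persisting anything; and (c) never retries, so `attempts` and `interval` are ignored.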
23 changes: 22 additions & 1 deletion op/repair_drain_start.go
@@ -6,6 +6,7 @@ import (
"time"

"github.com/cybozu-go/cke"
"github.com/cybozu-go/log"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
)
@@ -87,10 +88,20 @@ func (c repairDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure
return err
}

err = checkJobPodNotExist(ctx, cs, c.entry.Nodename)
log.Info("start eviction dry-run", map[string]interface{}{
"address": c.entry.Address,
})
err = dryRunEvictOrDeleteNodePod(ctx, cs, c.entry.Nodename, protected)
if err != nil {
log.Warn("eviction dry-run failed", map[string]interface{}{
"address": c.entry.Address,
log.FnError: err,
})
return err
}
log.Info("eviction dry-run succeeded", map[string]interface{}{
"address": c.entry.Address,
})

// Note: The annotation name is shared with reboot operations.
_, err = nodesAPI.Patch(ctx, c.entry.Nodename, types.StrategicMergePatchType, []byte(`
@@ -109,10 +120,20 @@ func (c repairDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure
return repairDrainBackOff(ctx, inf, c.entry, err)
}

log.Info("start eviction", map[string]interface{}{
"address": c.entry.Address,
})
err = evictOrDeleteNodePod(ctx, cs, c.entry.Nodename, protected, c.evictAttempts, c.evictInterval)
if err != nil {
log.Warn("eviction failed", map[string]interface{}{
"address": c.entry.Address,
log.FnError: err,
})
return repairDrainBackOff(ctx, inf, c.entry, err)
}
log.Info("eviction succeeded", map[string]interface{}{
"address": c.entry.Address,
})

return nil
}
