From 513c31b4fbe0c4b46157fb463a395848d14f31b5 Mon Sep 17 00:00:00 2001 From: Sergey Nuzhdin Date: Tue, 30 Jun 2020 07:51:03 +0100 Subject: [PATCH 1/3] fix #51 pending jobs not being deleted Signed-off-by: Sergey Nuzhdin --- pkg/controller/controller.go | 70 +++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index 163afa4e..7263bc25 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -165,7 +165,7 @@ func (c *Kleaner) Process(obj interface{}) { return } - finishTime := extractJobFinishTime(job) + finishTime := jobFinishTime(job) if finishTime.IsZero() { return @@ -191,22 +191,19 @@ func (c *Kleaner) Process(obj interface{}) { } pod := t owners := getPodOwnerKinds(pod) - podFinishTime := extractPodFinishTime(pod) - if podFinishTime.IsZero() { - return - } - age := time.Since(podFinishTime) - // orphaned pod: those that do not have any owner references - // - uses c.deleteOrphanedAfter - if len(owners) == 0 { - if c.deleteOrphanedAfter > 0 && age >= c.deleteOrphanedAfter { - c.deletePods(pod) + podFinishTime := podFinishTime(pod) + if !podFinishTime.IsZero() { + age := time.Since(podFinishTime) + // orphaned pod: those that do not have any owner references + // - uses c.deleteOrphanedAfter + if len(owners) == 0 { + if c.deleteOrphanedAfter > 0 && age >= c.deleteOrphanedAfter { + c.deletePod(pod) + } } - return - } - // owned by job, have exactly one ownerReference present and its kind is Job - // - uses the c.deleteSuccessfulAfter, c.deleteFailedAfter, c.deletePendingAfter - if isOwnedByJob(owners) { + // owned by job, have exactly one ownerReference present and its kind is Job + // - uses the c.deleteSuccessfulAfter, c.deleteFailedAfter, c.deletePendingAfter + if isOwnedByJob(owners) { jobOwnerName := pod.OwnerReferences[0].Name jobOwner, exists, err := c.jobInformer.GetStore().GetByKey(pod.Namespace + "/" + jobOwnerName) if err != nil { @@ -215,16 +212,26 @@ func (c *Kleaner) Process(obj interface{}) { } else if exists && isOwnedByCronJob(getJobOwnerKinds(jobOwner.(*batchv1.Job))) { return } - toDelete := c.maybeDeletePod(pod.Status.Phase, age) - if toDelete { - c.deletePods(pod) + toDelete := c.maybeDeletePod(pod.Status.Phase, age) + if toDelete { + c.deletePod(pod) + } + return } - return } // evicted pods, those with or without owner references, but in Evicted state // - uses c.deleteEvictedAfter - if pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "Evicted" && c.deleteEvictedAfter > 0 && age >= c.deleteEvictedAfter { - c.deletePods(pod) + if pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "Evicted" && c.deleteEvictedAfter > 0 { + c.deletePod(pod) + } + if pod.Status.Phase == corev1.PodPending && c.deletePendingAfter > 0 { + t := podLastTransitionTime(pod) + if t.IsZero() { + return + } + if time.Now().Sub(t) >= c.deleteEvictedAfter { + c.deletePod(pod) + } } } } @@ -245,7 +252,7 @@ func (c *Kleaner) deleteJobs(job *batchv1.Job) { metrics.GetOrCreateCounter(metricName(jobDeletedMetric, job.Namespace)).Inc() } -func (c *Kleaner) deletePods(pod *corev1.Pod) { +func (c *Kleaner) deletePod(pod *corev1.Pod) { if c.dryRun { log.Printf("dry-run: Pod '%s:%s' would have been deleted", pod.Namespace, pod.Name) return @@ -270,10 +277,6 @@ func (c *Kleaner) maybeDeletePod(podPhase corev1.PodPhase, timeSinceFinish time. if c.deleteFailedAfter > 0 && timeSinceFinish >= c.deleteFailedAfter { return true } - case corev1.PodPending: - if c.deletePendingAfter > 0 && timeSinceFinish >= c.deletePendingAfter { - return true - } default: return false } @@ -314,7 +317,16 @@ func isOwnedByCronJob(ownerKinds []string) bool { return false } -func extractPodFinishTime(podObj *corev1.Pod) time.Time { +func podLastTransitionTime(podObj *corev1.Pod) time.Time { + for _, pc := range podObj.Status.Conditions { + if pc.Type == corev1.PodScheduled && pc.Status == corev1.ConditionFalse { + return pc.LastTransitionTime.Time + } + } + return time.Time{} +} + +func podFinishTime(podObj *corev1.Pod) time.Time { for _, pc := range podObj.Status.Conditions { // Looking for the time when pod's condition "Ready" became "false" (equals end of execution) if pc.Type == corev1.PodReady && pc.Status == corev1.ConditionFalse { @@ -325,7 +337,7 @@ func extractPodFinishTime(podObj *corev1.Pod) time.Time { } // Can return "zero" time, caller must check -func extractJobFinishTime(jobObj *batchv1.Job) time.Time { +func jobFinishTime(jobObj *batchv1.Job) time.Time { if !jobObj.Status.CompletionTime.IsZero() { return jobObj.Status.CompletionTime.Time } From a500d16dc4316a37e59425127d6455efe5fe26a8 Mon Sep 17 00:00:00 2001 From: Sergey Nuzhdin Date: Sat, 4 Jul 2020 12:21:59 +0100 Subject: [PATCH 2/3] Separate pod and job decision logic, add tests Introduce `ignore-owned-by-cronjobs` flag to explicitly opt-in for the change of logic introduced in 7e342c16f94d6e8aec1a0a7811b210b1bcf14e6b. Flag is marked as `EXPERIMENTAL`, which means it could be changed or renamed in the coming versions. Split pod and job decision logic to be able to unit test it. Add tests. Signed-off-by: Sergey Nuzhdin --- Makefile | 7 +- cmd/main.go | 10 +- pkg/controller/controller.go | 181 ++++------------------------------- pkg/controller/job.go | 75 +++++++++++++++ pkg/controller/job_test.go | 99 +++++++++++++++++++ pkg/controller/pod.go | 109 +++++++++++++++++++++ pkg/controller/pod_test.go | 164 +++++++++++++++++++++++++++++++ 7 files changed, 480 insertions(+), 165 deletions(-) create mode 100644 pkg/controller/job.go create mode 100644 pkg/controller/job_test.go create mode 100644 pkg/controller/pod.go create mode 100644 pkg/controller/pod_test.go diff --git a/Makefile b/Makefile index 471e27ed..13cc87f1 100644 --- a/Makefile +++ b/Makefile @@ -2,14 +2,14 @@ NAME := kube-cleanup-operator AUTHOR=lwolf VERSION ?= 0.6.0 REGISTRY ?= quay.io -GIT_SHA=$(shell git --no-pager describe --always --dirty) +GIT_SHA=$(shell git rev-list --count HEAD)-$(shell git rev-parse --short=7 HEAD) COMMIT_TIME=$(shell git show --format=%ct --no-patch) LFLAGS ?= -X main.gitsha=${GIT_SHA} -X main.committed=${COMMIT_TIME} ROOT_DIR=${PWD} GOVERSION ?= 1.14.0 HARDWARE=$(shell uname -m) -.PHONY: build docker static release install_deps +.PHONY: build docker static release install_deps test default: build @@ -26,6 +26,9 @@ build: golang @mkdir -p bin go build -mod=vendor -ldflags "${LFLAGS}" -o bin/$(NAME) ./cmd +test: + go test -race ./pkg/... -coverprofile=coverage.txt -covermode=atomic + static: golang @echo "--> Compiling the static binary" @mkdir -p bin diff --git a/cmd/main.go b/cmd/main.go index a26fcaa8..e3760b4c 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -24,6 +24,11 @@ import ( "github.com/lwolf/kube-cleanup-operator/pkg/controller" ) +var ( + gitsha string + committed string +) + func setupLogging() { // Set logging output to standard console out log.SetOutput(os.Stdout) @@ -45,6 +50,7 @@ func main() { deleteOrphanedAfter := flag.Duration("delete-orphaned-pods-after", 1*time.Hour, "Delete orphaned pods. Pods without an owner in non-running state (golang duration format, e.g 5m), 0 - never delete") deleteEvictedAfter := flag.Duration("delete-evicted-pods-after", 15*time.Minute, "Delete pods in evicted state (golang duration format, e.g 5m), 0 - never delete") deletePendingAfter := flag.Duration("delete-pending-pods-after", 0, "Delete pods in pending state after X duration (golang duration format, e.g 5m), 0 - never delete") + ignoreOwnedByCronjob := flag.Bool("ignore-owned-by-cronjobs", false, "[EXPERIMENTAL] Do not cleanup pods and jobs created by cronjobs") legacyKeepSuccessHours := flag.Int64("keep-successful", 0, "Number of hours to keep successful jobs, -1 - forever, 0 - never (default), >0 number of hours") legacyKeepFailedHours := flag.Int64("keep-failures", -1, "Number of hours to keep failed jobs, -1 - forever (default) 0 - never, >0 number of hours") @@ -55,7 +61,7 @@ func main() { flag.Parse() setupLogging() - log.Println("Starting the application.") + log.Printf("Starting the application. Version: %s, CommitTime: %s\n", gitsha, committed) var optsInfo strings.Builder optsInfo.WriteString("Provided options: \n") optsInfo.WriteString(fmt.Sprintf("\tnamespace: %s\n", *namespace)) @@ -65,6 +71,7 @@ func main() { optsInfo.WriteString(fmt.Sprintf("\tdelete-pending-after: %s\n", *deletePendingAfter)) optsInfo.WriteString(fmt.Sprintf("\tdelete-orphaned-after: %s\n", *deleteOrphanedAfter)) optsInfo.WriteString(fmt.Sprintf("\tdelete-evicted-after: %s\n", *deleteEvictedAfter)) + optsInfo.WriteString(fmt.Sprintf("\tignore-owned-by-cronjobs: %v\n", *ignoreOwnedByCronjob)) optsInfo.WriteString(fmt.Sprintf("\n\tlegacy-mode: %v\n", *legacyMode)) optsInfo.WriteString(fmt.Sprintf("\tkeep-successful: %d\n", *legacyKeepSuccessHours)) @@ -121,6 +128,7 @@ func main() { *deletePendingAfter, *deleteOrphanedAfter, *deleteEvictedAfter, + *ignoreOwnedByCronjob, stopCh, ).Run() } diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index 7263bc25..d9917999 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -50,6 +50,8 @@ type Kleaner struct { deleteOrphanedAfter time.Duration deleteEvictedAfter time.Duration + ignoreOwnedByCronjob bool + dryRun bool ctx context.Context stopCh <-chan struct{} @@ -57,7 +59,8 @@ type Kleaner struct { // NewKleaner creates a new NewKleaner func NewKleaner(ctx context.Context, kclient *kubernetes.Clientset, namespace string, dryRun bool, deleteSuccessfulAfter, - deleteFailedAfter, deletePendingAfter, deleteOrphanedAfter, deleteEvictedAfter time.Duration, stopCh <-chan struct{}) *Kleaner { + deleteFailedAfter, deletePendingAfter, deleteOrphanedAfter, deleteEvictedAfter time.Duration, ignoreOwnedByCronjob bool, + stopCh <-chan struct{}) *Kleaner { jobInformer := cache.NewSharedIndexInformer( &cache.ListWatch{ ListFunc: func(options metav1.ListOptions) (runtime.Object, error) { @@ -95,6 +98,7 @@ func NewKleaner(ctx context.Context, kclient *kubernetes.Clientset, namespace st deletePendingAfter: deletePendingAfter, deleteOrphanedAfter: deleteOrphanedAfter, deleteEvictedAfter: deleteEvictedAfter, + ignoreOwnedByCronjob: ignoreOwnedByCronjob, } jobInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ UpdateFunc: func(old, new interface{}) { @@ -154,94 +158,32 @@ func (c *Kleaner) Process(obj interface{}) { if !t.DeletionTimestamp.IsZero() { return } - job := t - // skip the job if it has any active pods - if job.Status.Active > 0 { - return - } - - owners := getJobOwnerKinds(job) - if isOwnedByCronJob(owners) { - return - } - - finishTime := jobFinishTime(job) - - if finishTime.IsZero() { - return - } - - timeSinceFinish := time.Since(finishTime) - - if job.Status.Succeeded > 0 { - if c.deleteSuccessfulAfter > 0 && timeSinceFinish > c.deleteSuccessfulAfter { - c.deleteJobs(job) - } - } - if job.Status.Failed > 0 { - if c.deleteFailedAfter > 0 && timeSinceFinish >= c.deleteFailedAfter { - c.deleteJobs(job) - } + if shouldDeleteJob(t, c.deleteSuccessfulAfter, c.deleteFailedAfter, c.ignoreOwnedByCronjob) { + c.DeleteJob(t) } - case *corev1.Pod: + pod := t // skip pods that are already in the deleting process - if !t.DeletionTimestamp.IsZero() { + if !pod.DeletionTimestamp.IsZero() { return } - pod := t - owners := getPodOwnerKinds(pod) - podFinishTime := podFinishTime(pod) - if !podFinishTime.IsZero() { - age := time.Since(podFinishTime) - // orphaned pod: those that do not have any owner references - // - uses c.deleteOrphanedAfter - if len(owners) == 0 { - if c.deleteOrphanedAfter > 0 && age >= c.deleteOrphanedAfter { - c.deletePod(pod) - } - } - // owned by job, have exactly one ownerReference present and its kind is Job - // - uses the c.deleteSuccessfulAfter, c.deleteFailedAfter, c.deletePendingAfter - if isOwnedByJob(owners) { - jobOwnerName := pod.OwnerReferences[0].Name - jobOwner, exists, err := c.jobInformer.GetStore().GetByKey(pod.Namespace + "/" + jobOwnerName) - if err != nil { - log.Printf("Can't find job '%s:%s`", pod.Namespace, jobOwnerName) - - } else if exists && isOwnedByCronJob(getJobOwnerKinds(jobOwner.(*batchv1.Job))) { - return - } - toDelete := c.maybeDeletePod(pod.Status.Phase, age) - if toDelete { - c.deletePod(pod) - } - return - } - } - // evicted pods, those with or without owner references, but in Evicted state - // - uses c.deleteEvictedAfter - if pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "Evicted" && c.deleteEvictedAfter > 0 { - c.deletePod(pod) + // skip pods related to jobs created by cronjobs if `ignoreOwnedByCronjob` is set + if c.ignoreOwnedByCronjob && podRelatedToCronJob(pod, c.jobInformer.GetStore()) { + return } - if pod.Status.Phase == corev1.PodPending && c.deletePendingAfter > 0 { - t := podLastTransitionTime(pod) - if t.IsZero() { - return - } - if time.Now().Sub(t) >= c.deleteEvictedAfter { - c.deletePod(pod) - } + // normal cleanup flow + if shouldDeletePod(t, c.deleteOrphanedAfter, c.deletePendingAfter, c.deleteEvictedAfter, c.deleteSuccessfulAfter, c.deleteFailedAfter) { + c.DeletePod(t) } } } -func (c *Kleaner) deleteJobs(job *batchv1.Job) { +func (c *Kleaner) DeleteJob(job *batchv1.Job) { if c.dryRun { log.Printf("dry-run: Job '%s:%s' would have been deleted", job.Namespace, job.Name) return } - log.Printf("Deleting job '%s:%s'", job.Namespace, job.Name) + log.Printf("Deleting job '%s/%s'", job.Namespace, job.Name) propagation := metav1.DeletePropagationForeground jo := metav1.DeleteOptions{PropagationPolicy: &propagation} if err := c.kclient.BatchV1().Jobs(job.Namespace).Delete(c.ctx, job.Name, jo); ignoreNotFound(err) != nil { @@ -252,12 +194,12 @@ func (c *Kleaner) deleteJobs(job *batchv1.Job) { metrics.GetOrCreateCounter(metricName(jobDeletedMetric, job.Namespace)).Inc() } -func (c *Kleaner) deletePod(pod *corev1.Pod) { +func (c *Kleaner) DeletePod(pod *corev1.Pod) { if c.dryRun { log.Printf("dry-run: Pod '%s:%s' would have been deleted", pod.Namespace, pod.Name) return } - log.Printf("Deleting pod '%s:%s'", pod.Namespace, pod.Name) + log.Printf("Deleting pod '%s/%s'", pod.Namespace, pod.Name) var po metav1.DeleteOptions if err := c.kclient.CoreV1().Pods(pod.Namespace).Delete(c.ctx, pod.Name, po); ignoreNotFound(err) != nil { log.Printf("failed to delete pod '%s:%s': %v", pod.Namespace, pod.Name, err) @@ -266,88 +208,3 @@ func (c *Kleaner) deletePod(pod *corev1.Pod) { } metrics.GetOrCreateCounter(metricName(podDeletedMetric, pod.Namespace)).Inc() } - -func (c *Kleaner) maybeDeletePod(podPhase corev1.PodPhase, timeSinceFinish time.Duration) bool { - switch podPhase { - case corev1.PodSucceeded: - if c.deleteSuccessfulAfter > 0 && timeSinceFinish >= c.deleteSuccessfulAfter { - return true - } - case corev1.PodFailed: - if c.deleteFailedAfter > 0 && timeSinceFinish >= c.deleteFailedAfter { - return true - } - default: - return false - } - return false -} - -func getPodOwnerKinds(pod *corev1.Pod) []string { - var kinds []string - for _, ow := range pod.OwnerReferences { - kinds = append(kinds, ow.Kind) - } - return kinds -} - -func getJobOwnerKinds(job *batchv1.Job) []string { - var kinds []string - for _, ow := range job.OwnerReferences { - kinds = append(kinds, ow.Kind) - } - return kinds -} - -// isOwnedByJob returns true if and only if pod has a single owner -// and this owners kind is Job -func isOwnedByJob(ownerKinds []string) bool { - if len(ownerKinds) == 1 && ownerKinds[0] == "Job" { - return true - } - return false -} - -// isOwnedByCronJob returns true if and only if job has a single owner CronJob -// and this owners kind is CronJob -func isOwnedByCronJob(ownerKinds []string) bool { - if len(ownerKinds) == 1 && ownerKinds[0] == "CronJob" { - return true - } - return false -} - -func podLastTransitionTime(podObj *corev1.Pod) time.Time { - for _, pc := range podObj.Status.Conditions { - if pc.Type == corev1.PodScheduled && pc.Status == corev1.ConditionFalse { - return pc.LastTransitionTime.Time - } - } - return time.Time{} -} - -func podFinishTime(podObj *corev1.Pod) time.Time { - for _, pc := range podObj.Status.Conditions { - // Looking for the time when pod's condition "Ready" became "false" (equals end of execution) - if pc.Type == corev1.PodReady && pc.Status == corev1.ConditionFalse { - return pc.LastTransitionTime.Time - } - } - return time.Time{} -} - -// Can return "zero" time, caller must check -func jobFinishTime(jobObj *batchv1.Job) time.Time { - if !jobObj.Status.CompletionTime.IsZero() { - return jobObj.Status.CompletionTime.Time - } - - for _, jc := range jobObj.Status.Conditions { - // Looking for the time when job's condition "Failed" became "true" (equals end of execution) - if jc.Type == batchv1.JobFailed && jc.Status == corev1.ConditionTrue { - return jc.LastTransitionTime.Time - } - } - - return time.Time{} -} diff --git a/pkg/controller/job.go b/pkg/controller/job.go new file mode 100644 index 00000000..062a60fb --- /dev/null +++ b/pkg/controller/job.go @@ -0,0 +1,75 @@ +package controller + +import ( + "time" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" +) + +func shouldDeleteJob(job *batchv1.Job, deleteSuccessfulAfter, deleteFailedAfter time.Duration, ignoreCronJobs bool) bool { + // skip the job if it has any active pods + if job.Status.Active > 0 { + return false + } + + if ignoreCronJobs { + owners := getJobOwnerKinds(job) + if isOwnedByCronJob(owners) { + return false + } + } + + finishTime := jobFinishTime(job) + + if finishTime.IsZero() { + return false + } + + timeSinceFinish := time.Since(finishTime) + + if job.Status.Succeeded > 0 { + if deleteSuccessfulAfter > 0 && timeSinceFinish > deleteSuccessfulAfter { + return true + } + } + if job.Status.Failed > 0 { + if deleteFailedAfter > 0 && timeSinceFinish >= deleteFailedAfter { + return true + } + } + return false +} + +func getJobOwnerKinds(job *batchv1.Job) []string { + var kinds []string + for _, ow := range job.OwnerReferences { + kinds = append(kinds, ow.Kind) + } + return kinds +} + +// Can return "zero" time, caller must check +func jobFinishTime(jobObj *batchv1.Job) time.Time { + if !jobObj.Status.CompletionTime.IsZero() { + return jobObj.Status.CompletionTime.Time + } + + for _, jc := range jobObj.Status.Conditions { + // Looking for the time when job's condition "Failed" became "true" (equals end of execution) + if jc.Type == batchv1.JobFailed && jc.Status == corev1.ConditionTrue { + return jc.LastTransitionTime.Time + } + } + + return time.Time{} +} + +// isOwnedByCronJob returns true if and only if job has a single owner CronJob +// and this owners kind is CronJob +func isOwnedByCronJob(ownerKinds []string) bool { + if len(ownerKinds) == 1 && ownerKinds[0] == "CronJob" { + return true + } + return false +} diff --git a/pkg/controller/job_test.go b/pkg/controller/job_test.go new file mode 100644 index 00000000..ec604e1c --- /dev/null +++ b/pkg/controller/job_test.go @@ -0,0 +1,99 @@ +package controller + +import ( + "testing" + "time" + + batchv1 "k8s.io/api/batch/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func createJob(ownedByCronJob bool, completed time.Time, active, succeeded, failed int32) *batchv1.Job { + ts := metav1.NewTime(completed) + job := batchv1.Job{ + Spec: batchv1.JobSpec{}, + Status: batchv1.JobStatus{ + CompletionTime: &ts, + Active: active, + Succeeded: succeeded, + Failed: failed, + }, + } + if ownedByCronJob { + job.ObjectMeta = metav1.ObjectMeta{ + OwnerReferences: []metav1.OwnerReference{ + {Kind: "CronJob"}, + }, + } + } + return &job +} + +func TestKleaner_DeleteJob(t *testing.T) { + ts := time.Now() + testCases := map[string]struct { + jobSpec *batchv1.Job + successful time.Duration + failed time.Duration + ignoreCron bool + expected bool + }{ + "jobs owned by cronjobs should be ignored": { + jobSpec: createJob(true, ts.Add(-time.Minute), 0, 0, 0), + successful: time.Second, + failed: time.Second, + ignoreCron: true, + expected: false, + }, + "jobs owned by cronjobs should be deleted": { + jobSpec: createJob(true, ts.Add(-time.Minute), 0, 0, 0), + successful: time.Second, + failed: time.Second, + ignoreCron: false, + expected: false, + }, + "jobs with active pods should not be deleted": { + jobSpec: createJob(false, ts.Add(-time.Minute), 1, 0, 0), // job.Status.Active > 0 + successful: time.Second, + failed: time.Second, + ignoreCron: false, + expected: false, + }, + "expired successful jobs should be deleted": { + jobSpec: createJob(false, ts.Add(-time.Minute), 0, 1, 0), + successful: time.Second, + failed: time.Second, + ignoreCron: false, + expected: true, + }, + "non-expired successful jobs should not be deleted": { + jobSpec: createJob(false, ts.Add(-time.Minute), 0, 1, 0), + successful: time.Minute * 2, + failed: time.Second, + ignoreCron: false, + expected: false, + }, + "expired failed jobs should be deleted": { + jobSpec: createJob(false, ts.Add(-time.Minute), 0, 0, 1), + successful: time.Second, + failed: time.Second, + ignoreCron: false, + expected: true, + }, + "non-expired failed jobs should not be deleted": { + jobSpec: createJob(false, ts.Add(-time.Minute), 0, 0, 1), + successful: time.Second, + failed: time.Minute * 2, + ignoreCron: false, + expected: false, + }, + } + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + result := shouldDeleteJob(tc.jobSpec, tc.successful, tc.failed, tc.ignoreCron) + if result != tc.expected { + t.Fatalf("failed, expected %v, got %v", tc.expected, result) + } + }) + } +} diff --git a/pkg/controller/pod.go b/pkg/controller/pod.go new file mode 100644 index 00000000..f990f21e --- /dev/null +++ b/pkg/controller/pod.go @@ -0,0 +1,109 @@ +package controller + +import ( + "log" + "time" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/client-go/tools/cache" +) + +func podRelatedToCronJob(pod *corev1.Pod, jobStore cache.Store) bool { + isOwnedByJob := isOwnedByJob(getPodOwnerKinds(pod)) + if !isOwnedByJob { + return false + } + jobOwnerName := pod.OwnerReferences[0].Name + jobOwner, exists, err := jobStore.GetByKey(pod.Namespace + "/" + jobOwnerName) + if err != nil { + log.Printf("Can't find job '%s:%s`", pod.Namespace, jobOwnerName) + } else if exists && isOwnedByCronJob(getJobOwnerKinds(jobOwner.(*batchv1.Job))) { + return true + } + return false +} + +func shouldDeletePod(pod *corev1.Pod, orphaned, pending, evicted, successful, failed time.Duration) bool { + // evicted pods, those with or without owner references, but in Evicted state + // - uses c.deleteEvictedAfter, this one is tricky, because there is no timestamp of eviction. + // So, basically it will be removed as soon as discovered + if pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "Evicted" && evicted > 0 { + return true + } + owners := getPodOwnerKinds(pod) + podFinishTime := podFinishTime(pod) + if !podFinishTime.IsZero() { + age := time.Since(podFinishTime) + // orphaned pod: those that do not have any owner references + // - uses c.deleteOrphanedAfter + if len(owners) == 0 { + if orphaned > 0 && age >= orphaned { + return true + } + } + // owned by job, have exactly one ownerReference present and its kind is Job + // - uses the c.deleteSuccessfulAfter, c.deleteFailedAfter, c.deletePendingAfter + if isOwnedByJob(owners) { + switch pod.Status.Phase { + case corev1.PodSucceeded: + if successful > 0 && age >= successful { + return true + } + case corev1.PodFailed: + if failed > 0 && age >= failed { + return true + } + default: + return false + } + return false + } + } + if pod.Status.Phase == corev1.PodPending && pending > 0 { + t := podLastTransitionTime(pod) + if t.IsZero() { + return false + } + if time.Now().Sub(t) >= pending { + return true + } + } + return false +} + +func getPodOwnerKinds(pod *corev1.Pod) []string { + var kinds []string + for _, ow := range pod.OwnerReferences { + kinds = append(kinds, ow.Kind) + } + return kinds +} + +// isOwnedByJob returns true if and only if pod has a single owner +// and this owners kind is Job +func isOwnedByJob(ownerKinds []string) bool { + if len(ownerKinds) == 1 && ownerKinds[0] == "Job" { + return true + } + return false +} + +func podLastTransitionTime(podObj *corev1.Pod) time.Time { + for _, pc := range podObj.Status.Conditions { + if pc.Type == corev1.PodScheduled && pc.Status == corev1.ConditionFalse { + return pc.LastTransitionTime.Time + } + } + return time.Time{} +} + +func podFinishTime(podObj *corev1.Pod) time.Time { + for _, pc := range podObj.Status.Conditions { + // Looking for the time when pod's condition "Ready" became "false" (equals end of execution) + if pc.Type == corev1.PodReady && pc.Status == corev1.ConditionFalse { + return pc.LastTransitionTime.Time + } + } + return time.Time{} +} diff --git a/pkg/controller/pod_test.go b/pkg/controller/pod_test.go new file mode 100644 index 00000000..6189813c --- /dev/null +++ b/pkg/controller/pod_test.go @@ -0,0 +1,164 @@ +package controller + +import ( + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestKleaner_DeletePod(t *testing.T) { + ts := time.Now() + testCases := map[string]struct { + podSpec *corev1.Pod + orphaned time.Duration + pending time.Duration + evicted time.Duration + successful time.Duration + failed time.Duration + expected bool + }{ + "expired orphaned pods should be deleted": { + podSpec: &corev1.Pod{ + Status: corev1.PodStatus{ + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionFalse, + LastTransitionTime: metav1.NewTime(ts.Add(-time.Minute * 2)), + }, + }, + }, + }, + orphaned: time.Minute, + pending: 0, + evicted: 0, + successful: 0, + failed: 0, + expected: true, + }, + "non expired orphaned pods should not be deleted": { + podSpec: &corev1.Pod{ + Status: corev1.PodStatus{ + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionFalse, + LastTransitionTime: metav1.NewTime(ts.Add(-time.Minute)), + }, + }, + }, + }, + orphaned: time.Minute * 5, + pending: 0, + evicted: 0, + successful: 0, + failed: 0, + expected: false, + }, + "expired, PodSucceeded owned by Job should be deleted": { + podSpec: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + OwnerReferences: []metav1.OwnerReference{ + { + Kind: "Job", + }, + }, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodSucceeded, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionFalse, + LastTransitionTime: metav1.NewTime(ts.Add(-time.Minute * 2)), + }, + }, + }, + }, + orphaned: 0, + pending: 0, + evicted: 0, + successful: time.Minute, + failed: 0, + expected: true, + }, + "expired, PodFailed owned by Job should be deleted": { + podSpec: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + OwnerReferences: []metav1.OwnerReference{ + { + Kind: "Job", + }, + }, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodFailed, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionFalse, + LastTransitionTime: metav1.NewTime(ts.Add(-time.Minute * 2)), + }, + }, + }, + }, + orphaned: 0, + pending: 0, + evicted: 0, + successful: 0, + failed: time.Minute, + expected: true, + }, + "evicted pods should be deleted": { + podSpec: &corev1.Pod{ + Status: corev1.PodStatus{ + Phase: corev1.PodFailed, + Reason: "Evicted", + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionFalse, + LastTransitionTime: metav1.NewTime(ts.Add(-time.Minute * 2)), + }, + }, + }, + }, + orphaned: 0, + pending: 0, + evicted: time.Hour, + successful: 0, + failed: 0, + expected: true, + }, + "expired pending pods should be deleted": { + podSpec: &corev1.Pod{ + Status: corev1.PodStatus{ + Phase: corev1.PodPending, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodScheduled, + Status: corev1.ConditionFalse, + LastTransitionTime: metav1.NewTime(ts.Add(-time.Minute * 2)), + }, + }, + }, + }, + orphaned: 0, + pending: time.Minute, + evicted: 0, + successful: 0, + failed: 0, + expected: true, + }, + } + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + result := shouldDeletePod(tc.podSpec, tc.orphaned, tc.pending, tc.evicted, tc.successful, tc.failed) + if result != tc.expected { + t.Fatalf("failed, expected %v, got %v", tc.expected, result) + } + }) + } +} From 3302bc11084f282891d49072f14b9e84de1f1f99 Mon Sep 17 00:00:00 2001 From: Sergey Nuzhdin Date: Sat, 4 Jul 2020 12:28:31 +0100 Subject: [PATCH 3/3] Add test stage to .travis.yaml --- .travis.yml | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index ec3f5089..bffbfdd9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,13 +6,19 @@ go: services: - docker -before_deploy: -- echo "$QUAY_PASSWORD" | docker login -u "$QUAY_LOGIN" quay.io --password-stdin - -deploy: -- provider: script - skip_cleanup: true - script: curl -sL http://git.io/goreleaser | bash - on: - tags: true - condition: $TRAVIS_OS_NAME = linux +jobs: + include: + - stage: Test + script: make test + after_success: + - bash <(curl -s https://codecov.io/bash) + - stage: Release + before_script: + - echo "$QUAY_PASSWORD" | docker login -u "$QUAY_LOGIN" quay.io --password-stdin + deploy: + - provider: script + skip_cleanup: true + script: curl -sL http://git.io/goreleaser | bash + on: + tags: true + condition: $TRAVIS_OS_NAME = linux