@@ -23,10 +23,9 @@ import (
2323 "sync"
2424 "time"
2525
26- "github.com/ai-dynamo/grove/operator/e2e_testing /utils"
26+ "github.com/ai-dynamo/grove/operator/e2e /utils"
2727 "github.com/docker/docker/api/types/image"
28- "github.com/docker/docker/client"
29- "github.com/sirupsen/logrus"
28+ dockerclient "github.com/docker/docker/client"
3029 v1 "k8s.io/api/core/v1"
3130 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3231 "k8s.io/apimachinery/pkg/runtime/schema"
@@ -35,17 +34,23 @@ import (
3534 "k8s.io/client-go/rest"
3635)
3736
37+ const (
38+ // relativeSkaffoldYAMLPath is the path to the skaffold.yaml file relative to the e2e/tests directory
39+ relativeSkaffoldYAMLPath = "../../skaffold.yaml" // handed to SetupCompleteK3DCluster by Setup
40+ // defaulPollInterval is passed to utils.PollForCondition right after the timeout argument (presumably the polling interval - confirm against utils).
41+ defaulPollInterval = 1 * time .Second // NOTE(review): identifier is misspelled ("defaul"); CleanupWorkloads also passes it as the timeout of waitForAllResourcesAndPodsDeleted
42+ )
43+
3844// SharedClusterManager manages a shared (singleton) k3d cluster for E2E tests
3945type SharedClusterManager struct {
40- clientset * kubernetes.Clientset
41- restConfig * rest.Config
42- dynamicClient dynamic.Interface
43- cleanup func ()
44- logger * logrus.Logger
45- isSetup bool
46- workerNodes []string
47- registryPort string
48- relativeSkaffoldYAMLPath string
46+ clientset * kubernetes.Clientset // typed client; used to list pods in the "default" namespace while waiting for cleanup
47+ restConfig * rest.Config // presumably the config returned by SetupCompleteK3DCluster - assignment not visible here
48+ dynamicClient dynamic.Interface // dynamic client; used to list Grove custom resources by GroupVersionResource
49+ cleanup func () // presumably the teardown func returned by SetupCompleteK3DCluster - TODO confirm
50+ logger * utils.Logger // e2e utils logger (replaces *logrus.Logger); receives Info/Warnf/Debugf messages
51+ isSetup bool // presumably set once Setup has succeeded - verify against Setup
52+ workerNodes []string // presumably the worker node names of the cluster - TODO confirm
53+ registryPort string // presumably the port of the local image registry - TODO confirm
4954}
5055
5156var (
@@ -54,11 +59,10 @@ var (
5459)
5560
5661// SharedCluster returns the singleton shared cluster manager
57- func SharedCluster (logger * logrus .Logger , skaffoldYAMLPath string ) * SharedClusterManager {
62+ func SharedCluster (logger * utils .Logger ) * SharedClusterManager {
5863 once .Do (func () {
5964 sharedCluster = & SharedClusterManager {
60- logger : logger ,
61- relativeSkaffoldYAMLPath : skaffoldYAMLPath ,
65+ logger : logger ,
6266 }
6367 })
6468 return sharedCluster
@@ -110,7 +114,7 @@ func (scm *SharedClusterManager) Setup(ctx context.Context, testImages []string)
110114
111115 scm .logger .Info ("🚀 Setting up shared k3d cluster for all e2e tests..." )
112116
113- restConfig , cleanup , err := SetupCompleteK3DCluster (ctx , customCfg , scm . relativeSkaffoldYAMLPath , scm .logger )
117+ restConfig , cleanup , err := SetupCompleteK3DCluster (ctx , customCfg , relativeSkaffoldYAMLPath , scm .logger )
114118 if err != nil {
115119 return fmt .Errorf ("failed to setup shared k3d cluster: %w" , err )
116120 }
@@ -198,8 +202,8 @@ func (scm *SharedClusterManager) CleanupWorkloads(ctx context.Context) error {
198202 scm .logger .Warnf ("failed to delete PodCliqueSets: %v" , err )
199203 }
200204
201- // Step 2: Poll for all resources and pods to be cleaned up (max 15 seconds)
202- if err := scm .waitForAllResourcesAndPodsDeleted (ctx , 15 * time . Second ); err != nil {
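205+ // Step 2: Poll for all resources and pods to be cleaned up. NOTE(review): the timeout argument below is defaulPollInterval (1s), whereas the removed call allowed 15s - confirm this shorter timeout is intended.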
206+ if err := scm .waitForAllResourcesAndPodsDeleted (ctx , defaulPollInterval ); err != nil {
203207 scm .logger .Warnf ("timeout waiting for resources and pods to be deleted: %v" , err )
204208 // List remaining resources and pods for debugging
205209 scm .listRemainingResources (ctx )
@@ -292,9 +296,6 @@ func (scm *SharedClusterManager) resetNodeStates(ctx context.Context) error {
292296
293297// waitForAllResourcesAndPodsDeleted waits for all Grove resources and pods to be deleted. It delegates the waiting to utils.PollForCondition and returns whatever that call returns.
294298func (scm * SharedClusterManager ) waitForAllResourcesAndPodsDeleted (ctx context.Context , timeout time.Duration ) error { // timeout is forwarded as-is to utils.PollForCondition
295- timeoutCtx , cancel := context .WithTimeout (ctx , timeout )
296- defer cancel ()
297-
298299 // Define all resource types to check
299300 resourceTypes := []struct {
300301 group string
@@ -308,59 +309,53 @@ func (scm *SharedClusterManager) waitForAllResourcesAndPodsDeleted(ctx context.C
308309 {"scheduler.grove.io" , "v1alpha1" , "podgangs" , "PodGangs" },
309310 }
310311
311- ticker := time .NewTicker (1 * time .Second )
312- defer ticker .Stop ()
313-
314- for {
315- select {
316- case <- timeoutCtx .Done ():
317- return fmt .Errorf ("timeout waiting for resources and pods to be deleted" )
318- case <- ticker .C :
319- allResourcesDeleted := true
320- totalResources := 0
321-
322- // Check Grove resources
323- for _ , rt := range resourceTypes {
324- gvr := schema.GroupVersionResource {
325- Group : rt .group ,
326- Version : rt .version ,
327- Resource : rt .resource ,
328- }
312+ return utils .PollForCondition (ctx , timeout , defaulPollInterval , func () (bool , error ) { // the condition reports true once nothing is left; it never returns a non-nil error
313+ allResourcesDeleted := true
314+ totalResources := 0
329315
330- resourceList , err := scm .dynamicClient .Resource (gvr ).List (ctx , metav1.ListOptions {})
331- if err != nil {
332- // If we can't list the resource type, assume it doesn't exist or is being deleted
333- continue
334- }
335-
336- if len (resourceList .Items ) > 0 {
337- allResourcesDeleted = false
338- totalResources += len (resourceList .Items )
339- }
316+ // Check Grove resources
317+ for _ , rt := range resourceTypes {
318+ gvr := schema.GroupVersionResource {
319+ Group : rt .group ,
320+ Version : rt .version ,
321+ Resource : rt .resource ,
340322 }
341323
342- // Check pods
343- allPodsDeleted := true
344- nonSystemPods := 0
345- pods , err := scm .clientset .CoreV1 ().Pods ("default" ).List (ctx , metav1.ListOptions {})
346- if err == nil {
347- for _ , pod := range pods .Items {
348- if ! isSystemPod (& pod ) {
349- allPodsDeleted = false
350- nonSystemPods ++
351- }
352- }
324+ resourceList , err := scm .dynamicClient .Resource (gvr ).List (ctx , metav1.ListOptions {})
325+ if err != nil {
326+ // If we can't list the resource type, assume it doesn't exist or is being deleted
327+ continue // the list error is dropped, so an unlistable type counts as already deleted
353328 }
354329
355- if allResourcesDeleted && allPodsDeleted {
356- return nil
330+ if len (resourceList .Items ) > 0 {
331+ allResourcesDeleted = false
332+ totalResources += len (resourceList .Items ) // tallied only for the progress log below
357333 }
334+ }
358335
359- if totalResources > 0 || nonSystemPods > 0 {
360- scm .logger .Debugf ("⏳ Waiting for %d Grove resources and %d pods to be deleted..." , totalResources , nonSystemPods )
336+ // Check pods
337+ allPodsDeleted := true
338+ nonSystemPods := 0
339+ pods , err := scm .clientset .CoreV1 ().Pods ("default" ).List (ctx , metav1.ListOptions {}) // only the "default" namespace is inspected
340+ if err == nil { // a pod list error is ignored; allPodsDeleted then stays true
341+ for _ , pod := range pods .Items {
342+ if ! isSystemPod (& pod ) {
343+ allPodsDeleted = false
344+ nonSystemPods ++
345+ }
361346 }
362347 }
363- }
348+
349+ if allResourcesDeleted && allPodsDeleted {
350+ return true , nil // condition met: no Grove resources and no non-system pods were seen
351+ }
352+
353+ if totalResources > 0 || nonSystemPods > 0 {
354+ scm .logger .Debugf ("⏳ Waiting for %d Grove resources and %d pods to be deleted..." , totalResources , nonSystemPods )
355+ }
356+
357+ return false , nil // condition not met yet
358+ })
364359}
365360
366361// listRemainingResources lists remaining Grove resources for debugging
@@ -436,7 +431,7 @@ func setupRegistryTestImages(registryPort string, images []string) error {
436431 ctx := context .Background ()
437432
438433 // Initialize Docker client
439- cli , err := client .NewClientWithOpts (client .FromEnv , client .WithAPIVersionNegotiation ())
434+ cli , err := dockerclient .NewClientWithOpts (dockerclient .FromEnv , dockerclient .WithAPIVersionNegotiation ())
440435 if err != nil {
441436 return fmt .Errorf ("failed to create Docker client: %w" , err )
442437 }
0 commit comments