@@ -43,7 +43,7 @@ type SharedClusterManager struct {
     cleanup                  func()
     logger                   *logrus.Logger
     isSetup                  bool
-    agentNodes               []string
+    workerNodes              []string
     registryPort             string
     relativeSkaffoldYAMLPath string
 }
@@ -70,11 +70,11 @@ func (scm *SharedClusterManager) Setup(ctx context.Context, testImages []string)
         return nil
     }

-    // Configuration for maximum cluster size needed (28 agents + 3 servers)
+    // Configuration for maximum cluster size needed (28 worker nodes + 3 server nodes)
     customCfg := ClusterConfig{
         Name:         "shared-e2e-test-cluster",
-        Servers:      3,
-        Agents:       28, // Maximum needed across all tests
+        ServerNodes:  3,
+        WorkerNodes:  28, // Maximum needed across all tests
         WorkerMemory: "150m", // 150m memory per agent node to fit one workload pod
         Image:        "rancher/k3s:v1.33.5-k3s1",
         HostPort:     "6560", // Use a different port to avoid conflicts
@@ -83,20 +83,23 @@ func (scm *SharedClusterManager) Setup(ctx context.Context, testImages []string)
         RegistryPort: "5001",
         NodeLabels: []NodeLabel{
             {
-                Key:         "node_role.e2e.grove.nvidia.com",
+                Key: "node_role.e2e.grove.nvidia.com",
+                // k3s refers to worker nodes as agent nodes
                 Value:       "agent",
                 NodeFilters: []string{"agent:*"},
             },
             // we currently don't want GPUs in e2e tests as validator is causing issues
             {
-                Key:         "nvidia.com/gpu.deploy.operands",
-                Value:       "false",
+                Key:   "nvidia.com/gpu.deploy.operands",
+                Value: "false",
+                // k3s refers to worker nodes as agent nodes
                 NodeFilters: []string{"server:*", "agent:*"},
             },
         },
-        AgentNodeTaints: []NodeTaint{
+        WorkerNodeTaints: []NodeTaint{
             {
-                Key:    "node_role.e2e.grove.nvidia.com",
+                Key: "node_role.e2e.grove.nvidia.com",
+                // k3s refers to worker nodes as agent nodes
                 Value:  "agent",
                 Effect: "NoSchedule",
             },
@@ -137,41 +140,41 @@ func (scm *SharedClusterManager) Setup(ctx context.Context, testImages []string)
         return fmt.Errorf("failed to setup registry test images: %w", err)
     }

-    // Get list of agent nodes for cordoning management
+    // Get list of worker nodes for cordoning management
     nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
     if err != nil {
         cleanup()
         return fmt.Errorf("failed to list nodes: %w", err)
     }

-    scm.agentNodes = make([]string, 0)
+    scm.workerNodes = make([]string, 0)
     for _, node := range nodes.Items {
         if _, isServer := node.Labels["node-role.kubernetes.io/control-plane"]; !isServer {
-            scm.agentNodes = append(scm.agentNodes, node.Name)
+            scm.workerNodes = append(scm.workerNodes, node.Name)
         }
     }

-    scm.logger.Infof("✅ Shared cluster setup complete with %d agent nodes", len(scm.agentNodes))
+    scm.logger.Infof("✅ Shared cluster setup complete with %d worker nodes", len(scm.workerNodes))
     scm.isSetup = true
     return nil
 }

 // PrepareForTest prepares the cluster for a specific test by cordoning the appropriate nodes
-func (scm *SharedClusterManager) PrepareForTest(ctx context.Context, requiredAgents int) error {
+func (scm *SharedClusterManager) PrepareForTest(ctx context.Context, requiredWorkerNodes int) error {
     if !scm.isSetup {
         return fmt.Errorf("shared cluster not setup")
     }

     // First, uncordon all nodes to reset state
-    for _, nodeName := range scm.agentNodes {
+    for _, nodeName := range scm.workerNodes {
         if err := utils.CordonNode(ctx, scm.clientset, nodeName, false); err != nil {
             return fmt.Errorf("failed to uncordon node %s: %w", nodeName, err)
         }
     }

     // Cordon nodes that are not needed for this test
-    if requiredAgents < len(scm.agentNodes) {
-        nodesToCordon := scm.agentNodes[requiredAgents:]
+    if requiredWorkerNodes < len(scm.workerNodes) {
+        nodesToCordon := scm.workerNodes[requiredWorkerNodes:]
         for _, nodeName := range nodesToCordon {
             if err := utils.CordonNode(ctx, scm.clientset, nodeName, true); err != nil {
                 return fmt.Errorf("failed to cordon node %s: %w", nodeName, err)
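
The cordon and uncordon calls above are delegated to utils.CordonNode, whose implementation is not part of this diff. As a rough sketch of the underlying mechanism (an assumption, not the repository's actual helper), cordoning with client-go comes down to toggling Spec.Unschedulable on the Node object:

package utils

import (
    "context"
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
)

// CordonNode is a sketch of a cordon/uncordon helper built on client-go; the
// real utils.CordonNode in this repository may differ (for example by patching
// the node or retrying on update conflicts).
func CordonNode(ctx context.Context, clientset kubernetes.Interface, nodeName string, cordon bool) error {
    node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
    if err != nil {
        return fmt.Errorf("failed to get node %s: %w", nodeName, err)
    }
    if node.Spec.Unschedulable == cordon {
        return nil // already in the desired state
    }
    node.Spec.Unschedulable = cordon
    if _, err := clientset.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}); err != nil {
        return fmt.Errorf("failed to update node %s: %w", nodeName, err)
    }
    return nil
}
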
@@ -276,9 +279,9 @@ func (scm *SharedClusterManager) listRemainingPods(ctx context.Context, namespac
     }
 }

-// resetNodeStates uncordons all agent nodes to reset cluster state
+// resetNodeStates uncordons all worker nodes to reset cluster state
 func (scm *SharedClusterManager) resetNodeStates(ctx context.Context) error {
-    for _, nodeName := range scm.agentNodes {
+    for _, nodeName := range scm.workerNodes {
         if err := utils.CordonNode(ctx, scm.clientset, nodeName, false); err != nil {
             scm.logger.Warnf("failed to uncordon node %s: %v", nodeName, err)
             return fmt.Errorf("failed to uncordon node %s: %w", nodeName, err)
@@ -406,9 +409,9 @@ func (scm *SharedClusterManager) GetRegistryPort() string {
     return scm.registryPort
 }

-// GetAgentNodes returns the list of agent node names
-func (scm *SharedClusterManager) GetAgentNodes() []string {
-    return scm.agentNodes
+// GetWorkerNodes returns the list of worker node names
+func (scm *SharedClusterManager) GetWorkerNodes() []string {
+    return scm.workerNodes
 }

 // IsSetup returns whether the shared cluster is setup
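
Taken together, a test would typically call Setup once and then size the schedulable portion of the cluster per test with PrepareForTest. The flow below is a hypothetical illustration; the constructor, the registry endpoint format, and the test body are assumptions, not code from this change.

// runOnSharedCluster illustrates one plausible way a test could drive the
// shared cluster manager; names and the registry endpoint format are assumed.
func runOnSharedCluster(ctx context.Context, scm *SharedClusterManager, testImages []string) error {
    // Setup is idempotent: it returns early if the shared cluster already exists.
    if err := scm.Setup(ctx, testImages); err != nil {
        return fmt.Errorf("setting up shared cluster: %w", err)
    }
    // Keep 4 worker nodes schedulable for this test; the remaining ones are cordoned.
    if err := scm.PrepareForTest(ctx, 4); err != nil {
        return fmt.Errorf("preparing cluster for test: %w", err)
    }
    // Test workloads reference images pushed to the cluster-local registry.
    registry := fmt.Sprintf("localhost:%s", scm.GetRegistryPort())
    _ = registry
    // ... run the e2e scenario against the prepared cluster here ...
    return nil
}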