
Commit 5230fcc

rename agent to worker and add documentation
Signed-off-by: Geoff Flarity <[email protected]>
1 parent b3b8842 commit 5230fcc

File tree: 4 files changed, +67 -59 lines changed

operator/e2e_testing/setup/k8s_clusters.go

Lines changed: 22 additions & 17 deletions
@@ -58,21 +58,22 @@ type NodeTaint struct {
 
 // NodeLabel represents a Kubernetes node label with its target node filters
 type NodeLabel struct {
-    Key         string   // Label key
-    Value       string   // Label value
+    Key   string // Label key
+    Value string // Label value
+    // k3s refers to worker nodes as agent nodes
     NodeFilters []string // Node filters (e.g., "server:*", "agent:*", "server:0", "agent:1")
 }
 
 // ClusterConfig holds configuration for creating a k3d cluster
 type ClusterConfig struct {
     Name             string      // Name of the k3d cluster
-    Servers          int         // Number of server/non-worker nodes
-    Agents           int         // Number of agent/worker nodes
+    ServerNodes      int         // Number of server (non-worker) nodes
+    WorkerNodes      int         // Number of worker nodes (called agents in k3s terminology)
     Image            string      // Docker image to use for k3d cluster (e.g., "rancher/k3s:v1.28.8-k3s1")
     HostPort         string      // Port on host to expose Kubernetes API (e.g., "6550")
     LoadBalancerPort string      // Load balancer port mapping in format "hostPort:containerPort" (e.g., "8080:80")
     NodeLabels       []NodeLabel // Kubernetes labels to apply with specific node filters
-    AgentNodeTaints  []NodeTaint // Taints to apply to agent nodes
+    WorkerNodeTaints []NodeTaint // Taints to apply to worker nodes
     WorkerMemory     string      // Memory allocation for worker/agent nodes (e.g., "150m")
     EnableRegistry   bool        // Enable built-in Docker registry
     RegistryPort     string      // Port for the Docker registry (e.g., "5001")
@@ -82,8 +83,8 @@ type ClusterConfig struct {
 func DefaultClusterConfig() ClusterConfig {
     return ClusterConfig{
         Name:             "test-k3d-cluster",
-        Servers:          1,
-        Agents:           2,
+        ServerNodes:      1,
+        WorkerNodes:      2,
         Image:            "rancher/k3s:v1.28.8-k3s1",
         HostPort:         "6550",
         LoadBalancerPort: "8080:80",
@@ -126,8 +127,9 @@ func SetupCompleteK3DCluster(ctx context.Context, cfg ClusterConfig, skaffoldYAM
         {
             "key":      "node_role.e2e.grove.nvidia.com",
             "operator": "Equal",
-            "value":    "agent",
-            "effect":   "NoSchedule",
+            // k3s refers to worker nodes as agent nodes
+            "value":  "agent",
+            "effect": "NoSchedule",
         },
     }
 
@@ -227,9 +229,10 @@ func SetupK3DCluster(ctx context.Context, cfg ClusterConfig, logger *logrus.Logg
         ObjectMeta: types.ObjectMeta{
             Name: cfg.Name,
         },
-        Servers: cfg.Servers,
-        Agents:  cfg.Agents,
-        Image:   cfg.Image,
+        Servers: cfg.ServerNodes,
+        // k3s calls these agents, but we call them worker nodes
+        Agents: cfg.WorkerNodes,
+        Image:  cfg.Image,
         ExposeAPI: v1alpha5.SimpleExposureOpts{
             Host:     "0.0.0.0",
             HostPort: cfg.HostPort,
@@ -242,6 +245,7 @@ func SetupK3DCluster(ctx context.Context, cfg ClusterConfig, logger *logrus.Logg
         },
         Options: v1alpha5.SimpleConfigOptions{
             Runtime: v1alpha5.SimpleConfigOptionsRuntime{
+                // worker node memory
                 AgentsMemory: cfg.WorkerMemory,
             },
         },
@@ -258,12 +262,13 @@ func SetupK3DCluster(ctx context.Context, cfg ClusterConfig, logger *logrus.Logg
         )
     }
 
-    // Apply agent node taints if specified
-    for _, taint := range cfg.AgentNodeTaints {
+    // Apply worker node taints if specified
+    for _, taint := range cfg.WorkerNodeTaints {
         clusterConfig.Options.K3sOptions.ExtraArgs = append(
             clusterConfig.Options.K3sOptions.ExtraArgs,
             v1alpha5.K3sArgWithNodeFilters{
-                Arg: fmt.Sprintf("--node-taint=%s=%s:%s", taint.Key, taint.Value, taint.Effect),
+                Arg: fmt.Sprintf("--node-taint=%s=%s:%s", taint.Key, taint.Value, taint.Effect),
+                // k3s refers to worker nodes as agent nodes
                 NodeFilters: []string{"agent:*"},
             },
         )
@@ -297,8 +302,8 @@ func SetupK3DCluster(ctx context.Context, cfg ClusterConfig, logger *logrus.Logg
     }
 
     // Create cluster
-    logger.Debugf("🚀 Creating cluster '%s' with %d server(s) and %d agent(s)...",
-        k3dConfig.Name, cfg.Servers, cfg.Agents)
+    logger.Debugf("🚀 Creating cluster '%s' with %d server(s) and %d worker node(s)...",
+        k3dConfig.Name, cfg.ServerNodes, cfg.WorkerNodes)
 
     if err := client.ClusterRun(ctx, runtimes.Docker, k3dConfig); err != nil {
         return nil, cleanup, fmt.Errorf("failed to create cluster: %w", err)
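
For context, a minimal sketch of how the renamed configuration fields compose from a caller's side. The import path and override values are illustrative, not part of this commit, and since the hunk headers above truncate the SetupK3DCluster signature, the sketch assumes it takes (ctx, cfg, logger) and returns a result (discarded here), a cleanup func, and an error, matching the (nil, cleanup, err) returns visible in the diff:

package main

import (
    "context"

    "github.com/sirupsen/logrus"

    // Placeholder import path; substitute the module's real path to the setup package.
    setup "example.com/grove/operator/e2e_testing/setup"
)

func main() {
    ctx := context.Background()
    logger := logrus.New()

    // Start from the defaults (1 server node, 2 worker nodes) and override as needed.
    cfg := setup.DefaultClusterConfig()
    cfg.ServerNodes = 1
    cfg.WorkerNodes = 3 // k3d/k3s still calls these "agent" nodes internally
    cfg.WorkerNodeTaints = []setup.NodeTaint{
        {Key: "node_role.e2e.grove.nvidia.com", Value: "agent", Effect: "NoSchedule"},
    }

    // Assumed call shape; the first return value's type is not shown in this diff.
    _, cleanup, err := setup.SetupK3DCluster(ctx, cfg, logger)
    if err != nil {
        logger.Fatalf("failed to create k3d cluster: %v", err)
    }
    defer cleanup()
}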

operator/e2e_testing/setup/shared_cluster.go

Lines changed: 25 additions & 22 deletions
@@ -43,7 +43,7 @@ type SharedClusterManager struct {
     cleanup                  func()
     logger                   *logrus.Logger
     isSetup                  bool
-    agentNodes               []string
+    workerNodes              []string
     registryPort             string
     relativeSkaffoldYAMLPath string
 }
@@ -70,11 +70,11 @@ func (scm *SharedClusterManager) Setup(ctx context.Context, testImages []string)
         return nil
     }
 
-    // Configuration for maximum cluster size needed (28 agents + 3 servers)
+    // Configuration for maximum cluster size needed (28 worker nodes + 3 server nodes)
     customCfg := ClusterConfig{
         Name:             "shared-e2e-test-cluster",
-        Servers:          3,
-        Agents:           28,     // Maximum needed across all tests
+        ServerNodes:      3,
+        WorkerNodes:      28,     // Maximum needed across all tests
         WorkerMemory:     "150m", // 150m memory per agent node to fit one workload pod
         Image:            "rancher/k3s:v1.33.5-k3s1",
         HostPort:         "6560", // Use a different port to avoid conflicts
@@ -83,20 +83,23 @@ func (scm *SharedClusterManager) Setup(ctx context.Context, testImages []string)
         RegistryPort: "5001",
         NodeLabels: []NodeLabel{
             {
-                Key:         "node_role.e2e.grove.nvidia.com",
+                Key: "node_role.e2e.grove.nvidia.com",
+                // k3s refers to worker nodes as agent nodes
                 Value:       "agent",
                 NodeFilters: []string{"agent:*"},
             },
             // we currently don't want GPUs in e2e tests as validator is causing issues
             {
-                Key:         "nvidia.com/gpu.deploy.operands",
-                Value:       "false",
+                Key:   "nvidia.com/gpu.deploy.operands",
+                Value: "false",
+                // k3s refers to worker nodes as agent nodes
                 NodeFilters: []string{"server:*", "agent:*"},
             },
         },
-        AgentNodeTaints: []NodeTaint{
+        WorkerNodeTaints: []NodeTaint{
             {
-                Key:    "node_role.e2e.grove.nvidia.com",
+                Key: "node_role.e2e.grove.nvidia.com",
+                // k3s refers to worker nodes as agent nodes
                 Value:  "agent",
                 Effect: "NoSchedule",
             },
@@ -137,41 +140,41 @@ func (scm *SharedClusterManager) Setup(ctx context.Context, testImages []string)
         return fmt.Errorf("failed to setup registry test images: %w", err)
     }
 
-    // Get list of agent nodes for cordoning management
+    // Get list of worker nodes for cordoning management
     nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
     if err != nil {
         cleanup()
        return fmt.Errorf("failed to list nodes: %w", err)
     }
 
-    scm.agentNodes = make([]string, 0)
+    scm.workerNodes = make([]string, 0)
     for _, node := range nodes.Items {
         if _, isServer := node.Labels["node-role.kubernetes.io/control-plane"]; !isServer {
-            scm.agentNodes = append(scm.agentNodes, node.Name)
+            scm.workerNodes = append(scm.workerNodes, node.Name)
         }
     }
 
-    scm.logger.Infof("✅ Shared cluster setup complete with %d agent nodes", len(scm.agentNodes))
+    scm.logger.Infof("✅ Shared cluster setup complete with %d worker nodes", len(scm.workerNodes))
     scm.isSetup = true
     return nil
 }
 
 // PrepareForTest prepares the cluster for a specific test by cordoning the appropriate nodes
-func (scm *SharedClusterManager) PrepareForTest(ctx context.Context, requiredAgents int) error {
+func (scm *SharedClusterManager) PrepareForTest(ctx context.Context, requiredWorkerNodes int) error {
     if !scm.isSetup {
         return fmt.Errorf("shared cluster not setup")
     }
 
     // First, uncordon all nodes to reset state
-    for _, nodeName := range scm.agentNodes {
+    for _, nodeName := range scm.workerNodes {
         if err := utils.CordonNode(ctx, scm.clientset, nodeName, false); err != nil {
             return fmt.Errorf("failed to uncordon node %s: %w", nodeName, err)
         }
     }
 
     // Cordon nodes that are not needed for this test
-    if requiredAgents < len(scm.agentNodes) {
-        nodesToCordon := scm.agentNodes[requiredAgents:]
+    if requiredWorkerNodes < len(scm.workerNodes) {
+        nodesToCordon := scm.workerNodes[requiredWorkerNodes:]
         for _, nodeName := range nodesToCordon {
             if err := utils.CordonNode(ctx, scm.clientset, nodeName, true); err != nil {
                 return fmt.Errorf("failed to cordon node %s: %w", nodeName, err)
@@ -276,9 +279,9 @@ func (scm *SharedClusterManager) listRemainingPods(ctx context.Context, namespac
         }
     }
 
-// resetNodeStates uncordons all agent nodes to reset cluster state
+// resetNodeStates uncordons all worker nodes to reset cluster state
 func (scm *SharedClusterManager) resetNodeStates(ctx context.Context) error {
-    for _, nodeName := range scm.agentNodes {
+    for _, nodeName := range scm.workerNodes {
         if err := utils.CordonNode(ctx, scm.clientset, nodeName, false); err != nil {
             scm.logger.Warnf("failed to uncordon node %s: %v", nodeName, err)
             return fmt.Errorf("failed to uncordon node %s: %w", nodeName, err)
@@ -406,9 +409,9 @@ func (scm *SharedClusterManager) GetRegistryPort() string {
     return scm.registryPort
 }
 
-// GetAgentNodes returns the list of agent node names
-func (scm *SharedClusterManager) GetAgentNodes() []string {
-    return scm.agentNodes
+// GetWorkerNodes returns the list of worker node names
+func (scm *SharedClusterManager) GetWorkerNodes() []string {
+    return scm.workerNodes
 }
 
 // IsSetup returns whether the shared cluster is setup
// IsSetup returns whether the shared cluster is setup

operator/e2e_testing/tests/gang_scheduling_test.go

Lines changed: 12 additions & 12 deletions
@@ -41,20 +41,20 @@ func Test_GS1_GangSchedulingWithFullReplicas(t *testing.T) {
     clientset, restConfig, _, cleanup, _ := setupTestCluster(ctx, t, 10)
     defer cleanup()
 
-    // Get agent (worker) nodes for cordoning
-    agentNodes, err := getAgentNodes(ctx, clientset)
+    // Get worker nodes for cordoning
+    workerNodes, err := getWorkerNodes(ctx, clientset)
     if err != nil {
-        t.Fatalf("Failed to get agent nodes: %v", err)
+        t.Fatalf("Failed to get worker nodes: %v", err)
     }
 
-    if len(agentNodes) < 1 {
-        t.Fatalf("Need at least 1 agent node to cordon, but found %d", len(agentNodes))
+    if len(workerNodes) < 1 {
+        t.Fatalf("Need at least 1 worker node to cordon, but found %d", len(workerNodes))
     }
 
-    agentNodeToCordon := agentNodes[0]
-    logger.Debugf("🚫 Cordoning agent node: %s", agentNodeToCordon)
-    if err := utils.CordonNode(ctx, clientset, agentNodeToCordon, true); err != nil {
-        t.Fatalf("Failed to cordon node %s: %v", agentNodeToCordon, err)
+    workerNodeToCordon := workerNodes[0]
+    logger.Debugf("🚫 Cordoning worker node: %s", workerNodeToCordon)
+    if err := utils.CordonNode(ctx, clientset, workerNodeToCordon, true); err != nil {
+        t.Fatalf("Failed to cordon node %s: %v", workerNodeToCordon, err)
     }
 
     logger.Info("2. Deploy workload WL1, and verify 10 newly created pods")
@@ -91,8 +91,8 @@ func Test_GS1_GangSchedulingWithFullReplicas(t *testing.T) {
     }
 
     logger.Info("4. Uncordon the node and verify all pods get scheduled")
-    if err := utils.CordonNode(ctx, clientset, agentNodeToCordon, false); err != nil {
-        t.Fatalf("Failed to uncordon node %s: %v", agentNodeToCordon, err)
+    if err := utils.CordonNode(ctx, clientset, workerNodeToCordon, false); err != nil {
+        t.Fatalf("Failed to uncordon node %s: %v", workerNodeToCordon, err)
     }
 
     // Wait for all pods to be scheduled and ready
@@ -108,7 +108,7 @@ func Test_GS1_GangSchedulingWithFullReplicas(t *testing.T) {
         t.Fatalf("Failed to list workload pods: %v", err)
     }
 
-    // Verify that each pod is scheduled on a unique node, agent nodes have 150m memory
+    // Verify that each pod is scheduled on a unique node, worker nodes have 150m memory
     // and workload pods requests 80m memory, so only 1 should fit per node
     assertPodsOnDistinctNodes(t, pods.Items)
 
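
The test above leans on utils.CordonNode(ctx, clientset, nodeName, cordon), whose implementation is outside this commit. A plausible minimal sketch of such a helper with client-go, assuming it simply toggles Spec.Unschedulable via a get-then-update (no retry-on-conflict handling):

package utils

import (
    "context"
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
)

// CordonNode marks a node unschedulable (cordon=true) or schedulable again
// (cordon=false) by updating Spec.Unschedulable. Sketch only; the real helper
// used by these tests may differ.
func CordonNode(ctx context.Context, clientset kubernetes.Interface, nodeName string, cordon bool) error {
    node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
    if err != nil {
        return fmt.Errorf("failed to get node %s: %w", nodeName, err)
    }
    node.Spec.Unschedulable = cordon
    if _, err := clientset.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}); err != nil {
        return fmt.Errorf("failed to update node %s: %w", nodeName, err)
    }
    return nil
}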

operator/e2e_testing/tests/main_test.go

Lines changed: 8 additions & 8 deletions
@@ -104,10 +104,10 @@ func TestMain(m *testing.M) {
 }
 
 // setupTestCluster initializes a shared Kubernetes cluster for testing.
-// It creates the cluster if needed, ensures the required number of agent nodes are available,
+// It creates the cluster if needed, ensures the required number of worker nodes are available,
 // and returns K8s clients along with a cleanup function and registry port.
 // The cleanup function removes workloads and optionally tears down the cluster for individual test runs.
-func setupTestCluster(ctx context.Context, t *testing.T, requiredAgents int) (*kubernetes.Clientset, *rest.Config, dynamic.Interface, func(), string) {
+func setupTestCluster(ctx context.Context, t *testing.T, requiredWorkerNodes int) (*kubernetes.Clientset, *rest.Config, dynamic.Interface, func(), string) {
     // Always use shared cluster approach
     sharedCluster := setup.SharedCluster(logger, "../../skaffold.yaml")
 
@@ -118,7 +118,7 @@ func setupTestCluster(ctx context.Context, t *testing.T, requiredAgents int) (*k
         }
     }
 
-    if err := sharedCluster.PrepareForTest(ctx, requiredAgents); err != nil {
+    if err := sharedCluster.PrepareForTest(ctx, requiredWorkerNodes); err != nil {
         t.Errorf("Failed to prepare shared cluster for test: %v", err)
     }
 
@@ -139,22 +139,22 @@ func setupTestCluster(ctx context.Context, t *testing.T, requiredAgents int) (*k
     return clientset, restConfig, dynamicClient, cleanup, sharedCluster.GetRegistryPort()
 }
 
-// getAgentNodes retrieves the names of all agent (worker) nodes in the cluster,
+// getWorkerNodes retrieves the names of all worker nodes in the cluster,
 // excluding control plane nodes. Returns an error if the node list cannot be retrieved.
-func getAgentNodes(ctx context.Context, clientset kubernetes.Interface) ([]string, error) {
+func getWorkerNodes(ctx context.Context, clientset kubernetes.Interface) ([]string, error) {
     nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
     if err != nil {
         return nil, fmt.Errorf("failed to list nodes: %w", err)
     }
 
-    agentNodes := make([]string, 0)
+    workerNodes := make([]string, 0)
     for _, node := range nodes.Items {
         if _, isServer := node.Labels["node-role.kubernetes.io/control-plane"]; !isServer {
-            agentNodes = append(agentNodes, node.Name)
+            workerNodes = append(workerNodes, node.Name)
         }
     }
 
-    return agentNodes, nil
+    return workerNodes, nil
 }
 
 // assertPodsOnDistinctNodes asserts that the pods are scheduled on distinct nodes and fails the test if not.
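
assertPodsOnDistinctNodes is only referenced in this diff, not defined; the hunk above ends at its doc comment. A sketch of an implementation that matches that contract (fail the test when two pods land on the same node, or when a pod is unscheduled):

package tests

import (
    "testing"

    corev1 "k8s.io/api/core/v1"
)

// Sketch only; the committed helper may differ.
func assertPodsOnDistinctNodes(t *testing.T, pods []corev1.Pod) {
    t.Helper()
    seen := make(map[string]string) // node name -> first pod observed on that node
    for _, pod := range pods {
        nodeName := pod.Spec.NodeName
        if nodeName == "" {
            t.Errorf("pod %s has not been scheduled to any node", pod.Name)
            continue
        }
        if existing, ok := seen[nodeName]; ok {
            t.Errorf("pods %s and %s are both scheduled on node %s", existing, pod.Name, nodeName)
            continue
        }
        seen[nodeName] = pod.Name
    }
}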
