Skip to content

Commit b5686a4

Browse files
hasbro17 authored and claude committed
Add test for CPMS OnDelete strategy with full master replacement
Creates a new test case that validates the ControlPlaneMachineSet OnDelete strategy by deleting all three master machines simultaneously and verifying CPMS correctly replaces them while maintaining cluster health. The test switches CPMS to OnDelete strategy, deletes all master machines, and validates that CPMS creates replacements with proper etcd membership transitions. Verifies that all old etcd members are removed from both the cluster and etcd-endpoints ConfigMap, and new members are properly integrated. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent 9b50a28 commit b5686a4

3 files changed

Lines changed: 248 additions & 5 deletions

File tree

test/extended/etcd/OWNERS

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
reviewers:
22
- dusk125
33
- hasbro17
4-
- Elbehery
4+
- jubittajohn
55
- tjungblu
66
approvers:
77
- deads2k
8-
- soltysh
98
- hasbro17
109
- dusk125
11-
- Elbehery
10+
- jubittajohn
1211
- tjungblu

test/extended/etcd/helpers/helpers.go

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,167 @@ func EnsureCPMSReplicasConverged(ctx context.Context, cpmsClient machinev1client
385385
return nil
386386
}
387387

388+
// UpdateCPMSStrategy updates the CPMS strategy to the specified type (OnDelete, RollingUpdate, or Recreate)
389+
func UpdateCPMSStrategy(ctx context.Context, t TestingT, cpmsClient machinev1client.ControlPlaneMachineSetInterface, strategy machinev1.ControlPlaneMachineSetStrategyType) error {
390+
cpms, err := cpmsClient.Get(ctx, "cluster", metav1.GetOptions{})
391+
if err != nil {
392+
return err
393+
}
394+
395+
cpms.Spec.Strategy.Type = strategy
396+
_, err = cpmsClient.Update(ctx, cpms, metav1.UpdateOptions{})
397+
if err != nil {
398+
return err
399+
}
400+
401+
framework.Logf("Successfully updated CPMS strategy to %v", strategy)
402+
return nil
403+
}
404+
405+
// DeleteAllMasterMachines deletes all master machines and returns the list of deleted machine names
406+
func DeleteAllMasterMachines(ctx context.Context, t TestingT, machineClient machinev1beta1client.MachineInterface) ([]string, error) {
407+
machineList, err := machineClient.List(ctx, metav1.ListOptions{LabelSelector: masterMachineLabelSelector})
408+
if err != nil {
409+
return nil, fmt.Errorf("error listing master machines: '%w'", err)
410+
}
411+
412+
var deletedMachineNames []string
413+
for _, machine := range machineList.Items {
414+
if err := DeleteMachine(ctx, t, machineClient, machine.Name); err != nil {
415+
return deletedMachineNames, err
416+
}
417+
deletedMachineNames = append(deletedMachineNames, machine.Name)
418+
}
419+
420+
return deletedMachineNames, nil
421+
}
422+
423+
// EnsureUpdatedReplicasOnCPMS checks if status.updatedReplicas on the cluster CPMS equals the expected count
424+
// updatedReplicas represents machines with the desired spec that are ready
425+
func EnsureUpdatedReplicasOnCPMS(ctx context.Context, t TestingT, expectedCount int, cpmsClient machinev1client.ControlPlaneMachineSetInterface) error {
426+
waitPollInterval := 15 * time.Second
427+
waitPollTimeout := 90 * time.Minute
428+
framework.Logf("Waiting up to %s for the CPMS to have status.updatedReplicas = %v", waitPollTimeout.String(), expectedCount)
429+
430+
return wait.PollUntilContextTimeout(ctx, waitPollInterval, waitPollTimeout, true, func(ctx context.Context) (done bool, err error) {
431+
cpms, err := cpmsClient.Get(ctx, "cluster", metav1.GetOptions{})
432+
if err != nil {
433+
return isTransientAPIError(t, err)
434+
}
435+
436+
if cpms.Status.UpdatedReplicas != int32(expectedCount) {
437+
framework.Logf("expected %d updated replicas on CPMS, got: %v", expectedCount, cpms.Status.UpdatedReplicas)
438+
return false, nil
439+
}
440+
framework.Logf("CPMS has reached the desired number of updated replicas: %v", cpms.Status.UpdatedReplicas)
441+
return true, nil
442+
})
443+
}
444+
445+
// GetVotingMemberNames returns the list of current voting etcd member names
446+
func GetVotingMemberNames(ctx context.Context, t TestingT, etcdClientFactory EtcdClientCreator) ([]string, error) {
447+
etcdClient, closeFn, err := etcdClientFactory.NewEtcdClient()
448+
if err != nil {
449+
return nil, fmt.Errorf("failed to get etcd client: %w", err)
450+
}
451+
defer closeFn()
452+
453+
memberCtx, cancel := context.WithTimeout(ctx, 15*time.Second)
454+
defer cancel()
455+
memberList, err := etcdClient.MemberList(memberCtx)
456+
if err != nil {
457+
return nil, fmt.Errorf("failed to get the member list: %w", err)
458+
}
459+
460+
var votingMemberNames []string
461+
for _, member := range memberList.Members {
462+
if !member.IsLearner {
463+
votingMemberNames = append(votingMemberNames, member.Name)
464+
}
465+
}
466+
467+
framework.Logf("Current voting etcd members: %v", votingMemberNames)
468+
return votingMemberNames, nil
469+
}
470+
471+
// EnsureVotingMembersExcluding waits for the cluster to have exactly expectedCount voting members,
472+
// with none of the members in the excludedMemberNames list
473+
func EnsureVotingMembersExcluding(ctx context.Context, t TestingT, etcdClientFactory EtcdClientCreator, kubeClient kubernetes.Interface, excludedMemberNames []string, expectedCount int) error {
474+
waitPollInterval := 15 * time.Second
475+
waitPollTimeout := 90 * time.Minute
476+
excludedSet := sets.NewString(excludedMemberNames...)
477+
framework.Logf("Waiting up to %s for the cluster to have %v voting members with none from the excluded list: %v", waitPollTimeout.String(), expectedCount, excludedMemberNames)
478+
479+
return wait.PollUntilContextTimeout(ctx, waitPollInterval, waitPollTimeout, true, func(ctx context.Context) (done bool, err error) {
480+
etcdClient, closeFn, err := etcdClientFactory.NewEtcdClient()
481+
if err != nil {
482+
framework.Logf("failed to get etcd client, will retry, err: %v", err)
483+
return false, nil
484+
}
485+
defer closeFn()
486+
487+
memberCtx, cancel := context.WithTimeout(ctx, 15*time.Second)
488+
defer cancel()
489+
memberList, err := etcdClient.MemberList(memberCtx)
490+
if err != nil {
491+
framework.Logf("failed to get the member list, will retry, err: %v", err)
492+
return false, nil
493+
}
494+
495+
var votingMemberNames []string
496+
excludedMemberIDs := sets.NewString()
497+
for _, member := range memberList.Members {
498+
if !member.IsLearner {
499+
votingMemberNames = append(votingMemberNames, member.Name)
500+
// Collect IDs of excluded members
501+
if excludedSet.Has(member.Name) {
502+
// Convert member ID to hexadecimal format to match etcd-endpoints ConfigMap format
503+
memberID := fmt.Sprintf("%x", member.ID)
504+
excludedMemberIDs.Insert(memberID)
505+
}
506+
}
507+
}
508+
509+
// Check if we have the expected count
510+
if len(votingMemberNames) != expectedCount {
511+
framework.Logf("unexpected number of voting etcd members, expected exactly %d, got: %v, current members are: %v", expectedCount, len(votingMemberNames), votingMemberNames)
512+
return false, nil
513+
}
514+
515+
// Check if any of the current members are in the excluded list
516+
for _, memberName := range votingMemberNames {
517+
if excludedSet.Has(memberName) {
518+
framework.Logf("found excluded member %q still in the cluster, current members are: %v", memberName, votingMemberNames)
519+
return false, nil
520+
}
521+
}
522+
523+
framework.Logf("cluster has reached the expected number of %v voting members with none from excluded list, current members are: %v", expectedCount, votingMemberNames)
524+
525+
// Also validate etcd-endpoints ConfigMap
526+
framework.Logf("ensuring that the openshift-etcd/etcd-endpoints cm has the expected number of %v voting members and excludes old members", expectedCount)
527+
etcdEndpointsConfigMap, err := kubeClient.CoreV1().ConfigMaps("openshift-etcd").Get(ctx, "etcd-endpoints", metav1.GetOptions{})
528+
if err != nil {
529+
return false, err
530+
}
531+
currentVotingMemberIPListSet := sets.NewString()
532+
for memberID, votingMemberIP := range etcdEndpointsConfigMap.Data {
533+
// Check if this member ID is in the excluded member IDs list
534+
if excludedMemberIDs.Has(memberID) {
535+
framework.Logf("found excluded member ID %q in etcd-endpoints ConfigMap, will retry", memberID)
536+
return false, nil
537+
}
538+
currentVotingMemberIPListSet.Insert(votingMemberIP)
539+
}
540+
if currentVotingMemberIPListSet.Len() != expectedCount {
541+
framework.Logf("unexpected number of voting members in the openshift-etcd/etcd-endpoints cm, expected exactly %d, got: %v, current members are: %v", expectedCount, currentVotingMemberIPListSet.Len(), currentVotingMemberIPListSet.List())
542+
return false, nil
543+
}
544+
545+
return true, nil
546+
})
547+
}
548+
388549
// EnsureVotingMembersCount counts the number of voting etcd members, it doesn't evaluate health conditions or any other attributes (i.e. name) of individual members
389550
// this method won't fail immediately on errors, this is useful during scaling down operation until the feature can ensure this operation to be graceful
390551
func EnsureVotingMembersCount(ctx context.Context, t TestingT, etcdClientFactory EtcdClientCreator, kubeClient kubernetes.Interface, expectedMembersCount int) error {

test/extended/etcd/vertical_scaling.go

Lines changed: 85 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@ import (
77
o "github.com/onsi/gomega"
88
"github.com/pkg/errors"
99

10+
machinev1 "github.com/openshift/api/machine/v1"
1011
machineclient "github.com/openshift/client-go/machine/clientset/versioned"
11-
machinev1 "github.com/openshift/client-go/machine/clientset/versioned/typed/machine/v1"
12+
machinev1client "github.com/openshift/client-go/machine/clientset/versioned/typed/machine/v1"
1213
machinev1beta1client "github.com/openshift/client-go/machine/clientset/versioned/typed/machine/v1beta1"
1314
testlibraryapi "github.com/openshift/library-go/test/library/apiserver"
1415
v1 "k8s.io/client-go/kubernetes/typed/core/v1"
@@ -19,6 +20,7 @@ import (
1920
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2021
"k8s.io/client-go/kubernetes"
2122
"k8s.io/kubernetes/test/e2e/framework"
23+
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
2224
)
2325

2426
var _ = g.Describe("[sig-etcd][Feature:EtcdVerticalScaling][Suite:openshift/etcd/scaling][Serial] etcd", func() {
@@ -30,7 +32,7 @@ var _ = g.Describe("[sig-etcd][Feature:EtcdVerticalScaling][Suite:openshift/etcd
3032
machineClientSet *machineclient.Clientset
3133
machineClient machinev1beta1client.MachineInterface
3234
nodeClient v1.NodeInterface
33-
cpmsClient machinev1.ControlPlaneMachineSetInterface
35+
cpmsClient machinev1client.ControlPlaneMachineSetInterface
3436
kubeClient kubernetes.Interface
3537
cpmsActive bool
3638
ctx context.Context
@@ -293,4 +295,85 @@ var _ = g.Describe("[sig-etcd][Feature:EtcdVerticalScaling][Suite:openshift/etcd
293295
err = scalingtestinglibrary.EnsureCPMSReplicasConverged(ctx, cpmsClient)
294296
o.Expect(err).ToNot(o.HaveOccurred())
295297
})
298+
299+
// The following test validates CPMS OnDelete strategy behavior during full master replacement.
300+
// OnDelete strategy differs from RollingUpdate in that CPMS does not automatically update
301+
// machines when their spec changes. However, when machines are deleted, CPMS still creates
302+
// replacements to maintain the desired replica count.
303+
//
304+
// This test verifies that CPMS correctly handles the deletion of all three master machines
305+
// simultaneously while in OnDelete mode:
306+
// 1) Switches CPMS to OnDelete strategy
307+
// 2) Deletes all master machines at once
308+
// 3) Validates CPMS creates three new replacement machines
309+
// 4) Verifies all old etcd members are removed from both the cluster and etcd-endpoints ConfigMap
310+
// 5) Waits for API server rollout to stabilize and verifies the cluster returns to 3 running machines
311+
g.It("is able to delete all masters with OnDelete strategy and wait for CPMSO to replace them [Timeout:120m][apigroup:machine.openshift.io]", func() {
312+
if !cpmsActive {
313+
e2eskipper.Skipf("CPMS is not active on this platform, this test requires an active CPMS to validate OnDelete strategy")
314+
}
315+
316+
// step 1: Update CPMS to OnDelete strategy
317+
framework.Logf("Updating CPMS strategy to OnDelete")
318+
err = scalingtestinglibrary.UpdateCPMSStrategy(ctx, g.GinkgoT(), cpmsClient, machinev1.OnDelete)
319+
err = errors.Wrap(err, "failed to update CPMS strategy to OnDelete")
320+
o.Expect(err).ToNot(o.HaveOccurred())
321+
322+
// step 2: Restore RollingUpdate strategy in cleanup
323+
defer func() {
324+
framework.Logf("Restoring CPMS strategy to RollingUpdate")
325+
err := scalingtestinglibrary.UpdateCPMSStrategy(ctx, g.GinkgoT(), cpmsClient, machinev1.RollingUpdate)
326+
err = errors.Wrap(err, "cleanup: failed to restore CPMS strategy to RollingUpdate")
327+
o.Expect(err).ToNot(o.HaveOccurred())
328+
}()
329+
330+
// step 3: Capture current etcd member names before deletion
331+
framework.Logf("Capturing current voting etcd member names")
332+
oldMemberNames, err := scalingtestinglibrary.GetVotingMemberNames(ctx, g.GinkgoT(), etcdClientFactory)
333+
err = errors.Wrap(err, "failed to get current voting member names")
334+
o.Expect(err).ToNot(o.HaveOccurred())
335+
336+
// step 4: Delete all master machines
337+
framework.Logf("Deleting all master machines")
338+
deletedMachineNames, err := scalingtestinglibrary.DeleteAllMasterMachines(ctx, g.GinkgoT(), machineClient)
339+
err = errors.Wrap(err, "failed to delete all master machines")
340+
o.Expect(err).ToNot(o.HaveOccurred())
341+
framework.Logf("Deleted machines: %v", deletedMachineNames)
342+
343+
// step 5: Wait for CPMS to show 3 updated replicas
344+
framework.Logf("Waiting for CPMS to show 3 updated replicas")
345+
err = scalingtestinglibrary.EnsureUpdatedReplicasOnCPMS(ctx, g.GinkgoT(), 3, cpmsClient)
346+
err = errors.Wrap(err, "timed out waiting for CPMS to show 3 updated replicas")
347+
o.Expect(err).ToNot(o.HaveOccurred())
348+
349+
// step 6: Wait for etcd membership to have 3 members with none from old member list
350+
framework.Logf("Waiting for etcd membership to stabilize with new members")
351+
err = scalingtestinglibrary.EnsureVotingMembersExcluding(ctx, g.GinkgoT(), etcdClientFactory, kubeClient, oldMemberNames, 3)
352+
err = errors.Wrap(err, "timed out waiting for etcd to have 3 voting members excluding old members")
353+
o.Expect(err).ToNot(o.HaveOccurred())
354+
355+
// step 7: Wait for API server to stabilize
356+
framework.Logf("Waiting for API servers to stabilize on the same revision")
357+
err = testlibraryapi.WaitForAPIServerToStabilizeOnTheSameRevision(g.GinkgoT(), oc.KubeClient().CoreV1().Pods("openshift-kube-apiserver"))
358+
err = errors.Wrap(err, "timed out waiting for APIServer pods to stabilize on the same revision")
359+
o.Expect(err).ToNot(o.HaveOccurred())
360+
361+
// step 8: Verify CPMS shows 3 ready replicas
362+
framework.Logf("Waiting for 3 ready replicas on CPMS")
363+
err = scalingtestinglibrary.EnsureReadyReplicasOnCPMS(ctx, g.GinkgoT(), 3, cpmsClient, nodeClient)
364+
err = errors.Wrap(err, "timed out waiting for CPMS to show 3 ready replicas")
365+
o.Expect(err).ToNot(o.HaveOccurred())
366+
367+
// step 9: Verify only 3 running master machines
368+
framework.Logf("Waiting for 3 Running master machines")
369+
err = scalingtestinglibrary.EnsureMasterMachinesAndCount(ctx, g.GinkgoT(), machineClient)
370+
err = errors.Wrap(err, "timed out waiting for only 3 Running master machines")
371+
o.Expect(err).ToNot(o.HaveOccurred())
372+
373+
// step 10: Verify CPMS replicas converged
374+
framework.Logf("Waiting for CPMS replicas to converge")
375+
err = scalingtestinglibrary.EnsureCPMSReplicasConverged(ctx, cpmsClient)
376+
err = errors.Wrap(err, "CPMS replicas failed to converge")
377+
o.Expect(err).ToNot(o.HaveOccurred())
378+
})
296379
})

0 commit comments

Comments
 (0)