From a72cc4f5842c04873a5940acb80eec71326ccf66 Mon Sep 17 00:00:00 2001
From: Tobias Grieger
Date: Fri, 10 Jan 2025 12:02:44 +0100
Subject: [PATCH] roachtest: decommission/drains -> decommission/drains/{alive,dead}

- rename `decommission/drains` to `decommission/drains/alive`
- add a `decommission/drains/dead` flavor
- run these weekly instead of nightly (we have enough other decommission
  tests, and since I'm adding one, we can also clamp down a bit)
- remove an old workaround that also accepted errors which should be
  impossible now that we properly wait for the drain

Epic: none
---
 pkg/cmd/roachtest/tests/decommission.go | 58 +++++++++++++++++--------
 1 file changed, 41 insertions(+), 17 deletions(-)

diff --git a/pkg/cmd/roachtest/tests/decommission.go b/pkg/cmd/roachtest/tests/decommission.go
index 04d79c5d875d..cad4938104bd 100644
--- a/pkg/cmd/roachtest/tests/decommission.go
+++ b/pkg/cmd/roachtest/tests/decommission.go
@@ -75,17 +75,25 @@ func registerDecommission(r registry.Registry) {
 	}
 	{
 		numNodes := 4
-		r.Add(registry.TestSpec{
-			Name:             "decommission/drains",
-			Owner:            registry.OwnerKV,
-			Cluster:          r.MakeClusterSpec(numNodes),
-			CompatibleClouds: registry.AllExceptAWS,
-			Suites:           registry.Suites(registry.Nightly),
-			Leases:           registry.MetamorphicLeases,
-			Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
-				runDecommissionDrains(ctx, t, c)
-			},
-		})
+		for _, dead := range []bool{false, true} {
+			name := "decommission/drains"
+			if dead {
+				name += "/dead"
+			} else {
+				name += "/alive"
+			}
+			r.Add(registry.TestSpec{
+				Name:             name,
+				Owner:            registry.OwnerKV,
+				Cluster:          r.MakeClusterSpec(numNodes),
+				CompatibleClouds: registry.AllExceptAWS,
+				Suites:           registry.Suites(registry.Weekly),
+				Leases:           registry.MetamorphicLeases,
+				Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
+					runDecommissionDrains(ctx, t, c, dead)
+				},
+			})
+		}
 	}
 	{
 		numNodes := 6
@@ -991,7 +999,7 @@ func runDecommissionRandomized(ctx context.Context, t test.Test, c cluster.Clust
 // the end of decommissioning. The test cluster contains 4 nodes and the fourth
 // node is decommissioned. While the decommissioning node has open SQL
 // connections, queries should never fail.
-func runDecommissionDrains(ctx context.Context, t test.Test, c cluster.Cluster) {
+func runDecommissionDrains(ctx context.Context, t test.Test, c cluster.Cluster, dead bool) {
 	var (
 		numNodes     = 4
 		pinnedNodeID = 1
@@ -1032,16 +1040,25 @@ func runDecommissionDrains(ctx context.Context, t test.Test, c cluster.Cluster)
 		// The expected output of decommission while the node is about to be drained/is draining.
 		expReplicasTransferred = [][]string{
 			decommissionHeader,
-			{strconv.Itoa(decommNodeID), "true|false", "0", "true", "decommissioning", "false", "ready", "0"},
+			{strconv.Itoa(decommNodeID), "true|false", "0", "true", "decommissioning", "false", ".*", "0"},
 			decommissionFooter,
 		}
 		// The expected output of decommission once the node is finally marked as "decommissioned."
 		expDecommissioned = [][]string{
 			decommissionHeader,
-			{strconv.Itoa(decommNodeID), "true|false", "0", "true", "decommissioned", "false", "ready", "0"},
+			{strconv.Itoa(decommNodeID), "true|false", "0", "true", "decommissioned", "false", ".*", "0"},
 			decommissionFooter,
 		}
 	)
+	if dead {
+		t.Status(fmt.Sprintf("stopping node %d and waiting for it to be recognized as dead", decommNodeID))
+		c.Stop(ctx, t.L(), option.DefaultStopOpts(), decommNode)
+		// This should reliably result in the node being perceived as non-live from
+		// this point on. If the node were still "down but live" when decommission
+		// finishes, we'd try to drain a live node and get an error (since it can't
+		// be reached any more).
+		time.Sleep(15 * time.Second)
+	}
 	t.Status(fmt.Sprintf("decommissioning node %d", decommNodeID))
 	e := retry.WithMaxAttempts(ctx, retryOpts, maxAttempts, func() error {
 		o, err := h.decommission(ctx, decommNode, pinnedNodeID, "--wait=none", "--format=csv")
@@ -1052,15 +1069,22 @@ func runDecommissionDrains(ctx context.Context, t test.Test, c cluster.Cluster)
 			return err
 		}
 
+		// When the target node is dead in this test configuration, the draining
+		// step is moot. If the target node is alive, the last decommission
+		// invocation should have drained it, which we verify below.
+
+		if dead {
+			return nil
+		}
+
 		// Check to see if the node has been drained or decommissioned.
 		// If not, queries should not fail.
 		// Connect to node 4 (the target node of the decommission).
 		decommNodeDB := c.Conn(ctx, t.L(), decommNodeID)
 		defer decommNodeDB.Close()
 		if err = run(decommNodeDB, `SHOW DATABASES`); err != nil {
-			if strings.Contains(err.Error(), "not accepting clients") || // drained
-				strings.Contains(err.Error(), "node is decommissioned") { // decommissioned
-				return nil
+			if strings.Contains(err.Error(), "not accepting clients") {
+				return nil // success (drained)
 			}
 			t.Fatal(err)
 		}
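
Note on the "ready" -> ".*" change in the expected rows: judging from cells like
"true|false", the expected-output tables appear to be matched cell by cell against
the CSV printed by the decommission command, with each expected cell treated as a
regular expression; ".*" presumably relaxes the readiness column because a node
that is already dead no longer reports "ready" there. Below is a minimal,
self-contained sketch of that matching idea using only the standard library; the
matchRow helper and the sample CSV line are hypothetical and are not the actual
roachtest helpers.

package main

import (
	"encoding/csv"
	"fmt"
	"regexp"
	"strings"
)

// matchRow reports whether every cell in got fully matches the regular
// expression in the corresponding position of want.
func matchRow(want, got []string) (bool, error) {
	if len(want) != len(got) {
		return false, fmt.Errorf("expected %d cells, got %d", len(want), len(got))
	}
	for i, pat := range want {
		// Anchor the pattern so each cell must match in full.
		re, err := regexp.Compile("^(?:" + pat + ")$")
		if err != nil {
			return false, err
		}
		if !re.MatchString(got[i]) {
			return false, nil
		}
	}
	return true, nil
}

func main() {
	// A made-up status line standing in for one row of the decommission CSV.
	input := "4,false,0,true,decommissioning,false,unknown,0"
	row, err := csv.NewReader(strings.NewReader(input)).Read()
	if err != nil {
		panic(err)
	}

	// Mirrors the shape of expReplicasTransferred in the patch: each cell is a
	// pattern, so ".*" tolerates whatever the dead node reports in that column.
	want := []string{"4", "true|false", "0", "true", "decommissioning", "false", ".*", "0"}
	ok, err := matchRow(want, row)
	fmt.Println(ok, err) // true <nil>
}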