From a72cc4f5842c04873a5940acb80eec71326ccf66 Mon Sep 17 00:00:00 2001
From: Tobias Grieger
Date: Fri, 10 Jan 2025 12:02:44 +0100
Subject: [PATCH] roachtest: decommission/drains -> decommission/drains/{alive,dead}

- rename `decommission/drains` to `decommission/drains/alive`
- add a `decommission/drains/dead` flavor
- run these weekly instead of nightly (we have enough other decommission
  tests, and since I'm adding one, we can also clamp down a bit)
- remove an old workaround that also accepted errors which should be
  impossible now that we properly wait for the drain

Epic: none
---
 pkg/cmd/roachtest/tests/decommission.go | 58 +++++++++++++++++--------
 1 file changed, 41 insertions(+), 17 deletions(-)

diff --git a/pkg/cmd/roachtest/tests/decommission.go b/pkg/cmd/roachtest/tests/decommission.go
index 04d79c5d875d..cad4938104bd 100644
--- a/pkg/cmd/roachtest/tests/decommission.go
+++ b/pkg/cmd/roachtest/tests/decommission.go
@@ -75,17 +75,25 @@ func registerDecommission(r registry.Registry) {
 	}
 	{
 		numNodes := 4
-		r.Add(registry.TestSpec{
-			Name:             "decommission/drains",
-			Owner:            registry.OwnerKV,
-			Cluster:          r.MakeClusterSpec(numNodes),
-			CompatibleClouds: registry.AllExceptAWS,
-			Suites:           registry.Suites(registry.Nightly),
-			Leases:           registry.MetamorphicLeases,
-			Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
-				runDecommissionDrains(ctx, t, c)
-			},
-		})
+		for _, dead := range []bool{false, true} {
+			name := "decommission/drains"
+			if dead {
+				name += "/dead"
+			} else {
+				name += "/alive"
+			}
+			r.Add(registry.TestSpec{
+				Name:             name,
+				Owner:            registry.OwnerKV,
+				Cluster:          r.MakeClusterSpec(numNodes),
+				CompatibleClouds: registry.AllExceptAWS,
+				Suites:           registry.Suites(registry.Weekly),
+				Leases:           registry.MetamorphicLeases,
+				Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
+					runDecommissionDrains(ctx, t, c, dead)
+				},
+			})
+		}
 	}
 	{
 		numNodes := 6
@@ -991,7 +999,7 @@ func runDecommissionRandomized(ctx context.Context, t test.Test, c cluster.Clust
 // the end of decommissioning. The test cluster contains 4 nodes and the fourth
 // node is decommissioned. While the decommissioning node has open SQL
 // connections, queries should never fail.
-func runDecommissionDrains(ctx context.Context, t test.Test, c cluster.Cluster) {
+func runDecommissionDrains(ctx context.Context, t test.Test, c cluster.Cluster, dead bool) {
 	var (
 		numNodes     = 4
 		pinnedNodeID = 1
@@ -1032,16 +1040,25 @@ func runDecommissionDrains(ctx context.Context, t test.Test, c cluster.Cluster)
 		// The expected output of decommission while the node is about to be drained/is draining.
 		expReplicasTransferred = [][]string{
 			decommissionHeader,
-			{strconv.Itoa(decommNodeID), "true|false", "0", "true", "decommissioning", "false", "ready", "0"},
+			{strconv.Itoa(decommNodeID), "true|false", "0", "true", "decommissioning", "false", ".*", "0"},
 			decommissionFooter,
 		}
 		// The expected output of decommission once the node is finally marked as "decommissioned."
 		expDecommissioned = [][]string{
 			decommissionHeader,
-			{strconv.Itoa(decommNodeID), "true|false", "0", "true", "decommissioned", "false", "ready", "0"},
+			{strconv.Itoa(decommNodeID), "true|false", "0", "true", "decommissioned", "false", ".*", "0"},
 			decommissionFooter,
 		}
 	)
+	if dead {
+		t.Status(fmt.Sprintf("stopping node %d and waiting for it to be recognized as dead", decommNodeID))
+		c.Stop(ctx, t.L(), option.DefaultStopOpts(), decommNode)
+		// This should reliably result in the node being perceived as non-live from
+		// this point on. If the node were still "down but live" when decommission
+		// finishes, we'd try to drain a live node and get an error (since it can't
+		// be reached any more).
+		time.Sleep(15 * time.Second)
+	}
 	t.Status(fmt.Sprintf("decommissioning node %d", decommNodeID))
 	e := retry.WithMaxAttempts(ctx, retryOpts, maxAttempts, func() error {
 		o, err := h.decommission(ctx, decommNode, pinnedNodeID, "--wait=none", "--format=csv")
@@ -1052,15 +1069,22 @@ func runDecommissionDrains(ctx context.Context, t test.Test, c cluster.Cluster)
 			return err
 		}
 
+		// When the target node is dead in this test configuration, the draining
+		// step is moot. If the target node is alive, the last decommission
+		// invocation should have drained it, which we verify below.
+
+		if dead {
+			return nil
+		}
+
 		// Check to see if the node has been drained or decommissioned.
 		// If not, queries should not fail.
 		// Connect to node 4 (the target node of the decommission).
 		decommNodeDB := c.Conn(ctx, t.L(), decommNodeID)
 		defer decommNodeDB.Close()
 		if err = run(decommNodeDB, `SHOW DATABASES`); err != nil {
-			if strings.Contains(err.Error(), "not accepting clients") || // drained
-				strings.Contains(err.Error(), "node is decommissioned") { // decommissioned
-				return nil
+			if strings.Contains(err.Error(), "not accepting clients") {
+				return nil // success (drained)
 			}
 			t.Fatal(err)
 		}
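
Note on the "ready" -> ".*" change in the expected rows: judging from cells like
"true|false", the expected-output tables appear to be matched cell by cell against
the CSV printed by the decommission command, with each expected cell treated as a
regular expression; ".*" presumably relaxes the readiness column because a node
that is already dead no longer reports "ready" there. Below is a minimal,
self-contained sketch of that matching idea using only the standard library; the
matchRow helper and the sample CSV line are hypothetical and are not the actual
roachtest helpers.

package main

import (
	"encoding/csv"
	"fmt"
	"regexp"
	"strings"
)

// matchRow reports whether every cell in got fully matches the regular
// expression in the corresponding position of want.
func matchRow(want, got []string) (bool, error) {
	if len(want) != len(got) {
		return false, fmt.Errorf("expected %d cells, got %d", len(want), len(got))
	}
	for i, pat := range want {
		// Anchor the pattern so each cell must match in full.
		re, err := regexp.Compile("^(?:" + pat + ")$")
		if err != nil {
			return false, err
		}
		if !re.MatchString(got[i]) {
			return false, nil
		}
	}
	return true, nil
}

func main() {
	// A made-up status line standing in for one row of the decommission CSV.
	input := "4,false,0,true,decommissioning,false,unknown,0"
	row, err := csv.NewReader(strings.NewReader(input)).Read()
	if err != nil {
		panic(err)
	}

	// Mirrors the shape of expReplicasTransferred in the patch: each cell is a
	// pattern, so ".*" tolerates whatever the dead node reports in that column.
	want := []string{"4", "true|false", "0", "true", "decommissioning", "false", ".*", "0"}
	ok, err := matchRow(want, row)
	fmt.Println(ok, err) // true <nil>
}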