From 137380503aa2ab396cf2517f8da93ea347446216 Mon Sep 17 00:00:00 2001 From: "vitess-bot[bot]" <108069721+vitess-bot[bot]@users.noreply.github.com> Date: Thu, 7 Sep 2023 14:10:36 +0300 Subject: [PATCH] [release-15.0] OnlineDDL: fix nil 'completed_timestamp' for cancelled migrations (#13928) (#13935) Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Co-authored-by: vitess-bot[bot] <108069721+vitess-bot[bot]@users.noreply.github.com> Co-authored-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> --- .../scheduler/onlineddl_scheduler_test.go | 3 + .../onlineddl/vrepl/onlineddl_vrepl_test.go | 3 + go/test/endtoend/onlineddl/vtgate_util.go | 73 +++++++++++++++++++ go/test/endtoend/onlineddl/vttablet_util.go | 2 +- go/vt/vttablet/onlineddl/executor.go | 23 ++++-- go/vt/vttablet/onlineddl/schema.go | 5 +- 6 files changed, 100 insertions(+), 9 deletions(-) diff --git a/go/test/endtoend/onlineddl/scheduler/onlineddl_scheduler_test.go b/go/test/endtoend/onlineddl/scheduler/onlineddl_scheduler_test.go index dca084103b8..d5d7cffab08 100644 --- a/go/test/endtoend/onlineddl/scheduler/onlineddl_scheduler_test.go +++ b/go/test/endtoend/onlineddl/scheduler/onlineddl_scheduler_test.go @@ -646,6 +646,9 @@ func TestSchemaChange(t *testing.T) { } }) }) + t.Run("summary: validate completed_timestamp", func(t *testing.T) { + onlineddl.ValidateCompletedTimestamp(t, &vtParams) + }) } // testOnlineDDLStatement runs an online DDL, ALTER statement diff --git a/go/test/endtoend/onlineddl/vrepl/onlineddl_vrepl_test.go b/go/test/endtoend/onlineddl/vrepl/onlineddl_vrepl_test.go index 93420b0f326..118488c63b1 100644 --- a/go/test/endtoend/onlineddl/vrepl/onlineddl_vrepl_test.go +++ b/go/test/endtoend/onlineddl/vrepl/onlineddl_vrepl_test.go @@ -756,6 +756,9 @@ func TestSchemaChange(t *testing.T) { } }) }) + t.Run("summary: validate completed_timestamp", func(t *testing.T) { + onlineddl.ValidateCompletedTimestamp(t, &vtParams) + }) } func insertRow(t *testing.T) { diff --git a/go/test/endtoend/onlineddl/vtgate_util.go b/go/test/endtoend/onlineddl/vtgate_util.go index 86ea963465a..1b6def8ecf4 100644 --- a/go/test/endtoend/onlineddl/vtgate_util.go +++ b/go/test/endtoend/onlineddl/vtgate_util.go @@ -35,6 +35,14 @@ import ( "github.com/stretchr/testify/require" ) +var ( + testsStartupTime time.Time +) + +func init() { + testsStartupTime = time.Now() +} + // VtgateExecQuery runs a query on VTGate using given query params func VtgateExecQuery(t *testing.T, vtParams *mysql.ConnParams, query string, expectError string) *sqltypes.Result { t.Helper() @@ -344,3 +352,68 @@ func WaitForThrottledTimestamp(t *testing.T, vtParams *mysql.ConnParams, uuid st t.Error("timeout waiting for last_throttled_timestamp to have nonempty value") return } + +// ValidateSequentialMigrationIDs validates that schem_migrations.id column, which is an AUTO_INCREMENT, does +// not have gaps +func ValidateSequentialMigrationIDs(t *testing.T, vtParams *mysql.ConnParams, shards []cluster.Shard) { + r := VtgateExecQuery(t, vtParams, "show vitess_migrations", "") + shardMin := map[string]uint64{} + shardMax := map[string]uint64{} + shardCount := map[string]uint64{} + + for _, row := range r.Named().Rows { + id := row.AsUint64("id", 0) + require.NotZero(t, id) + + shard := row.AsString("shard", "") + require.NotEmpty(t, shard) + + if _, ok := shardMin[shard]; !ok { + shardMin[shard] = id + shardMax[shard] = id + } + if id < shardMin[shard] { + shardMin[shard] = id + } + if id > shardMax[shard] { + shardMax[shard] = id + } + shardCount[shard]++ + } + require.NotEmpty(t, shards) + assert.Equal(t, len(shards), len(shardMin)) + assert.Equal(t, len(shards), len(shardMax)) + assert.Equal(t, len(shards), len(shardCount)) + for shard, count := range shardCount { + assert.NotZero(t, count) + assert.Equalf(t, count, shardMax[shard]-shardMin[shard]+1, "mismatch: shared=%v, count=%v, min=%v, max=%v", shard, count, shardMin[shard], shardMax[shard]) + } +} + +// ValidateCompletedTimestamp ensures that any migration in `cancelled`, `completed`, `failed` statuses +// has a non-nil and valid `completed_timestamp` value. +func ValidateCompletedTimestamp(t *testing.T, vtParams *mysql.ConnParams) { + require.False(t, testsStartupTime.IsZero()) + r := VtgateExecQuery(t, vtParams, "show vitess_migrations", "") + + completedTimestampNumValidations := 0 + for _, row := range r.Named().Rows { + migrationStatus := row.AsString("migration_status", "") + require.NotEmpty(t, migrationStatus) + switch migrationStatus { + case string(schema.OnlineDDLStatusComplete), + string(schema.OnlineDDLStatusFailed), + string(schema.OnlineDDLStatusCancelled): + { + assert.False(t, row["completed_timestamp"].IsNull()) + // Also make sure the timestamp is "real", and that it is recent. + timestamp := row.AsString("completed_timestamp", "") + completedTime, err := time.Parse(sqltypes.TimestampFormat, timestamp) + assert.NoError(t, err) + assert.Greater(t, completedTime.Unix(), testsStartupTime.Unix()) + completedTimestampNumValidations++ + } + } + } + assert.NotZero(t, completedTimestampNumValidations) +} diff --git a/go/test/endtoend/onlineddl/vttablet_util.go b/go/test/endtoend/onlineddl/vttablet_util.go index 4d4e88b8189..893d312d977 100644 --- a/go/test/endtoend/onlineddl/vttablet_util.go +++ b/go/test/endtoend/onlineddl/vttablet_util.go @@ -31,7 +31,7 @@ import ( ) // WaitForVReplicationStatus waits for a vreplication stream to be in one of given states, or timeout -func WaitForVReplicationStatus(t *testing.T, vtParams *mysql.ConnParams, shards []cluster.Shard, uuid string, timeout time.Duration, expectStatuses ...string) (status string) { +func WaitForVReplicationStatus(t *testing.T, vtParams *mysql.ConnParams, shards []cluster.Shard, uuid string, timeout time.Duration, expectStatuses ...string) (status string) { // nolint:revive query, err := sqlparser.ParseAndBind("select workflow, state from _vt.vreplication where workflow=%a", sqltypes.StringBindVariable(uuid), diff --git a/go/vt/vttablet/onlineddl/executor.go b/go/vt/vttablet/onlineddl/executor.go index 4663722983c..d64ee55e7b7 100644 --- a/go/vt/vttablet/onlineddl/executor.go +++ b/go/vt/vttablet/onlineddl/executor.go @@ -101,6 +101,13 @@ var vexecInsertTemplates = []string{ 'val1', 'val2', 'val3', 'val4', 'val5', 'val6', 'val7', 'val8', 'val9', FROM_UNIXTIME(0), 'vala', 'valb' )`, } +var ( + // fixCompletedTimestampDone fixes a nil `completed_tiemstamp` columns, see + // https://github.com/vitessio/vitess/issues/13927 + // The fix is in release-18.0 + // TODO: remove in release-19.0 + fixCompletedTimestampDone bool +) var emptyResult = &sqltypes.Result{} var acceptableDropTableIfExistsErrorCodes = []int{mysql.ERCantFindFile, mysql.ERNoSuchTable} @@ -3754,13 +3761,17 @@ func (e *Executor) gcArtifacts(ctx context.Context) error { e.migrationMutex.Lock() defer e.migrationMutex.Unlock() - if _, err := e.execQuery(ctx, sqlFixCompletedTimestamp); err != nil { - // This query fixes a bug where stale migrations were marked as 'failed' without updating 'completed_timestamp' - // see https://github.com/vitessio/vitess/issues/8499 - // Running this query retroactively sets completed_timestamp - // This 'if' clause can be removed in version v13 - return err + // v18 fix. Remove in v19 + if !fixCompletedTimestampDone { + if _, err := e.execQuery(ctx, sqlFixCompletedTimestamp); err != nil { + // This query fixes a bug where stale migrations were marked as 'cancelled' or 'failed' without updating 'completed_timestamp' + // Running this query retroactively sets completed_timestamp + // This fix is created in v18 and can be removed in v19 + return err + } + fixCompletedTimestampDone = true } + query, err := sqlparser.ParseAndBind(sqlSelectUncollectedArtifacts, sqltypes.Int64BindVariable(int64((retainOnlineDDLTables).Seconds())), ) diff --git a/go/vt/vttablet/onlineddl/schema.go b/go/vt/vttablet/onlineddl/schema.go index 41449572508..833148939b0 100644 --- a/go/vt/vttablet/onlineddl/schema.go +++ b/go/vt/vttablet/onlineddl/schema.go @@ -133,7 +133,8 @@ const ( migration_uuid=%a ` sqlUpdateMigrationStatusFailedOrCancelled = `UPDATE _vt.schema_migrations - SET migration_status=IF(cancelled_timestamp IS NULL, 'failed', 'cancelled') + SET migration_status=IF(cancelled_timestamp IS NULL, 'failed', 'cancelled'), + completed_timestamp=NOW(6) WHERE migration_uuid=%a ` @@ -414,7 +415,7 @@ const ( SET completed_timestamp=NOW() WHERE - migration_status='failed' + migration_status IN ('cancelled', 'failed') AND cleanup_timestamp IS NULL AND completed_timestamp IS NULL `