
Commit f3fb0ea

Author: Shlomi Noach

Merge pull request #368 from github/fix-infinite-cutover-loop

cut-over failure on test-on-replica starts replication again

2 parents e0cf2ad + fc268ab

6 files changed, with 77 additions and 9 deletions


doc/hooks.md

Lines changed: 1 addition & 0 deletions
@@ -44,6 +44,7 @@ The full list of supported hooks is best found in code: [hooks.go](https://githu
 - `gh-ost-on-interactive-command`
 - `gh-ost-on-row-copy-complete`
 - `gh-ost-on-stop-replication`
+- `gh-ost-on-start-replication`
 - `gh-ost-on-begin-postponed`
 - `gh-ost-on-before-cut-over`
 - `gh-ost-on-success`

go/logic/applier.go

Lines changed: 25 additions & 2 deletions
@@ -593,11 +593,22 @@ func (this *Applier) RenameTablesRollback() (renameError error) {
 // and have them written to the binary log, so that we can then read them via streamer.
 func (this *Applier) StopSlaveIOThread() error {
 	query := `stop /* gh-ost */ slave io_thread`
-	log.Infof("Stopping replication")
+	log.Infof("Stopping replication IO thread")
 	if _, err := sqlutils.ExecNoPrepare(this.db, query); err != nil {
 		return err
 	}
-	log.Infof("Replication stopped")
+	log.Infof("Replication IO thread stopped")
+	return nil
+}
+
+// StartSlaveIOThread is applicable with --test-on-replica
+func (this *Applier) StartSlaveIOThread() error {
+	query := `start /* gh-ost */ slave io_thread`
+	log.Infof("Starting replication IO thread")
+	if _, err := sqlutils.ExecNoPrepare(this.db, query); err != nil {
+		return err
+	}
+	log.Infof("Replication IO thread started")
 	return nil
 }

@@ -640,6 +651,18 @@ func (this *Applier) StopReplication() error {
 	return nil
 }

+// StartReplication is used by `--test-on-replica` on cut-over failure
+func (this *Applier) StartReplication() error {
+	if err := this.StartSlaveIOThread(); err != nil {
+		return err
+	}
+	if err := this.StartSlaveSQLThread(); err != nil {
+		return err
+	}
+	log.Infof("Replication started")
+	return nil
+}
+
 // GetSessionLockName returns a name for the special hint session voluntary lock
 func (this *Applier) GetSessionLockName(sessionId int64) string {
 	return fmt.Sprintf("gh-ost.%d.lock", sessionId)
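
Note: the new `StartReplication` calls `StartSlaveSQLThread`, which is not part of this diff. Presumably it mirrors `StartSlaveIOThread` for the SQL thread; the following is a sketch under that assumption only, not content of this commit:

// Sketch only: assumed shape of StartSlaveSQLThread, mirroring StartSlaveIOThread above.
func (this *Applier) StartSlaveSQLThread() error {
	query := `start /* gh-ost */ slave sql_thread`
	log.Infof("Starting replication SQL thread")
	if _, err := sqlutils.ExecNoPrepare(this.db, query); err != nil {
		return err
	}
	log.Infof("Replication SQL thread started")
	return nil
}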

go/logic/hooks.go

Lines changed: 5 additions & 0 deletions
@@ -30,6 +30,7 @@ const (
 	onFailure          = "gh-ost-on-failure"
 	onStatus           = "gh-ost-on-status"
 	onStopReplication  = "gh-ost-on-stop-replication"
+	onStartReplication = "gh-ost-on-start-replication"
 )

 type HooksExecutor struct {
@@ -152,3 +153,7 @@ func (this *HooksExecutor) onStatus(statusMessage string) error {
 func (this *HooksExecutor) onStopReplication() error {
 	return this.executeHooks(onStopReplication)
 }
+
+func (this *HooksExecutor) onStartReplication() error {
+	return this.executeHooks(onStartReplication)
+}
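
For context, `executeHooks(onStartReplication)` dispatches whatever hook files are registered for that event. Below is a rough, illustrative sketch of a hook runner of that general shape; the function name, signature, and matching rule are assumptions for illustration, not gh-ost's actual implementation. It runs every executable under a hooks path whose filename starts with the hook's base name, passing context via environment variables:

// Illustrative only; not gh-ost code. Runs each executable in hooksPath whose
// filename begins with baseName (e.g. "gh-ost-on-start-replication"),
// injecting the given environment variables (e.g. GH_OST_DATABASE_NAME).
package hooks

import (
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
)

func executeHooks(hooksPath, baseName string, env map[string]string) error {
	matches, err := filepath.Glob(filepath.Join(hooksPath, baseName+"*"))
	if err != nil {
		return err
	}
	for _, hook := range matches {
		cmd := exec.Command(hook)
		cmd.Env = os.Environ()
		for key, value := range env {
			cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", key, value))
		}
		if output, err := cmd.CombinedOutput(); err != nil {
			return fmt.Errorf("hook %s failed: %v (output: %s)", hook, err, output)
		}
	}
	return nil
}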

go/logic/migrator.go

Lines changed: 39 additions & 7 deletions
@@ -385,8 +385,38 @@ func (this *Migrator) ExecOnFailureHook() (err error) {
 	return this.hooksExecutor.onFailure()
 }

+func (this *Migrator) handleCutOverResult(cutOverError error) (err error) {
+	if this.migrationContext.TestOnReplica {
+		// We're merly testing, we don't want to keep this state. Rollback the renames as possible
+		this.applier.RenameTablesRollback()
+	}
+	if cutOverError == nil {
+		return nil
+	}
+	// Only on error:
+
+	if this.migrationContext.TestOnReplica {
+		// With `--test-on-replica` we stop replication thread, and then proceed to use
+		// the same cut-over phase as the master would use. That means we take locks
+		// and swap the tables.
+		// The difference is that we will later swap the tables back.
+		if err := this.hooksExecutor.onStartReplication(); err != nil {
+			return log.Errore(err)
+		}
+		if this.migrationContext.TestOnReplicaSkipReplicaStop {
+			log.Warningf("--test-on-replica-skip-replica-stop enabled, we are not starting replication.")
+		} else {
+			log.Debugf("testing on replica. Starting replication IO thread after cut-over failure")
+			if err := this.retryOperation(this.applier.StartReplication); err != nil {
+				return log.Errore(err)
+			}
+		}
+	}
+	return nil
+}
+
 // cutOver performs the final step of migration, based on migration
-// type (on replica? bumpy? safe?)
+// type (on replica? atomic? safe?)
 func (this *Migrator) cutOver() (err error) {
 	if this.migrationContext.Noop {
 		log.Debugf("Noop operation; not really swapping tables")
@@ -441,18 +471,18 @@ func (this *Migrator) cutOver() (err error) {
 				return err
 			}
 		}
-		// We're merly testing, we don't want to keep this state. Rollback the renames as possible
-		defer this.applier.RenameTablesRollback()
-		// We further proceed to do the cutover by normal means; the 'defer' above will rollback the swap
 	}
 	if this.migrationContext.CutOverType == base.CutOverAtomic {
 		// Atomic solution: we use low timeout and multiple attempts. But for
 		// each failed attempt, we throttle until replication lag is back to normal
 		err := this.atomicCutOver()
+		this.handleCutOverResult(err)
 		return err
 	}
 	if this.migrationContext.CutOverType == base.CutOverTwoStep {
-		return this.cutOverTwoStep()
+		err := this.cutOverTwoStep()
+		this.handleCutOverResult(err)
+		return err
 	}
 	return log.Fatalf("Unknown cut-over type: %d; should never get here!", this.migrationContext.CutOverType)
 }
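
`handleCutOverResult` retries `StartReplication` through `retryOperation`, which is not shown in this diff. As an illustration only (hypothetical helper with assumed signature, not gh-ost's implementation), a retry wrapper of that general shape looks like:

// Illustrative retry wrapper: attempt an operation up to maxRetries times,
// returning nil on the first success or the last error otherwise.
func retryOperation(operation func() error, maxRetries int) (err error) {
	for i := 0; i < maxRetries; i++ {
		if err = operation(); err == nil {
			return nil
		}
		// On failure one could log, sleep, or throttle here before the next attempt.
	}
	return err
}

Also visible at both call sites above: the return value of handleCutOverResult is deliberately ignored, and the original cut-over error is what propagates to the caller.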
@@ -1064,8 +1094,10 @@ func (this *Migrator) onApplyEventStruct(eventStruct *applyEventStruct) error {

 	availableEvents := len(this.applyEventsQueue)
 	batchSize := int(atomic.LoadInt64(&this.migrationContext.DMLBatchSize))
-	if availableEvents > batchSize {
-		availableEvents = batchSize
+	if availableEvents > batchSize-1 {
+		// The "- 1" is because we already consumed one event: the original event that led to this function getting called.
+		// So, if DMLBatchSize==1 we wish to not process any further events
+		availableEvents = batchSize - 1
 	}
 	for i := 0; i < availableEvents; i++ {
 		additionalStruct := <-this.applyEventsQueue
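
The `batchSize - 1` adjustment accounts for the event that was already dequeued before this code runs. A standalone illustration of draining a queue that way (hypothetical event values, not gh-ost code):

// Drain at most batchSize-1 additional events, since one event is already in hand,
// so the full batch never exceeds batchSize.
package main

import "fmt"

func main() {
	const batchSize = 3
	queue := make(chan int, 10)
	for i := 1; i <= 7; i++ {
		queue <- i
	}

	first := <-queue // the event that triggered processing; already consumed
	batch := []int{first}

	availableEvents := len(queue)
	if availableEvents > batchSize-1 {
		availableEvents = batchSize - 1 // leave room for the already-consumed event
	}
	for i := 0; i < availableEvents; i++ {
		batch = append(batch, <-queue)
	}
	fmt.Println(batch) // [1 2 3]: exactly batchSize events, including the first
}

With DMLBatchSize == 1, availableEvents becomes 0 and no further events are drained, matching the comment in the diff.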

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# Sample hook file for gh-ost-on-start-replication
+# Useful for RDS/Aurora setups, see https://github.com/github/gh-ost/issues/163
+
+echo "$(date) gh-ost-on-start-replication $GH_OST_DATABASE_NAME.$GH_OST_TABLE_NAME $GH_OST_MIGRATED_HOST" >> /tmp/gh-ost.log

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 #!/bin/bash

 # Sample hook file for gh-ost-on-stop-replication
+# Useful for RDS/Aurora setups, see https://github.com/github/gh-ost/issues/163

 echo "$(date) gh-ost-on-stop-replication $GH_OST_DATABASE_NAME.$GH_OST_TABLE_NAME $GH_OST_MIGRATED_HOST" >> /tmp/gh-ost.log
