Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

raise orc-dead-tablet after vtorc reparent #513

Merged
merged 3 commits into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ require (
github.com/kr/text v0.2.0
github.com/mitchellh/mapstructure v1.5.0
github.com/nsf/jsondiff v0.0.0-20210926074059-1e845ec5d249
github.com/slackhq/vitess-addons v0.19.0
github.com/slackhq/vitess-addons v0.19.1
github.com/slok/noglog v0.2.0
github.com/spf13/afero v1.11.0
github.com/spf13/jwalterweatherman v1.1.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -450,8 +450,8 @@ github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6Mwd
github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
github.com/sjmudd/stopwatch v0.1.1 h1:x45OvxFB5OtCkjvYtzRF5fWB857Jzjjk84Oyd5C5ebw=
github.com/sjmudd/stopwatch v0.1.1/go.mod h1:BLw0oIQJ1YLXBO/q9ufK/SgnKBVIkC2qrm6uy78Zw6U=
github.com/slackhq/vitess-addons v0.19.0 h1:+dWkQENsu8YYgsKesOKWqb3+vj66OY1WMvYOn9lmZ+I=
github.com/slackhq/vitess-addons v0.19.0/go.mod h1:E7i+cxyIY+I4An/JAvalQ9Ze2MjKlEx0u2nFXE4fgR0=
github.com/slackhq/vitess-addons v0.19.1 h1:k8f8pAJ2zqtetN+dnehAs7DFcZnI9IQRSL18ZMwNRCw=
github.com/slackhq/vitess-addons v0.19.1/go.mod h1:ZMzBBtadSA1MEuNIfZerztxLMhRFO+tmBZxv5HuV4lE=
github.com/slok/noglog v0.2.0 h1:1czu4l2EoJ8L92UwdSXXa1Y+c5TIjFAFm2P+mjej95E=
github.com/slok/noglog v0.2.0/go.mod h1:TfKxwpEZPT+UA83bQ6RME146k0MM4e8mwHLf6bhcGDI=
github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
Expand Down
18 changes: 17 additions & 1 deletion go/vt/vtorc/logic/topology_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@ import (
"encoding/json"
"fmt"
"math/rand"
"os"
"time"

"github.com/patrickmn/go-cache"
"github.com/slackhq/vitess-addons/go/external"

"vitess.io/vitess/go/stats"
"vitess.io/vitess/go/vt/log"
Expand Down Expand Up @@ -81,6 +83,9 @@ var (

// recoveriesFailureCounter counts the number of failed recoveries that VTOrc has performed
recoveriesFailureCounter = stats.NewCountersWithSingleLabel("FailedRecoveries", "Count of the different failed recoveries performed", "RecoveryType", actionableRecoveriesNames...)

vtopsExec = external.NewExecVTOps(os.Getenv("VTOPS_PATH"), os.Getenv("VTOPS_HTTP_PROXY"), "vtorc", os.Getenv("HOSTNAME"))
vtopsSlackChannel = os.Getenv("SLACK_CHANNEL")
)

// recoveryFunction is the code of the recovery function to be used
Expand Down Expand Up @@ -297,6 +302,7 @@ func postErsCompletion(topologyRecovery *TopologyRecovery, analysisEntry *inst.R
_ = AuditTopologyRecovery(topologyRecovery, message)
_ = inst.AuditOperation(recoveryName, analysisEntry.AnalyzedInstanceAlias, message)
_ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%v: successfully promoted %+v", recoveryName, promotedReplica.InstanceAlias))
vtopsExec.RaiseProblem(analysisEntry.AnalyzedInstanceHostname, "orc-dead-tablet", true)
}
}

Expand Down Expand Up @@ -590,7 +596,6 @@ func runEmergentOperations(analysisEntry *inst.ReplicationAnalysis) {
func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (err error) {
countPendingRecoveries.Add(1)
defer countPendingRecoveries.Add(-1)

checkAndRecoverFunctionCode := getCheckAndRecoverFunctionCode(analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias)
isActionableRecovery := hasActionableRecovery(checkAndRecoverFunctionCode)
analysisEntry.IsActionableRecovery = isActionableRecovery
Expand All @@ -605,8 +610,11 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
}
}

vtopsExec.SendSlackMessage(fmt.Sprintf("[VTOrc] No recovery available for %s for problem %s.", analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis), vtopsSlackChannel, true)

return nil
}

// we have a recovery function; its execution still depends on filters if not disabled.
if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: detection", analysisEntry.AnalyzedInstanceAlias) {
log.Infof("executeCheckAndRecoverFunction: proceeding with %+v detection on %+v; isActionable?: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery)
Expand Down Expand Up @@ -707,15 +715,22 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: recovery", analysisEntry.AnalyzedInstanceAlias) {
log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery)
}

if !isActionableRecovery {
vtopsExec.SendSlackMessage(fmt.Sprintf("No actionable recovery on %s for problem %s.", analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis), vtopsSlackChannel, true)
}

recoveryAttempted, topologyRecovery, err := getCheckAndRecoverFunction(checkAndRecoverFunctionCode)(ctx, analysisEntry)
if !recoveryAttempted {
return err
}
recoveryName := getRecoverFunctionName(checkAndRecoverFunctionCode)
recoveriesCounter.Add(recoveryName, 1)
if err != nil {
vtopsExec.SendSlackMessage(fmt.Sprintf("Recovery failed on %s for problem %s. Error: %s", analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis, err.Error()), vtopsSlackChannel, true)
recoveriesFailureCounter.Add(recoveryName, 1)
} else {
vtopsExec.SendSlackMessage(fmt.Sprintf("Recovery succeeded on %s for problem %s.", analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis), vtopsSlackChannel, true)
recoveriesSuccessfulCounter.Add(recoveryName, 1)
}
if topologyRecovery == nil {
Expand Down Expand Up @@ -813,6 +828,7 @@ func postPrsCompletion(topologyRecovery *TopologyRecovery, analysisEntry *inst.R
_ = AuditTopologyRecovery(topologyRecovery, message)
_ = inst.AuditOperation(string(analysisEntry.Analysis), analysisEntry.AnalyzedInstanceAlias, message)
_ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%+v: successfully promoted %+v", analysisEntry.Analysis, promotedReplica.InstanceAlias))
vtopsExec.RaiseProblem(analysisEntry.AnalyzedInstanceHostname, "orc-dead-tablet", true)
}
}

Expand Down
Loading