Create rolling runner option (#62)
Co-authored-by: Kevin Logan <[email protected]>
KevinLoganBS and Kevin Logan committed Jul 6, 2020
1 parent a93b405 commit cf141fc
Showing 8 changed files with 237 additions and 12 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -28,6 +28,10 @@ If you need to run serial mode against an ASG with an expected `desired_capacity`
./bouncer serial -a hashi-use1-stag-worker-linux:3,hashi-use1-stag-worker-windows:2
```

## Rolling

Rolling is the same as serial, but does not decrement the desired capacity when terminating an instance. This means `min_size`, `max_size`, and `desired_capacity` can all be the same value.
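
A hypothetical invocation (not part of this commit), mirroring the serial example above:

```
./bouncer rolling -a hashi-use1-stag-worker-linux:3
```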

## Canary

Made for bouncing ASGs of arbitrary size where additional nodes scale in before the old nodes have scaled out. `./bouncer canary --help` for all available options. Ex:
7 changes: 2 additions & 5 deletions aws/asg.go
@@ -135,13 +135,10 @@ func (c *Clients) CompleteLifecycleAction(asgName *string, instID *string, lifec
}

// TerminateInstanceInASG calls https://docs.aws.amazon.com/cli/latest/reference/autoscaling/terminate-instance-in-auto-scaling-group.html
-func (c *Clients) TerminateInstanceInASG(instID *string) error {
-// This call decrements the desired capacity so that we don't get into a race condition
-// where the replacement starts booting before the node we've told to terminate has terminated
-dumb := true
+func (c *Clients) TerminateInstanceInASG(instID *string, decrement *bool) error {
input := autoscaling.TerminateInstanceInAutoScalingGroupInput{
InstanceId: instID,
-ShouldDecrementDesiredCapacity: &dumb,
+ShouldDecrementDesiredCapacity: decrement,
}
_, err := c.ASGClient.TerminateInstanceInAutoScalingGroup(&input)
return errors.Wrapf(err, "error terminating instance %s", *instID)
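The comment removed above explains why the old code always decremented desired capacity: it avoided a race where a replacement boots before the node being terminated has actually gone away. With the new `decrement` parameter that choice moves to the caller: the serial, canary, and full runners below pass `true`, while the new rolling runner passes `false` so the ASG immediately replaces the terminated node. A minimal caller sketch, assuming the `aws` import path and the `Clients` type shown in this diff (the helper itself is hypothetical):

```go
// Sketch only, not part of this commit: driving the new decrement parameter.
package example

import (
	"github.com/palantir/bouncer/aws"
	"github.com/pkg/errors"
)

// terminateOld decrements desired_capacity alongside the terminate call
// (serial/canary/full behaviour) unless keepCapacity is set, in which case
// the ASG boots a replacement immediately (rolling behaviour).
func terminateOld(clients *aws.Clients, instanceID *string, keepCapacity bool) error {
	decrement := !keepCapacity
	err := clients.TerminateInstanceInASG(instanceID, &decrement)
	return errors.Wrap(err, "error terminating instance")
}
```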
8 changes: 4 additions & 4 deletions bouncer/runner.go
@@ -153,7 +153,7 @@ func (r *BaseRunner) abandonLifecycle(inst *Instance, hook *string) error {

// KillInstance calls TerminateInstanceInAutoscalingGroup, or, if the instance is stuck
// in a lifecycle hook, issues an ABANDON to it, killing it more forcefully
-func (r *BaseRunner) KillInstance(inst *Instance) error {
+func (r *BaseRunner) KillInstance(inst *Instance, decrement *bool) error {
log.WithFields(log.Fields{
"ASG": *inst.AutoscalingGroup.AutoScalingGroupName,
"InstanceID": *inst.ASGInstance.InstanceId,
@@ -179,18 +179,18 @@ func (r *BaseRunner) KillInstance(inst *Instance) error {
return errors.Wrap(err, "error executing pre-terminate command")
}
}
-err := r.terminateInstanceInASG(inst)
+err := r.terminateInstanceInASG(inst, decrement)
return errors.Wrap(err, "error terminating instance")
}

-func (r *BaseRunner) terminateInstanceInASG(inst *Instance) error {
+func (r *BaseRunner) terminateInstanceInASG(inst *Instance, decrement *bool) error {
log.WithFields(log.Fields{
"ASG": *inst.AutoscalingGroup.AutoScalingGroupName,
"InstanceID": *inst.ASGInstance.InstanceId,
}).Info("Terminating instance")
r.resetTimeout()
r.noopCheck()
-return r.awsClients.TerminateInstanceInASG(inst.ASGInstance.InstanceId)
+return r.awsClients.TerminateInstanceInASG(inst.ASGInstance.InstanceId, decrement)
}

// SetDesiredCapacity Updates desired capacity of ASG
3 changes: 2 additions & 1 deletion canary/runner.go
@@ -153,8 +153,9 @@ func (r *Runner) Run() error {
// We have the correct number of new instances, we just need
// to get rid of the old ones
// Let's issue all their terminates right here
+decrement := true
for _, oldInst := range asgSet.GetOldInstances() {
-err := r.KillInstance(oldInst)
+err := r.KillInstance(oldInst, &decrement)
if err != nil {
return errors.Wrap(err, "error killing instance")
}
108 changes: 108 additions & 0 deletions cmd/rolling.go
@@ -0,0 +1,108 @@
// Copyright 2017 Palantir Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
log "github.com/Sirupsen/logrus"
"github.com/palantir/bouncer/bouncer"
"github.com/palantir/bouncer/rolling"
"github.com/pkg/errors"
"github.com/spf13/cobra"
"github.com/spf13/viper"
)

var rollingCmd = &cobra.Command{
Use: "rolling",
Short: "Run bouncer in rolling",
Long: `Run bouncer in rolling mode, where we bounce one node at a time from the list of ASGs.`,
Run: func(cmd *cobra.Command, args []string) {
log.SetLevel(logLevelFromViper())

log.Debug("rolling called")
if log.GetLevel() == log.DebugLevel {
cmd.DebugFlags()
viper.Debug()
}

asgsString := viper.GetString("rolling.asgs")
if asgsString == "" {
log.Fatal("You must specify ASGs to cycle nodes from (in a comma-delimited list)")
}

commandString := viper.GetString("rolling.command")
noop := viper.GetBool("rolling.noop")
force := viper.GetBool("rolling.force")
termHook := viper.GetString("terminate-hook")
pendHook := viper.GetString("pending-hook")
timeout := timeoutFromViper()

log.Debugf("Binding vars, got %+v %+v %+v %+v", asgsString, noop, version, commandString)

log.Info("Beginning bouncer rolling run")

var defCap int64
defCap = 1
opts := bouncer.RunnerOpts{
Noop: noop,
Force: force,
AsgString: asgsString,
CommandString: commandString,
DefaultCapacity: &defCap,
TerminateHook: termHook,
PendingHook: pendHook,
ItemTimeout: timeout,
}

r, err := rolling.NewRunner(&opts)
if err != nil {
log.Fatal(errors.Wrap(err, "error initializing runner"))
}

r.MustValidatePrereqs()

err = r.Run()
if err != nil {
log.Fatal(errors.Wrap(err, "error in run"))
}
},
}

func init() {
RootCmd.AddCommand(rollingCmd)

rollingCmd.Flags().BoolP("noop", "n", false, "Run this in noop mode, and only print what you would do")
err := viper.BindPFlag("rolling.noop", rollingCmd.Flags().Lookup("noop"))
if err != nil {
log.Fatal(errors.Wrap(err, "Binding PFlag 'noop' to viper var 'rolling.noop' failed: %s"))
}

rollingCmd.Flags().StringP("asgs", "a", "", "ASGs to check for nodes to cycle in")
err = viper.BindPFlag("rolling.asgs", rollingCmd.Flags().Lookup("asgs"))
if err != nil {
log.Fatal(errors.Wrap(err, "Binding PFlag 'asgs' to viper var 'rolling.asgs' failed: %s"))
}

rollingCmd.Flags().StringP("preterminatecall", "p", "", "External command to run before host is removed from its ELB & terminate process begins")
err = viper.BindPFlag("rolling.command", rollingCmd.Flags().Lookup("preterminatecall"))
if err != nil {
log.Fatal(errors.Wrap(err, "Binding PFlag 'command' to viper var 'rolling.command' failed: %s"))
}

rollingCmd.Flags().BoolP("force", "f", false, "Force all nodes to be recycled, even if they're running the latest launch config")
err = viper.BindPFlag("rolling.force", rollingCmd.Flags().Lookup("force"))
if err != nil {
log.Fatal(errors.Wrap(err, "Binding PFlag 'force' to viper var 'rolling.force' failed: %s"))
}
}
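
A hypothetical dry run exercising these flags (ASG name, capacity, and script path are illustrative):

```
./bouncer rolling -n -f -a my-asg:3 -p "/usr/local/bin/pre-terminate.sh"
```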
3 changes: 2 additions & 1 deletion full/runner.go
@@ -125,7 +125,8 @@ start:
set := asgSetWrapper(asg)

if set.IsOldInstance() {
-err := r.KillInstance(set.GetBestOldInstance())
+decrement := true
+err := r.KillInstance(set.GetBestOldInstance(), &decrement)
if err != nil {
return errors.Wrap(err, "failed to kill instance")
}
113 changes: 113 additions & 0 deletions rolling/runner.go
@@ -0,0 +1,113 @@
// Copyright 2017 Palantir Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package rolling

import (
"os"

log "github.com/Sirupsen/logrus"
"github.com/palantir/bouncer/bouncer"
"github.com/pkg/errors"
)

// Runner holds data for a particular rolling run
type Runner struct {
bouncer.BaseRunner
}

// NewRunner instantiates a new rolling runner
func NewRunner(opts *bouncer.RunnerOpts) (*Runner, error) {
br, err := bouncer.NewBaseRunner(opts)
if err != nil {
return nil, errors.Wrap(err, "error getting base runner")
}

r := Runner{
*br,
}
return &r, nil
}

func (r *Runner) killBestOldInstance(asgSet *bouncer.ASGSet) error {
bestOld := asgSet.GetBestOldInstance()
decrement := false
err := r.KillInstance(bestOld, &decrement)
return errors.Wrap(err, "error killing instance")
}

// MustValidatePrereqs checks that the batch runner is safe to proceed
func (r *Runner) MustValidatePrereqs() {
asgSet, err := r.NewASGSet()
if err != nil {
log.Fatal(errors.Wrap(err, "error building ASGSet"))
}

divergedASGs := asgSet.GetDivergedASGs()
if len(divergedASGs) != 0 {
for _, badASG := range divergedASGs {
log.WithFields(log.Fields{
"ASG": *badASG.ASG.AutoScalingGroupName,
"desired_capacity actual": *badASG.ASG.DesiredCapacity,
"desired_capacity given": badASG.DesiredASG.DesiredCapacity,
}).Error("ASG desired capacity doesn't match expected starting value")
}
os.Exit(1)
}

for _, asg := range asgSet.ASGs {
if *asg.ASG.DesiredCapacity == 0 {
log.WithFields(log.Fields{
"ASG": *asg.ASG.AutoScalingGroupName,
}).Warn("ASG desired capacity is 0 - nothing to do here")
os.Exit(0)
}
}
}

// Run has the meat of the batch job
func (r *Runner) Run() error {
for {
if r.TimedOut() {
return errors.Errorf("timeout exceeded, something is probably wrong with rollout")
}

// Rebuild the state of the world every iteration of the loop because instance and ASG statuses are changing
log.Debug("Beginning new rolling run check")
asgSet, err := r.NewASGSet()
if err != nil {
return errors.Wrap(err, "error building ASGSet")
}

// See if we're still waiting on a change we made previously to finish or settle
if asgSet.IsNewUnhealthy() || asgSet.IsTerminating() || asgSet.IsImmutableAutoscalingEvent() || asgSet.IsCountMismatch() {
r.Sleep()
continue
}

// If there are any old instances which are now ready to be terminated, let's do it
if asgSet.IsOldInstance() {
err = r.killBestOldInstance(asgSet)
if err != nil {
return errors.Wrap(err, "error finding or killing best old instance")
}

r.Sleep()
continue
}

log.Info("Didn't find any old instances or ASGs - we're done here!")
return nil
}
}
3 changes: 2 additions & 1 deletion serial/runner.go
@@ -42,7 +42,8 @@ func NewRunner(opts *bouncer.RunnerOpts) (*Runner, error) {

func (r *Runner) killBestOldInstance(asgSet *bouncer.ASGSet) error {
bestOld := asgSet.GetBestOldInstance()
-err := r.KillInstance(bestOld)
+decrement := true
+err := r.KillInstance(bestOld, &decrement)
return errors.Wrap(err, "error killing instance")
}
