diff --git a/cluster.go b/cluster.go index e8f84a2f..3e9fb0df 100644 --- a/cluster.go +++ b/cluster.go @@ -321,14 +321,11 @@ const DefaultRepairHealthCheckCommandTimeoutSeconds = 30 const DefaultRepairCommandTimeoutSeconds = 30 type Retire struct { - ShutdownCommand []string `json:"shutdown_command"` - CheckCommand []string `json:"check_command"` - CommandTimeoutSeconds *int `json:"command_timeout_seconds,omitempty"` - CheckTimeoutSeconds *int `json:"check_timeout_seconds,omitempty"` + OptionalCommand []string `json:"optional_command,omitempty"` + OptionalCommandTimeoutSeconds *int `json:"optional_command_timeout_seconds,omitempty"` } -const DefaultRetireCommandTimeoutSeconds = 30 -const DefaultRetireCheckTimeoutSeconds = 300 +const DefaultRetireOptionalCommandTimeoutSeconds = 30 // Options is a set of optional parameters for k8s components. type Options struct { diff --git a/mtest/cke-cluster.yml b/mtest/cke-cluster.yml index b7058361..ddd10b8e 100644 --- a/mtest/cke-cluster.yml +++ b/mtest/cke-cluster.yml @@ -30,10 +30,8 @@ repair: watch_seconds: 30 health_check_command: ["sh", "-c", "test -f /tmp/mtest-repair-$1 && echo true", "health_check"] retire: - shutdown_command: ["true"] - check_command: ["bash", "-c", "echo 'Off'"] - command_timeout_seconds: 30 - check_timeout_seconds: 300 + optional_command: ["true"] + optional_command_timeout_seconds: 30 options: kube-api: extra_binds: diff --git a/op/kube_node_remove.go b/op/kube_node_remove.go index 7a8798f7..7db7e81e 100644 --- a/op/kube_node_remove.go +++ b/op/kube_node_remove.go @@ -7,7 +7,6 @@ import ( "time" "github.com/cybozu-go/cke" - "github.com/cybozu-go/log" "github.com/cybozu-go/well" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -41,10 +40,8 @@ func (o *kubeNodeRemove) NextCommand() cke.Commander { return nodeRemoveCommand{ o.apiserver, o.nodes, - o.config.ShutdownCommand, - o.config.CheckCommand, - o.config.CommandTimeoutSeconds, - o.config.CheckTimeoutSeconds, + o.config.OptionalCommand, + o.config.OptionalCommandTimeoutSeconds, } } @@ -55,12 +52,10 @@ func (o *kubeNodeRemove) Targets() []string { } type nodeRemoveCommand struct { - apiserver *cke.Node - nodes []*corev1.Node - shutdownCommand []string - checkCommand []string - timeoutSeconds *int - checkTimeoutSeconds *int + apiserver *cke.Node + nodes []*corev1.Node + optionalCommand []string + optionalCommandTimeoutSeconds *int } func (c nodeRemoveCommand) Run(ctx context.Context, inf cke.Infrastructure, _ string) error { @@ -92,66 +87,25 @@ func (c nodeRemoveCommand) Run(ctx context.Context, inf cke.Infrastructure, _ st return fmt.Errorf("failed to patch node %s: %v", n.Name, err) } } - err := func() error { - ctx := ctx - timeout := cke.DefaultRetireCommandTimeoutSeconds - if c.timeoutSeconds != nil { - timeout = *c.timeoutSeconds - } - if timeout != 0 { - var cancel context.CancelFunc - ctx, cancel = context.WithTimeout(ctx, time.Second*time.Duration(timeout)) - defer cancel() - } - args := append(c.shutdownCommand[1:], n.Name) - command := well.CommandContext(ctx, c.shutdownCommand[0], args...) - return command.Run() - }() - if err != nil { - return fmt.Errorf("failed to shutdown node %s: %v", n.Name, err) - } - - err = func() error { - ctx := ctx - checkTimeout := cke.DefaultRetireCheckTimeoutSeconds - if c.checkTimeoutSeconds != nil { - checkTimeout = *c.checkTimeoutSeconds - } - timeout := time.After(time.Duration(checkTimeout) * time.Second) - ticker := time.NewTicker(10 * time.Second) - for { - select { - case <-timeout: - return fmt.Errorf("timeout") - case <-ticker.C: - args := append(c.checkCommand[1:], n.Name) - command := well.CommandContext(ctx, c.checkCommand[0], args...) - stdout, err := command.Output() - if err != nil { - log.Warn("failed to check shutdown status of node", map[string]interface{}{ - log.FnError: err, - "node": n.Name, - }) - continue - } - if strings.TrimSuffix(string(stdout), "\n") == "Off" { - return nil - } + if len(c.optionalCommand) != 0 { + err := func() error { + ctx := ctx + timeout := cke.DefaultRetireOptionalCommandTimeoutSeconds + if c.optionalCommandTimeoutSeconds != nil { + timeout = *c.optionalCommandTimeoutSeconds } + if timeout != 0 { + var cancel context.CancelFunc + ctx, cancel = context.WithTimeout(ctx, time.Second*time.Duration(timeout)) + defer cancel() + } + args := append(c.optionalCommand[1:], n.Name) + command := well.CommandContext(ctx, c.optionalCommand[0], args...) + return command.Run() + }() + if err != nil { + return fmt.Errorf("failed to execute optional command in retirement %s: %v", n.Name, err) } - }() - if err != nil { - return fmt.Errorf("failed to check shutdown status of node %s: %v", n.Name, err) - } - shutdownTaint := corev1.Taint{ - Key: "node.kubernetes.io/out-of-service", - Value: "nodeshutdown", - Effect: corev1.TaintEffectNoExecute, - } - n.Spec.Taints = append(n.Spec.Taints, shutdownTaint) - _, err = nodesAPI.Update(ctx, n, metav1.UpdateOptions{}) - if err != nil { - return fmt.Errorf("failed to update node %s: %v", n.Name, err) } err = nodesAPI.Delete(ctx, n.Name, metav1.DeleteOptions{})