Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions api/machine/machine.proto
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ message RebootRequest {
// Mode selects how the reboot is performed.
enum Mode {
// DEFAULT is the regular reboot flow.
DEFAULT = 0;
// POWERCYCLE skips kexec and reboots with a power cycle.
POWERCYCLE = 1;
// FORCE skips the graceful teardown before rebooting.
FORCE = 2;
}
Mode mode = 1;
}
Expand Down
3 changes: 2 additions & 1 deletion cmd/talosctl/cmd/talos/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ var eventsCmd = &cobra.Command{

event, err := client.UnmarshalEvent(ev)
if err != nil {
if errors.Is(err, client.ErrEventNotSupported) {
var errBadEvent client.EventNotSupportedError
if errors.As(err, &errBadEvent) {
return nil
}

Expand Down
4 changes: 3 additions & 1 deletion cmd/talosctl/cmd/talos/reboot.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ var rebootCmd = &cobra.Command{
// skips kexec and reboots with power cycle
case "powercycle":
opts = append(opts, client.WithPowerCycle)
case "force":
opts = append(opts, client.WithForce)
case "default":
default:
return fmt.Errorf("invalid reboot mode: %q", rebootCmdFlags.mode)
Expand Down Expand Up @@ -85,7 +87,7 @@ func rebootGetActorID(opts ...client.RebootMode) func(ctx context.Context, c *cl
}

// init wires up the reboot command: it registers the --mode/-m flag
// (default "default"), calls addTrackActionFlags on rebootCmdFlags for
// rebootCmd, and passes the command to addCommand.
func init() {
rebootCmd.Flags().StringVarP(&rebootCmdFlags.mode, "mode", "m", "default", "select the reboot mode: \"default\", \"powercycle\" (skips kexec)")
rebootCmd.Flags().StringVarP(&rebootCmdFlags.mode, "mode", "m", "default", "select the reboot mode: \"default\", \"powercycle\" (skips kexec), \"force\" (skips graceful teardown)")
rebootCmdFlags.addTrackActionFlags(rebootCmd)
addCommand(rebootCmd)
}
4 changes: 4 additions & 0 deletions cmd/talosctl/pkg/talos/action/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,10 @@ func (a *nodeTracker) handleEvent(event client.Event) error {
Status: reporter.StatusRunning,
})

if msg.GetSequence() == "reboot" {
return retry.ExpectedErrorf("reboot sequence completed")
}

if errStr != "" {
return fmt.Errorf("sequence error: %s", msg.GetError().GetMessage())
}
Expand Down
9 changes: 9 additions & 0 deletions hack/release.toml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,15 @@ Additionally `talosctl image cache-create` has some changes:
* multiple instances (`--platform=linux/amd64 --platform=linux/arm64`);
"""

[notes.force-reboot]
title = "Talos force reboot"
description = """\
Talos now supports a "force" reboot mode, which allows skipping the graceful userland termination.
It can be used in situations where a userland service (e.g. the kubelet) gets stuck during graceful shutdown, causing the regular reboot flow to fail.

In addition, `talosctl` was updated to support this feature via `talosctl reboot --mode force`.
"""

[notes.kernel-module]
title = "Kernel Module"
description = """\
Expand Down
2 changes: 1 addition & 1 deletion internal/app/machined/pkg/runtime/sequencer.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ type Sequencer interface {
Boot(Runtime) []Phase
Initialize(Runtime) []Phase
Install(Runtime) []Phase
Reboot(Runtime) []Phase
Reboot(Runtime, *machine.RebootRequest) []Phase
Reset(Runtime, ResetOptions) []Phase
Shutdown(Runtime, *machine.ShutdownRequest) []Phase
StageUpgrade(Runtime, *machine.UpgradeRequest) []Phase
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,14 @@ func (c *Controller) phases(seq runtime.Sequence, data any) ([]runtime.Phase, er

phases = c.s.Shutdown(c.r, in)
case runtime.SequenceReboot:
phases = c.s.Reboot(c.r)
var in *machine.RebootRequest
if req, ok := data.(*machine.RebootRequest); ok {
in = req
} else {
log.Printf("warning: API reboot missing reboot request")
}

phases = c.s.Reboot(c.r, in)
case runtime.SequenceUpgrade:
in, ok := data.(*machine.UpgradeRequest)
if !ok {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ func (m *mockSequencer) Install(r runtime.Runtime) []runtime.Phase {
return m.phases[runtime.SequenceInstall]
}

// Reboot returns the phases stored under runtime.SequenceReboot; its
// arguments are not used.
func (m *mockSequencer) Reboot(r runtime.Runtime) []runtime.Phase {
func (m *mockSequencer) Reboot(r runtime.Runtime, _ *machine.RebootRequest) []runtime.Phase {
return m.phases[runtime.SequenceReboot]
}

Expand Down
27 changes: 16 additions & 11 deletions internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,18 +233,23 @@ func (*Sequencer) Boot(r runtime.Runtime) []runtime.Phase {
}

// Reboot is the reboot sequence.
//
// The sequence always ends with the "reboot" phase. The teardown phases
// ("cleanup", "dbus" and the stop-all phase list) are included unless the
// request asks for FORCE mode.
func (*Sequencer) Reboot(r runtime.Runtime) []runtime.Phase {
phases := PhaseList{}.Append(
"cleanup",
StopAllPods,
).Append(
"dbus",
StopDBus,
).
AppendList(stopAllPhaselist(r, true)).
Append("reboot", Reboot)
func (*Sequencer) Reboot(r runtime.Runtime, in *machineapi.RebootRequest) []runtime.Phase {
phases := PhaseList{}

return phases
// Every mode except FORCE stops pods, D-Bus and the remaining services before rebooting.
// NOTE(review): `in` may be nil when the caller has no reboot request; this
// presumably relies on the generated GetMode getter being nil-safe and
// returning DEFAULT — confirm.
if in.GetMode() != machineapi.RebootRequest_FORCE {
phases = phases.
Append(
"cleanup",
StopAllPods,
).
Append(
"dbus",
StopDBus,
).
AppendList(stopAllPhaselist(r, true))
}

// The reboot phase itself runs in every mode.
return phases.Append("reboot", Reboot)
}

// Reset is the reset sequence.
Expand Down
2 changes: 2 additions & 0 deletions internal/integration/api/network-config.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,8 @@ func (suite *NetworkConfigSuite) TestLinkAliasConfig() {

// TestVirtualIPConfig tests configuring virtual IPs.
func (suite *NetworkConfigSuite) TestVirtualIPConfig() {
suite.T().Skip("[TODO]: this test causes kube-apiserver to restart causing random failure")

if suite.Cluster == nil || suite.Cluster.Provisioner() != base.ProvisionerQEMU {
suite.T().Skip("skipping if cluster is not qemu")
}
Expand Down
67 changes: 67 additions & 0 deletions internal/integration/api/reboot.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@ import (
"context"
"fmt"
"sync"
"sync/atomic"
"testing"
"time"

"github.com/siderolabs/go-retry/retry"

"github.com/siderolabs/talos/internal/integration/base"
machineapi "github.com/siderolabs/talos/pkg/machinery/api/machine"
"github.com/siderolabs/talos/pkg/machinery/client"
"github.com/siderolabs/talos/pkg/machinery/config/machine"
)
Expand Down Expand Up @@ -71,6 +73,71 @@ func (suite *RebootSuite) TestRebootNodeByNode() {
}
}

// TestForcedReboot force-reboots cluster node by node,
// ensuring that the 'cleanup' phase/'stopAllPods' task doesn't run.
//
// For each node the test subscribes to the node's event stream, issues a
// reboot with client.WithForce, waits for the node to come back and then
// fails if a "cleanup" phase event or a "stopAllPods" task event was observed
// in the meantime.
func (suite *RebootSuite) TestForcedReboot() { //nolint:gocyclo
	if !suite.Capabilities().SupportsReboot {
		suite.T().Skip("cluster doesn't support reboots")
	}

	nodes := suite.DiscoverNodeInternalIPs(suite.ctx)
	suite.Require().NotEmpty(nodes)

	for _, node := range nodes {
		// Each node is handled in its own function so the deferred cancel
		// runs at the end of the iteration, including when an assertion
		// aborts the test.
		func() {
			suite.T().Log("force rebooting node", node)

			nodeCtx := client.WithNode(suite.ctx, node)

			var (
				sawStopAllPods  atomic.Bool
				sawCleanupPhase atomic.Bool
			)

			// watch events so we can verify graceful teardown did not happen
			watchCtx, watchCancel := context.WithCancel(nodeCtx)
			// Release the watch context (and with it the watcher goroutine
			// below) on every exit path; calling cancel twice is harmless.
			defer watchCancel()

			eventsCh := make(chan client.EventResult)
			suite.Require().NoError(suite.Client.EventsWatchV2(watchCtx, eventsCh))

			go func() {
				for {
					select {
					case <-watchCtx.Done():
						return
					case ev := <-eventsCh:
						// Watch errors are skipped here; only successfully
						// decoded events are inspected.
						if ev.Error != nil {
							continue
						}

						switch msg := ev.Event.Payload.(type) {
						case *machineapi.TaskEvent:
							if msg.GetTask() == "stopAllPods" {
								sawStopAllPods.Store(true)
							}
						case *machineapi.PhaseEvent:
							if msg.GetPhase() == "cleanup" {
								sawCleanupPhase.Store(true)
							}
						}
					}
				}
			}()

			suite.AssertRebooted(
				suite.ctx, node, func(rebootCtx context.Context) error {
					return base.IgnoreGRPCUnavailable(suite.Client.Reboot(rebootCtx, client.WithForce))
				}, 10*time.Minute,
				suite.CleanupFailedPods,
			)

			// Stop watching before evaluating what was observed.
			watchCancel()

			suite.Require().Falsef(sawCleanupPhase.Load(), "cleanup phase must not run during forced reboot")
			suite.Require().Falsef(sawStopAllPods.Load(), "stopAllPods task must not run during forced reboot")
		}()
	}

	suite.WaitForBootDone(suite.ctx)
}

// TestRebootMultiple reboots a node, then issues subsequent reboots:
// a reboot should cancel the boot sequence, and cancel another reboot.
func (suite *RebootSuite) TestRebootMultiple() {
Expand Down
10 changes: 7 additions & 3 deletions pkg/machinery/api/machine/machine.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions pkg/machinery/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,11 @@ func WithPowerCycle(req *machineapi.RebootRequest) {
req.Mode = machineapi.RebootRequest_POWERCYCLE
}

// WithForce is a Reboot option which sets the request mode to FORCE,
// so the reboot skips the graceful teardown.
func WithForce(req *machineapi.RebootRequest) {
req.Mode = machineapi.RebootRequest_FORCE
}

// Reboot implements the proto.MachineServiceClient interface.
func (c *Client) Reboot(ctx context.Context, opts ...RebootMode) error {
_, err := c.RebootWithResponse(ctx, opts...)
Expand Down
16 changes: 13 additions & 3 deletions pkg/machinery/client/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,15 @@ import (
"github.com/siderolabs/talos/pkg/machinery/proto"
)

// ErrEventNotSupported is returned from the event decoder when we encounter an unknown event.
var ErrEventNotSupported = errors.New("event is not supported")
// EventNotSupportedError is the error the event decoder reports for an event
// type it does not know how to decode.
type EventNotSupportedError struct {
	// TypeURL is the type URL of the event that could not be decoded.
	TypeURL string
}

// Error implements the error interface; the message includes the offending
// type URL.
func (e EventNotSupportedError) Error() string {
	message := fmt.Sprintf("event is not supported: %s", e.TypeURL)

	return message
}

// EventsOptionFunc defines the options for the Events API.
type EventsOptionFunc func(opts *machineapi.EventsRequest)
Expand Down Expand Up @@ -239,6 +246,7 @@ func UnmarshalEvent(event *machineapi.Event) (*Event, error) {
&machineapi.ConfigValidationErrorEvent{},
&machineapi.AddressEvent{},
&machineapi.MachineStatusEvent{},
&machineapi.RestartEvent{},
} {
if typeURL == "talos/runtime/"+string(eventType.ProtoReflect().Descriptor().FullName()) {
msg = eventType
Expand All @@ -249,7 +257,9 @@ func UnmarshalEvent(event *machineapi.Event) (*Event, error) {

if msg == nil {
// We haven't implemented the handling of this event yet.
return nil, ErrEventNotSupported
return nil, EventNotSupportedError{
TypeURL: typeURL,
}
}

if err := proto.Unmarshal(event.GetData().GetValue(), msg); err != nil {
Expand Down
1 change: 1 addition & 0 deletions website/content/v1.12/reference/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -4252,6 +4252,7 @@ File type.
| ---- | ------ | ----------- |
| DEFAULT | 0 | |
| POWERCYCLE | 1 | |
| FORCE | 2 | |



Expand Down
2 changes: 1 addition & 1 deletion website/content/v1.12/reference/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -2721,7 +2721,7 @@ talosctl reboot [flags]
--debug debug operation from kernel logs. --wait is set to true when this flag is set
-e, --endpoints strings override default endpoints in Talos configuration
-h, --help help for reboot
-m, --mode string select the reboot mode: "default", "powercycle" (skips kexec) (default "default")
-m, --mode string select the reboot mode: "default", "powercycle" (skips kexec), "force" (skips graceful teardown) (default "default")
-n, --nodes strings target the specified nodes
--siderov1-keys-dir string The path to the SideroV1 auth PGP keys directory. Defaults to 'SIDEROV1_KEYS_DIR' env variable if set, otherwise '$HOME/.talos/keys'. Only valid for Contexts that use SideroV1 auth.
--talosconfig string The path to the Talos configuration file. Defaults to 'TALOSCONFIG' env variable if set, otherwise '$HOME/.talos/config' and '/var/run/secrets/talos.dev/config' in order.
Expand Down
Loading