Skip to content

Commit

Permalink
Implement OS Version reconcile (#86)
Browse files Browse the repository at this point in the history
* Implement OS Version reconcile

Signed-off-by: Andrea Mazzotti <[email protected]>
  • Loading branch information
anmazzotti authored Sep 26, 2024
1 parent 007de86 commit b3c1be9
Show file tree
Hide file tree
Showing 34 changed files with 1,271 additions and 201 deletions.
12 changes: 7 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@ CONTROLLER_TOOLS_VERSION ?= v0.16.1
CAPI_VERSION?=$(shell grep "sigs.k8s.io/cluster-api" go.mod | awk '{print $$NF}')
# Dev Image building
KUBEADM_READY_OS ?= ""
ELEMENTAL_TOOLKIT_IMAGE ?= ghcr.io/rancher/elemental-toolkit/elemental-cli:nightly
ELEMENTAL_TOOLKIT_IMAGE ?= ghcr.io/rancher/elemental-toolkit/elemental-cli:v2.2.0
ELEMENTAL_AGENT_IMAGE ?= ghcr.io/rancher-sandbox/cluster-api-provider-elemental/agent:latest
ELEMENTAL_OS_IMAGE?=docker.io/local/elemental-capi-os:dev
ELEMENTAL_ISO_IMAGE?=docker.io/local/elemental-capi-iso:dev

# Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set)
ifeq (,$(shell go env GOBIN))
Expand Down Expand Up @@ -266,16 +268,16 @@ endif
--build-arg "KUBEADM_READY=${KUBEADM_READY_OS}" \
--build-arg "ELEMENTAL_TOOLKIT=${ELEMENTAL_TOOLKIT_IMAGE}" \
--build-arg "ELEMENTAL_AGENT=${ELEMENTAL_AGENT_IMAGE}" \
-t elemental-os:dev -f Dockerfile.os .
-t ${ELEMENTAL_OS_IMAGE} -f Dockerfile.os .

.PHONY: build-iso
build-iso: build-os
$(CONTAINER_TOOL) build \
--build-arg ELEMENTAL_OS_IMAGE=docker.io/library/elemental-os:dev \
-t docker.io/library/elemental-iso:dev \
--build-arg ELEMENTAL_OS_IMAGE=${ELEMENTAL_OS_IMAGE} \
-t ${ELEMENTAL_ISO_IMAGE} \
-f Dockerfile.iso .
$(CONTAINER_TOOL) run -v ./iso:/iso \
--entrypoint cp docker.io/library/elemental-iso:dev \
--entrypoint cp ${ELEMENTAL_ISO_IMAGE} \
-r /elemental-iso/. /iso

.PHONY: update-test-capi-crds
Expand Down
18 changes: 18 additions & 0 deletions api/v1beta1/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ const (
LabelElementalHostBootstrapped = "elementalhost.infrastructure.cluster.x-k8s.io/bootstrapped"
LabelElementalHostNeedsReset = "elementalhost.infrastructure.cluster.x-k8s.io/needs-reset"
LabelElementalHostReset = "elementalhost.infrastructure.cluster.x-k8s.io/reset"
LabelElementalHostInPlaceUpdate = "elementalhost.infrastructure.cluster.x-k8s.io/in-place-update"
InPlaceUpdatePending = "pending"
InPlaceUpdateDone = "done"
)

// HostPhases.
Expand All @@ -48,6 +51,7 @@ const (
PhaseRunning = HostPhase("Running")
PhaseTriggeringReset = HostPhase("Triggering Reset")
PhaseResetting = HostPhase("Resetting")
PhaseOSVersionReconcile = HostPhase("Reconciling OS Version")
)

// Conditions.
Expand Down Expand Up @@ -88,6 +92,20 @@ const (
WaitingForResetReasonSeverity clusterv1.ConditionSeverity = clusterv1.ConditionSeverityInfo
// ResetFailedReason indicates that the Host reset failed.
ResetFailedReason = "ResetFailed"

// OSVersionReady describes the Host OS version reconciliation phase.
OSVersionReady clusterv1.ConditionType = "OSVersionReady"
// WaitingOSReconcileReason indicates that the Host OS version needs to be reconciled.
WaitingOSReconcileReason = "WaitingForOSReconcile"
WaitingOSReconcileReasonSeverity clusterv1.ConditionSeverity = clusterv1.ConditionSeverityInfo
// InPlaceUpdateNotPendingReason indicates that the Host OS version needs to be reconciled, but no in-place-update is pending.
InPlaceUpdateNotPendingReason = "InPlaceUpdateNotPending"
InPlaceUpdateNotPendingReasonSeverity clusterv1.ConditionSeverity = clusterv1.ConditionSeverityWarning
// OSVersionReconciliationFailedReason indicates that the attempted Host OS version reconciliation failed.
OSVersionReconciliationFailedReason = "OSVersionReconciliationFailed"
// WaitingForPostReconcileRebootReason indicates that the Host OS version was applied and the Host is going to reboot.
WaitingForPostReconcileRebootReason = "WaitingForPostReconcileReboot"
WaitingForPostReconcileRebootReasonSeverity clusterv1.ConditionSeverity = clusterv1.ConditionSeverityInfo
)

// ElementalMachine Conditions and Reasons.
Expand Down
8 changes: 8 additions & 0 deletions api/v1beta1/elementalhost_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package v1beta1
import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

Expand All @@ -35,6 +36,13 @@ type ElementalHostSpec struct {
// PubKey is the host public key to verify when authenticating
// Elemental API requests for this host.
PubKey string `json:"pubKey,omitempty"`
// OSVersionManagement defines the OS Version and options to be reconciled
// on the host. The supported schema depends on the OSPlugin in use by
// the elemental-agent.
// +optional
// +kubebuilder:validation:Schemaless
// +kubebuilder:validation:XPreserveUnknownFields
OSVersionManagement map[string]runtime.RawExtension `json:"osVersionManagement,omitempty" yaml:"osVersionManagement,omitempty"`
}

// ElementalHostStatus defines the observed state of ElementalHost.
Expand Down
10 changes: 10 additions & 0 deletions api/v1beta1/elementalmachine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package v1beta1
import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

Expand All @@ -37,6 +38,15 @@ type ElementalMachineSpec struct {
// using this host.
// +optional
HostRef *corev1.ObjectReference `json:"hostRef,omitempty"`

// OSVersionManagement defines the OS Version and options to be reconciled
// on the host. The supported schema depends on the OSPlugin in use by
// the elemental-agent. Whenever an ElementalHost is associated to this
// ElementalMachine, the OSVersionManagement will be applied to it.
// +optional
// +kubebuilder:validation:Schemaless
// +kubebuilder:validation:XPreserveUnknownFields
OSVersionManagement map[string]runtime.RawExtension `json:"osVersionManagement,omitempty" yaml:"osVersionManagement,omitempty"`
}

// ElementalMachineStatus defines the observed state of ElementalMachine.
Expand Down
14 changes: 14 additions & 0 deletions api/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions cmd/agent/common.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package agent

import (
infrastructurev1 "github.com/rancher-sandbox/cluster-api-provider-elemental/api/v1beta1"
"github.com/rancher-sandbox/cluster-api-provider-elemental/internal/agent/log"
"github.com/rancher-sandbox/cluster-api-provider-elemental/pkg/agent/osplugin"
)

// handlePost carries out the post action requested after a phase completes.
//
// PowerOff takes precedence over Reboot when both are set. A failure of the
// plugin call is only logged: the function still reports true, because a
// power state change was requested and the caller is expected to stop.
//
// It returns true when either PowerOff or Reboot was requested, signalling
// that the program should exit, and false when no post action is set.
func handlePost(osPlugin osplugin.Plugin, post infrastructurev1.PostAction) bool {
	switch {
	case post.PowerOff:
		log.Info("Powering off system")
		if powerOffErr := osPlugin.PowerOff(); powerOffErr != nil {
			log.Error(powerOffErr, "Powering off system")
		}
		return true
	case post.Reboot:
		log.Info("Rebooting system")
		if rebootErr := osPlugin.Reboot(); rebootErr != nil {
			log.Error(rebootErr, "Rebooting system")
		}
		return true
	default:
		// No post action requested: the caller keeps running.
		return false
	}
}
38 changes: 18 additions & 20 deletions cmd/agent/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (
"github.com/rancher-sandbox/cluster-api-provider-elemental/internal/agent/log"
"github.com/rancher-sandbox/cluster-api-provider-elemental/internal/agent/phase"
"github.com/rancher-sandbox/cluster-api-provider-elemental/internal/api"
"github.com/rancher-sandbox/cluster-api-provider-elemental/pkg/agent/osplugin"
"github.com/spf13/cobra"
)

Expand Down Expand Up @@ -60,6 +59,24 @@ This command will reconcile the remote ElementalHost resource describing this ho
return
}

// Handle Upgrade
needsInplaceUpdate := host.InPlaceUpgrade == infrastructurev1.InPlaceUpdatePending
if !host.Bootstrapped || needsInplaceUpdate {
log.Info("Reconciling OS Version")
osVersionHandler := phase.NewOSVersionHandler(*agentContext)
post, err := osVersionHandler.Reconcile(host.OSVersionManagement, needsInplaceUpdate)
if err != nil {
log.Error(err, "handling OS reconciliation")
log.Debugf("Waiting %s...", agentContext.Config.Agent.Reconciliation.String())
time.Sleep(agentContext.Config.Agent.Reconciliation)
continue
}
if handlePost(agentContext.Plugin, post) {
// Exit the program if we are rebooting or powering off to apply the OS Version
return
}
}

// Handle bootstrap if needed
if host.BootstrapReady && !host.Bootstrapped {
log.Info("Handling bootstrap application")
Expand All @@ -86,22 +103,3 @@ This command will reconcile the remote ElementalHost resource describing this ho
func init() {
rootCmd.AddCommand(runCmd)
}

// handlePost handles post conditions such as Reboot or PowerOff.
// PowerOff is checked first, so it wins when both flags are set.
// A true flag is returned if any of the conditions is true, to highlight the program should exit.
// Errors returned by the plugin are logged only; true is still returned in that case.
func handlePost(osPlugin osplugin.Plugin, post infrastructurev1.PostAction) bool {
if post.PowerOff {
log.Info("Powering off system")
if err := osPlugin.PowerOff(); err != nil {
log.Error(err, "Powering off system")
}
// A power off was requested: tell the caller to exit even if the call failed.
return true
} else if post.Reboot {
log.Info("Rebooting system")
if err := osPlugin.Reboot(); err != nil {
log.Error(err, "Rebooting system")
}
// A reboot was requested: tell the caller to exit even if the call failed.
return true
}
// No post action requested.
return false
}
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,12 @@ spec:
type: string
type: object
x-kubernetes-map-type: atomic
osVersionManagement:
description: |-
OSVersionManagement defines the OS Version and options to be reconciled
on the host. The supported schema depends on the OSPlugin in use by
the elemental-agent.
x-kubernetes-preserve-unknown-fields: true
pubKey:
description: |-
PubKey is the host public key to verify when authenticating
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,13 @@ spec:
type: string
type: object
x-kubernetes-map-type: atomic
osVersionManagement:
description: |-
OSVersionManagement defines the OS Version and options to be reconciled
on the host. The supported schema depends on the OSPlugin in use by
the elemental-agent. Whenever an ElementalHost is associated to this
ElementalMachine, the OSVersionManagement will be applied to it.
x-kubernetes-preserve-unknown-fields: true
providerID:
description: |-
ProviderID references the associated ElementalHost.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,13 @@ spec:
type: string
type: object
x-kubernetes-map-type: atomic
osVersionManagement:
description: |-
OSVersionManagement defines the OS Version and options to be reconciled
on the host. The supported schema depends on the OSPlugin in use by
the elemental-agent. Whenever an ElementalHost is associated to this
ElementalMachine, the OSVersionManagement will be applied to it.
x-kubernetes-preserve-unknown-fields: true
providerID:
description: |-
ProviderID references the associated ElementalHost.
Expand Down
10 changes: 10 additions & 0 deletions doc/HOST_PHASES.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,13 @@ If the `OSPlugin` resets the host successfully, the remote `ElementalHost` is up

It is expected to re-start the lifecycle of the host at this point if desired.
This means running `elemental-agent register --install` to perform a new registration and a fresh installation of the system.

### Reconciling OS Version

The `Reconciling OS Version` phase happens during the [Running](#running) phase, if a new OS Version has to be reconciled **and** the host needs to reboot to apply it.

Note that if the `ElementalHost` does not need to reboot to reconcile an OS Version, then this phase will not be shown and the `ElementalHost` last applied OS Version will be considered reconciled already.

The `OSPlugin` in use determines whether the host needs a reboot or not, for example to run a new kernel, or to boot from an updated partition.

For more information, you can read the related [documentation](./OS_VERSION_RECONCILE.md).
105 changes: 105 additions & 0 deletions doc/OS_VERSION_RECONCILE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# OS Version Reconcile

The `ElementalHost` API supports user defined "OS Version" schemas.
The `ElementalHost.spec.osVersionManagement` can be populated with arbitrary information that will be passed to the `elemental-agent` running on the host system, in order to reconcile a desired OS Version state.

This information can be anything, for example a list of packages to refresh, a set of commands to run, an OCI image to upgrade to. It depends on the [OS Plugin](./ELEMENTAL_AGENT.md#plugins) in use.

## Upgrading a single host

The `ElementalHost.spec.osVersionManagement` can be configured directly, for example to apply a certain version to a single host.

Hosts that are not yet bootstrapped will try to reconcile the version on the next `elemental-agent` reconcile loop.

For example, using the [Elemental plugin](./PLUGIN_ELEMENTAL.md):

```bash
kubectl patch elementalhost my-elemental-host -p '{"spec":{"osVersionManagement":{"osVersion":{"imageUri":"oci://my-registry/my-image:v1.2.3"}}}}' --type=merge
```

```yaml
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: ElementalHost
metadata:
labels:
elementalhost.infrastructure.cluster.x-k8s.io/installed: "true"
name: my-elemental-host
namespace: default
spec:
osVersionManagement:
osVersion:
imageUri: oci://my-registry/my-image:v1.2.3
upgradeRecovery: false
debug: false
```
If the host will need a reboot due to upgrades (signaled by the OS Plugin in use), then the `ElementalHost` will enter the `Reconciling OS Version` phase. Otherwise the reconcile will be considered successful.

The `Ready` condition can be used to determine whether the Host has been upgraded successfully:

```bash
kubectl wait --for=condition=ready elementalhost my-elemental-host
```

### In-place updates

While the [In-place updates proposal](https://github.com/kubernetes-sigs/cluster-api/pull/11029) is not yet finalized, the Elemental provider offers a rudimentary way of updating `ElementalHosts` that are already bootstrapped and part of a cluster.

Be aware that this requires you to [safely drain the node](https://kubernetes.io/docs/tasks/administer-cluster/safely-drain-node/) before proceeding.

The `elementalhost.infrastructure.cluster.x-k8s.io/in-place-update` label with a value of `pending` can be used to tell the `elemental-agent` that the OS Version has to be reconciled, even if the host is already bootstrapped.

```bash
kubectl label elementalhost my-to-be-updated-host elementalhost.infrastructure.cluster.x-k8s.io/in-place-update=pending
```

Since the `ElementalHost` is already associated to an `ElementalMachine`, the OS Version is reconciled from the latter, therefore the `ElementalMachine` needs to be patched instead with the desired version:

```bash
kubectl patch elementalmachine my-elemental-machine -p '{"spec":{"osVersionManagement":{"osVersion":{"imageUri":"oci://my-registry/my-image:v1.2.3"}}}}' --type=merge
```

Once the update is successful, the label will automatically mutate to `done`.

## Upgrade bootstrapped hosts with machine rollouts

Elemental supports upgrading hosts during [machine rollouts](https://cluster-api.sigs.k8s.io/tasks/upgrading-clusters).

The desired OS Version can be defined on the `ElementalMachineTemplate` prior to triggering a rollout to replace nodes.
For example:

```yaml
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: ElementalMachineTemplate
metadata:
name: kubeadm-md-0
namespace: default
spec:
template:
spec:
osVersionManagement:
osVersion:
imageUri: oci://my-registry/my-image:v1.2.3
upgradeRecovery: false
debug: false
```

Upon triggering a rollout, new `Machines` will be created to replace the old ones, and during `ElementalMachine` to `ElementalHost` association, the `osVersionManagement` field will be forwarded to the `ElementalHost` to be applied **before** bootstrapping.

Also note that Elemental hosts will undergo a reset when a `Machine` is deleted. Normally this requires you to have at least one spare `ElementalHost` to begin the rollout with, or to downscale your node pool by 1 to reset one `ElementalHost` first.

```bash
kubectl patch machinedeployment kubeadm-md-0 -p '{"spec":{"replicas":1}}' --type=merge
```

One way to start a rollout is to trigger it directly:

```bash
clusterctl alpha rollout restart machinedeployment/kubeadm-md-0
```

The nodes can be upscaled again to the desired amount after the rollout is finished:

```bash
kubectl patch machinedeployment kubeadm-md-0 -p '{"spec":{"replicas":2}}' --type=merge
```
Loading

0 comments on commit b3c1be9

Please sign in to comment.