Skip to content
74 changes: 74 additions & 0 deletions cmd/api/api/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,80 @@ func (s *ApiService) RestoreInstance(ctx context.Context, request oapi.RestoreIn
return oapi.RestoreInstance200JSONResponse(instanceToOAPI(*result)), nil
}

// PromoteInstanceToTemplate promotes a standby instance into a fork-only template.
// The id parameter can be an instance ID, name, or ID prefix.
// Note: Resolution is handled by ResolveResource middleware.
Comment thread
sjmiller609 marked this conversation as resolved.
func (s *ApiService) PromoteInstanceToTemplate(ctx context.Context, request oapi.PromoteInstanceToTemplateRequestObject) (oapi.PromoteInstanceToTemplateResponseObject, error) {
	// The target instance is read from the request context; a nil value
	// means nothing was resolved for this request, reported as a 500.
	resolved := mw.GetResolvedInstance[instances.Instance](ctx)
	if resolved == nil {
		return oapi.PromoteInstanceToTemplate500JSONResponse{
			Code:    "internal_error",
			Message: "resource not resolved",
		}, nil
	}
	lg := logger.FromContext(ctx)

	promoted, err := s.InstanceManager.PromoteToTemplate(ctx, resolved.Id)
	if err == nil {
		return oapi.PromoteInstanceToTemplate200JSONResponse(instanceToOAPI(*promoted)), nil
	}

	// Translate manager errors into API responses: known sentinels map to
	// 404/409, anything else is logged and reported as a generic 500.
	if errors.Is(err, instances.ErrNotFound) {
		return oapi.PromoteInstanceToTemplate404JSONResponse{
			Code:    "not_found",
			Message: "instance not found",
		}, nil
	}
	if errors.Is(err, instances.ErrInvalidState) {
		// The manager's message is passed through to the caller verbatim.
		return oapi.PromoteInstanceToTemplate409JSONResponse{
			Code:    "invalid_state",
			Message: err.Error(),
		}, nil
	}

	lg.ErrorContext(ctx, "failed to promote instance to template", "error", err)
	return oapi.PromoteInstanceToTemplate500JSONResponse{
		Code:    "internal_error",
		Message: "failed to promote instance to template",
	}, nil
}

// DemoteInstanceTemplate handles the API request to demote a template back
// to standby by delegating to InstanceManager.DemoteTemplate.
// The id parameter can be an instance ID, name, or ID prefix.
// Note: Resolution is handled by ResolveResource middleware.
//
// Responses:
//   - 200 with the demoted instance on success
//   - 404 when the manager reports instances.ErrNotFound
//   - 409 when the manager reports instances.ErrInvalidState (the manager's
//     error text is returned as the message)
//   - 500 when no instance was resolved or on any other manager error
func (s *ApiService) DemoteInstanceTemplate(ctx context.Context, request oapi.DemoteInstanceTemplateRequestObject) (oapi.DemoteInstanceTemplateResponseObject, error) {
	resolved := mw.GetResolvedInstance[instances.Instance](ctx)
	if resolved == nil {
		return oapi.DemoteInstanceTemplate500JSONResponse{
			Code:    "internal_error",
			Message: "resource not resolved",
		}, nil
	}
	lg := logger.FromContext(ctx)

	demoted, err := s.InstanceManager.DemoteTemplate(ctx, resolved.Id)
	if err == nil {
		return oapi.DemoteInstanceTemplate200JSONResponse(instanceToOAPI(*demoted)), nil
	}

	if errors.Is(err, instances.ErrNotFound) {
		return oapi.DemoteInstanceTemplate404JSONResponse{
			Code:    "not_found",
			Message: "instance not found",
		}, nil
	}
	if errors.Is(err, instances.ErrInvalidState) {
		return oapi.DemoteInstanceTemplate409JSONResponse{
			Code:    "invalid_state",
			Message: err.Error(),
		}, nil
	}

	// Unclassified failure: log the detail, return a generic message.
	lg.ErrorContext(ctx, "failed to demote template", "error", err)
	return oapi.DemoteInstanceTemplate500JSONResponse{
		Code:    "internal_error",
		Message: "failed to demote template",
	}, nil
}

// ForkInstance forks an instance from stopped, standby, or template state (or from running when from_running is set) into a new instance.
// The id parameter can be an instance ID, name, or ID prefix.
// Note: Resolution is handled by ResolveResource middleware.
Expand Down
8 changes: 8 additions & 0 deletions lib/builds/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,14 @@ func (m *mockInstanceManager) RestoreInstance(ctx context.Context, id string) (*
return nil, nil
}

// PromoteToTemplate is a test stub: it ignores its arguments and returns a
// nil instance with a nil error.
func (m *mockInstanceManager) PromoteToTemplate(ctx context.Context, id string) (*instances.Instance, error) {
return nil, nil
}

// DemoteTemplate is a test stub: it ignores its arguments and returns a nil
// instance with a nil error.
func (m *mockInstanceManager) DemoteTemplate(ctx context.Context, id string) (*instances.Instance, error) {
return nil, nil
}

// RestoreSnapshot is a test stub: it ignores its arguments and always
// returns a nil instance together with instances.ErrNotSupported.
func (m *mockInstanceManager) RestoreSnapshot(ctx context.Context, id string, snapshotID string, req instances.RestoreSnapshotRequest) (*instances.Instance, error) {
return nil, instances.ErrNotSupported
}
Expand Down
13 changes: 13 additions & 0 deletions lib/instances/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Manages VM instance lifecycle across multiple hypervisors (Cloud Hypervisor, QEM
- `Paused` - VM paused (CH native)
- `Shutdown` - VM shutdown, VMM exists (CH native)
- `Standby` - No VMM, snapshot exists (can restore)
- `Template` - Standby snapshot promoted to a fork-only parent; must be demoted back to Standby before it can be restored, and demotion is refused while live forks exist

### Why Config Disk? (configdisk.go)

Expand Down Expand Up @@ -54,6 +55,8 @@ Manages VM instance lifecycle across multiple hypervisors (Cloud Hypervisor, QEM

`metadata.json` also carries controller-owned auto-standby runtime timestamps when that feature is enabled, so idle countdown state can survive Hypeman restarts.

`metadata.json` also stores `IsTemplate`, `ForkOfTemplate`, and `HotPagesPath`. `HotPagesPath` is reserved for the UFFD prefetch path and is cleared on demote.

**Benefits:**
- Content-addressable IDs (ULID = time-ordered)
- Self-contained: all instance data in one directory
Expand Down Expand Up @@ -100,6 +103,16 @@ Any State → Stopped
2. Delete all instance data
```

**PromoteToTemplate / DemoteTemplate:**
```
Standby → Template (promote)
Template → Standby (demote; refused while live forks exist)
```
- Promotion is an explicit caller step; forking a Standby does not auto-promote.
- A Template cannot be restored or deleted directly. RestoreInstance on a Template returns an error — callers must DemoteTemplate first.
- Live forks are counted at read time by scanning `ForkOfTemplate` across instances; both Demote and Delete refuse while that count is > 0.
- Forks of a Template are plain instances (Standby by default). They record `ForkOfTemplate = parent.Id` but do not inherit `IsTemplate` or `HotPagesPath`.

## Snapshot Optimization (standby.go, restore.go)

**Reduce snapshot size:**
Expand Down
10 changes: 10 additions & 0 deletions lib/instances/delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,16 @@ func (m *manager) deleteInstance(
stored := &meta.StoredMetadata
log.DebugContext(ctx, "loaded instance", "instance_id", id, "state", inst.State)

if inst.State == StateTemplate {
forks, err := m.countTemplateForks(id)
if err != nil {
return fmt.Errorf("count forks of template %s: %w", id, err)
}
if forks > 0 {
return fmt.Errorf("%w: cannot delete template %s with %d live fork(s); delete forks first", ErrInvalidState, id, forks)
}
}
Comment thread
sjmiller609 marked this conversation as resolved.

target, err := m.cancelAndWaitCompressionJob(ctx, m.snapshotJobKeyForInstance(id))
if err != nil {
return fmt.Errorf("wait for instance compression to stop: %w", err)
Expand Down
43 changes: 33 additions & 10 deletions lib/instances/fork.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ func (m *manager) forkInstance(ctx context.Context, id string, req ForkInstanceR
return nil, "", fmt.Errorf("standby source instance: %w", err)
}

// Running fork is a one-shot clone that restores the source afterward.
// Promotion is now an explicit caller step, so the running flow simply
// doesn't promote — there's no skip flag to thread anymore.
forked, forkErr := m.forkInstanceFromStoppedOrStandby(ctx, id, req, true)
if forkErr == nil {
if err := m.rotateSourceVsockForRestore(ctx, id, forked.Id); err != nil {
Expand Down Expand Up @@ -104,14 +107,14 @@ func (m *manager) forkInstance(ctx context.Context, id string, req ForkInstanceR
return nil, "", forkErr
}
return forked, targetState, nil
case StateStopped, StateStandby:
case StateStopped, StateStandby, StateTemplate:
forked, err := m.forkInstanceFromStoppedOrStandby(ctx, id, req, false)
if err != nil {
return nil, "", err
}
return forked, targetState, nil
default:
return nil, "", fmt.Errorf("%w: cannot fork from state %s (must be Stopped or Standby, or Running with from_running=true)", ErrInvalidState, source.State)
return nil, "", fmt.Errorf("%w: cannot fork from state %s (must be Stopped, Standby, or Template, or Running with from_running=true)", ErrInvalidState, source.State)
}
}

Expand Down Expand Up @@ -205,10 +208,10 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
stored := &meta.StoredMetadata

switch source.State {
case StateStopped, StateStandby:
case StateStopped, StateStandby, StateTemplate:
// allowed
default:
return nil, fmt.Errorf("%w: cannot fork from state %s (must be Stopped or Standby)", ErrInvalidState, source.State)
return nil, fmt.Errorf("%w: cannot fork from state %s (must be Stopped, Standby, or Template)", ErrInvalidState, source.State)
}

if !supportValidated {
Expand Down Expand Up @@ -250,7 +253,9 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
})
defer cu.Clean()

if source.State == StateStandby {
fromSnapshot := source.State == StateStandby || source.State == StateTemplate

if fromSnapshot {
if err := m.ensureSnapshotMemoryReady(ctx, m.paths.InstanceSnapshotLatest(id), m.snapshotJobKeyForInstance(id), stored.HypervisorType); err != nil {
return nil, fmt.Errorf("prepare standby snapshot for fork: %w", err)
}
Expand Down Expand Up @@ -286,17 +291,22 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
// phase (Standby for snapshot forks, Stopped for stopped forks) will be
// recorded by the appropriate operation when the fork is acted on.
forkMeta.Phases.Reset()
switch source.State {
case StateStandby:
if fromSnapshot {
forkMeta.Phases.Record(phasetracking.PhaseStandby, now)
case StateStopped:
} else {
forkMeta.Phases.Record(phasetracking.PhaseStopped, now)
}

// Template-only fields don't carry forward to the fork; the fork is a fresh
// instance regardless of whether the parent is a template.
forkMeta.IsTemplate = false
forkMeta.HotPagesPath = ""
forkMeta.ForkOfTemplate = ""
Comment thread
sjmiller609 marked this conversation as resolved.
Comment thread
sjmiller609 marked this conversation as resolved.

// Keep the original CID for snapshot-based forks.
// Rewriting CID in restored memory snapshots is not reliable across
// hypervisors.
if source.State == StateStandby {
if fromSnapshot {
forkMeta.VsockCID = stored.VsockCID
} else {
forkMeta.VsockCID = generateVsockCID(forkID)
Expand All @@ -309,7 +319,7 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
forkMeta.MAC = ""
}

if source.State == StateStandby {
if fromSnapshot {
snapshotConfigPath := m.paths.InstanceSnapshotConfig(forkID)
netCfg := (*hypervisor.ForkNetworkConfig)(nil)
if forkMeta.NetworkEnabled {
Expand All @@ -331,6 +341,15 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
}
}

// If the source is already a Template, record the parent linkage so it
// can be counted as a live fork. Live forks are counted at read time by
// scanning ForkOfTemplate across all instances. Plain Standby forks
// don't get this linkage — promotion is an explicit lifecycle step the
// caller must perform via PromoteToTemplate.
if fromSnapshot && stored.IsTemplate {
forkMeta.ForkOfTemplate = stored.Id
}

newMeta := &metadata{StoredMetadata: forkMeta}
if err := m.saveMetadata(newMeta); err != nil {
return nil, fmt.Errorf("save fork metadata: %w", err)
Expand Down Expand Up @@ -384,6 +403,10 @@ func resolveForkTargetState(requested State, sourceState State) (State, error) {
switch sourceState {
case StateRunning, StateStandby, StateStopped:
return sourceState, nil
case StateTemplate:
// Forks of a template are plain Standby instances; the fork itself
// is never a template.
return StateStandby, nil
default:
return "", fmt.Errorf("%w: cannot derive fork target state from source state %s", ErrInvalidState, sourceState)
}
Expand Down
73 changes: 72 additions & 1 deletion lib/instances/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ type Manager interface {
ForkSnapshot(ctx context.Context, snapshotID string, req ForkSnapshotRequest) (*Instance, error)
StandbyInstance(ctx context.Context, id string, req StandbyInstanceRequest) (*Instance, error)
RestoreInstance(ctx context.Context, id string) (*Instance, error)
// PromoteToTemplate marks a Standby instance as a fork-only Template.
// Requires state == Standby. Idempotent if already a Template.
PromoteToTemplate(ctx context.Context, id string) (*Instance, error)
// DemoteTemplate flips a Template back to Standby so it can be woken or
// deleted. Requires no live forks (instances with ForkOfTemplate == id).
DemoteTemplate(ctx context.Context, id string) (*Instance, error)
RestoreSnapshot(ctx context.Context, id string, snapshotID string, req RestoreSnapshotRequest) (*Instance, error)
StopInstance(ctx context.Context, id string) (*Instance, error)
StartInstance(ctx context.Context, id string, req StartInstanceRequest) (*Instance, error)
Expand Down Expand Up @@ -435,7 +441,9 @@ func (m *manager) StandbyInstance(ctx context.Context, id string, req StandbyIns
return inst, err
}

// RestoreInstance restores an instance from standby
// RestoreInstance restores an instance from standby. Templates must be
// demoted via DemoteTemplate first; this method does not auto-demote so
// that the lifecycle remains explicit.
func (m *manager) RestoreInstance(ctx context.Context, id string) (*Instance, error) {
lock := m.getInstanceLock(id)
lock.Lock()
Expand All @@ -454,6 +462,69 @@ func (m *manager) RestoreInstance(ctx context.Context, id string) (*Instance, er
return inst, err
}

// PromoteToTemplate marks a Standby instance as a fork-only Template.
//
// The lock returned by getInstanceLock(id) is held for the whole call. The
// instance's state is derived from its loaded metadata:
//   - Template: returned as-is without saving (idempotent re-promote).
//   - Standby: IsTemplate is set, the metadata is saved, and the refreshed
//     instance is returned.
//   - anything else: an error wrapping ErrInvalidState.
//
// Errors from loadMetadata are returned unchanged; save failures are wrapped
// with context.
func (m *manager) PromoteToTemplate(ctx context.Context, id string) (*Instance, error) {
	mu := m.getInstanceLock(id)
	mu.Lock()
	defer mu.Unlock()

	stored, err := m.loadMetadata(id)
	if err != nil {
		return nil, err
	}

	current := m.toInstance(ctx, stored)
	switch current.State {
	case StateTemplate:
		// Already promoted; nothing to persist.
		return &current, nil
	case StateStandby:
		// The only legal source state; continue below.
	default:
		return nil, fmt.Errorf("%w: cannot promote instance in state %s to template (must be Standby)", ErrInvalidState, current.State)
	}

	stored.IsTemplate = true
	if saveErr := m.saveMetadata(stored); saveErr != nil {
		return nil, fmt.Errorf("save metadata after template promote: %w", saveErr)
	}

	// Re-derive the instance so the returned value reflects the saved flag.
	result := m.toInstance(ctx, stored)
	return &result, nil
}
Comment thread
sjmiller609 marked this conversation as resolved.

// DemoteTemplate flips a Template back to Standby so it can be woken or
// deleted.
//
// The lock returned by getInstanceLock(id) is held for the whole call. The
// instance's state is derived from its loaded metadata:
//   - Standby: returned as-is without saving (already demoted).
//   - Template: demoted, provided countTemplateForks(id) reports zero forks
//     and the Template → Standby transition is permitted. IsTemplate and
//     HotPagesPath are cleared and the metadata is saved.
//   - anything else: an error wrapping ErrInvalidState.
//
// A non-zero fork count is also reported as an error wrapping
// ErrInvalidState. Errors from loadMetadata and CanTransitionTo are returned
// unchanged; fork-count and save failures are wrapped with context.
func (m *manager) DemoteTemplate(ctx context.Context, id string) (*Instance, error) {
	mu := m.getInstanceLock(id)
	mu.Lock()
	defer mu.Unlock()

	stored, err := m.loadMetadata(id)
	if err != nil {
		return nil, err
	}

	current := m.toInstance(ctx, stored)
	switch current.State {
	case StateStandby:
		// Nothing to demote; nothing to persist.
		return &current, nil
	case StateTemplate:
		// The only state that can be demoted; continue below.
	default:
		return nil, fmt.Errorf("%w: cannot demote instance in state %s (must be Template)", ErrInvalidState, current.State)
	}

	liveForks, err := m.countTemplateForks(id)
	if err != nil {
		return nil, fmt.Errorf("count forks of template %s: %w", id, err)
	}
	if liveForks > 0 {
		return nil, fmt.Errorf("%w: cannot demote template %s with %d live fork(s); delete forks first", ErrInvalidState, id, liveForks)
	}

	if transitionErr := StateTemplate.CanTransitionTo(StateStandby); transitionErr != nil {
		return nil, transitionErr
	}

	// Clear both template-only fields before persisting.
	stored.IsTemplate = false
	stored.HotPagesPath = ""
	if saveErr := m.saveMetadata(stored); saveErr != nil {
		return nil, fmt.Errorf("save metadata after template demote: %w", saveErr)
	}

	result := m.toInstance(ctx, stored)
	return &result, nil
}

func (m *manager) RestoreSnapshot(ctx context.Context, id string, snapshotID string, req RestoreSnapshotRequest) (*Instance, error) {
lock := m.getInstanceLock(id)
lock.Lock()
Expand Down
5 changes: 5 additions & 0 deletions lib/instances/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1497,12 +1497,17 @@ func TestStateTransitions(t *testing.T) {
{"Standby to Paused", StateStandby, StatePaused, false},
{"Shutdown to Stopped", StateShutdown, StateStopped, false},
{"Standby to Stopped", StateStandby, StateStopped, false},
{"Standby to Template", StateStandby, StateTemplate, false},
{"Template to Standby", StateTemplate, StateStandby, false},
{"Template to Stopped", StateTemplate, StateStopped, false},
// Invalid transitions
{"Running to Standby", StateRunning, StateStandby, true},
{"Stopped to Running", StateStopped, StateRunning, true},
{"Stopped to Initializing", StateStopped, StateInitializing, true},
{"Standby to Running", StateStandby, StateRunning, true},
{"Initializing to Paused", StateInitializing, StatePaused, true},
{"Template to Running", StateTemplate, StateRunning, true},
{"Template to Paused", StateTemplate, StatePaused, true},
}

for _, tt := range tests {
Expand Down
Loading
Loading