Skip to content

Commit c8952f7

Browse files
manninglucas and gvisor-bot
authored and committed
Add support for executing a binary before saving and after restoring.
PiperOrigin-RevId: 764936811
1 parent 90f75e2 commit c8952f7

File tree

16 files changed

+493
-59
lines changed

16 files changed

+493
-59
lines changed

pkg/sentry/control/BUILD

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ package(
66
)
77

88
proto_library(
9-
name = "control",
9+
name = "control_api",
1010
srcs = ["control.proto"],
1111
visibility = ["//visibility:public"],
1212
deps = [
@@ -27,14 +27,16 @@ go_library(
2727
"pprof.go",
2828
"proc.go",
2929
"state.go",
30+
"state_impl.go",
3031
"usage.go",
3132
],
3233
visibility = [
3334
"//:sandbox",
3435
],
3536
deps = [
36-
":control_go_proto",
37+
":control_api_go_proto",
3738
"//pkg/abi/linux",
39+
"//pkg/cleanup",
3840
"//pkg/context",
3941
"//pkg/eventchannel",
4042
"//pkg/fd",
@@ -43,8 +45,12 @@ go_library(
4345
"//pkg/metric",
4446
"//pkg/metric:metric_go_proto",
4547
"//pkg/prometheus",
48+
"//pkg/sentry/devices/memdev",
49+
"//pkg/sentry/devices/nvproxy",
50+
"//pkg/sentry/fdcollector",
4651
"//pkg/sentry/fdimport",
4752
"//pkg/sentry/fsimpl/host",
53+
"//pkg/sentry/fsimpl/pipefs",
4854
"//pkg/sentry/fsimpl/user",
4955
"//pkg/sentry/fsmetric",
5056
"//pkg/sentry/kernel",
@@ -59,6 +65,7 @@ go_library(
5965
"//pkg/sentry/watchdog",
6066
"//pkg/sync",
6167
"//pkg/tcpip/link/sniffer",
68+
"//pkg/timing",
6269
"//pkg/urpc",
6370
"//pkg/usermem",
6471
"@org_golang_google_protobuf//types/known/timestamppb",

pkg/sentry/control/lifecycle.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ import (
2424
"gvisor.dev/gvisor/pkg/eventchannel"
2525
"gvisor.dev/gvisor/pkg/fd"
2626
"gvisor.dev/gvisor/pkg/log"
27-
pb "gvisor.dev/gvisor/pkg/sentry/control/control_go_proto"
27+
pb "gvisor.dev/gvisor/pkg/sentry/control/control_api_go_proto"
2828
"gvisor.dev/gvisor/pkg/sentry/fdimport"
2929
"gvisor.dev/gvisor/pkg/sentry/fsimpl/user"
3030
"gvisor.dev/gvisor/pkg/sentry/kernel"

pkg/sentry/control/state.go

Lines changed: 232 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,41 @@ package control
1717
import (
1818
"errors"
1919
"fmt"
20+
"strings"
21+
"time"
2022

23+
"gvisor.dev/gvisor/pkg/abi/linux"
24+
"gvisor.dev/gvisor/pkg/cleanup"
25+
"gvisor.dev/gvisor/pkg/log"
26+
"gvisor.dev/gvisor/pkg/sentry/fdcollector"
27+
"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
2128
"gvisor.dev/gvisor/pkg/sentry/kernel"
29+
"gvisor.dev/gvisor/pkg/sentry/limits"
2230
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
2331
"gvisor.dev/gvisor/pkg/sentry/state"
32+
"gvisor.dev/gvisor/pkg/sentry/vfs"
2433
"gvisor.dev/gvisor/pkg/sentry/watchdog"
34+
"gvisor.dev/gvisor/pkg/timing"
2535
"gvisor.dev/gvisor/pkg/urpc"
2636
)
2737

38+
// SaveRestoreExecMode identifies the phase for which the save/restore binary
// is being invoked. Its string value is what gets exported to the binary via
// the saveRestoreExecEnvVar environment variable.
type SaveRestoreExecMode string

// DefaultSaveRestoreExecTimeout is the default amount of time to wait for the
// save/restore binary to finish.
const DefaultSaveRestoreExecTimeout = 10 * time.Minute

// Modes in which the save/restore binary can be executed.
const (
	// SaveRestoreExecSave is used when the binary runs before a save.
	SaveRestoreExecSave SaveRestoreExecMode = "save"
	// SaveRestoreExecRestore is used when the binary runs after a restore.
	SaveRestoreExecRestore SaveRestoreExecMode = "restore"
	// SaveRestoreExecResume is used when the binary runs after a resume.
	SaveRestoreExecResume SaveRestoreExecMode = "resume"
)

// saveRestoreExecEnvVar is the name of the environment variable through which
// the current SaveRestoreExecMode is passed to the save/restore binary.
const saveRestoreExecEnvVar = "GVISOR_SAVE_RESTORE_AUTO_EXEC_MODE"
54+
2855
// ErrInvalidFiles is returned when the urpc call to Save does not include an
2956
// appropriate file payload (e.g. there is no output file!).
3057
var ErrInvalidFiles = errors.New("exactly one file must be provided")
@@ -59,6 +86,18 @@ type SaveOpts struct {
5986
// Resume indicates if the sandbox process should continue running
6087
// after checkpointing.
6188
Resume bool
89+
90+
// SaveRestoreExecArgv is the argv of the save/restore binary split by spaces.
91+
// The first element is the path to the binary.
92+
SaveRestoreExecArgv string
93+
94+
// SaveRestoreExecTimeout is the timeout for waiting for the save/restore
95+
// binary.
96+
SaveRestoreExecTimeout time.Duration
97+
98+
// SaveRestoreExecContainerID is the ID of the container that the
99+
// save/restore binary executes in.
100+
SaveRestoreExecContainerID string
62101
}
63102

64103
// Save saves the running system.
@@ -97,5 +136,197 @@ func (s *State) Save(o *SaveOpts, _ *struct{}) error {
97136
}
98137
defer saveOpts.PagesFile.Close()
99138
}
100-
return saveOpts.Save(s.Kernel.SupervisorContext(), s.Kernel, s.Watchdog)
139+
if err := PreSave(s.Kernel, o); err != nil {
140+
return err
141+
}
142+
if err := saveOpts.Save(s.Kernel.SupervisorContext(), s.Kernel, s.Watchdog); err != nil {
143+
return err
144+
}
145+
if o.Resume {
146+
err = PostResume(s.Kernel, nil)
147+
}
148+
return err
149+
}
150+
151+
// PreSave is called before saving the kernel.
152+
func PreSave(k *kernel.Kernel, o *SaveOpts) error {
153+
if o.SaveRestoreExecArgv != "" {
154+
saveRestoreExecArgv := strings.Split(o.SaveRestoreExecArgv, " ")
155+
if err := ConfigureSaveRestoreExec(k, saveRestoreExecArgv, o.SaveRestoreExecTimeout, o.SaveRestoreExecContainerID); err != nil {
156+
return fmt.Errorf("failed to configure save/restore binary: %w", err)
157+
}
158+
if err := SaveRestoreExec(k, SaveRestoreExecSave); err != nil {
159+
return fmt.Errorf("failed to exec save/restore binary: %w", err)
160+
}
161+
}
162+
return preSaveImpl(k, o)
163+
}
164+
165+
// PostResume is called after resuming the kernel.
166+
//
167+
// Precondition: The kernel should be running.
168+
func PostResume(k *kernel.Kernel, timeline *timing.Timeline) error {
169+
if k.IsPaused() {
170+
// The kernel is still paused (double-pause can happen with Docker which
171+
// calls pause first and then checkpoint command). The final resume command
172+
// will invoke save/restore binary if necessary.
173+
return nil
174+
}
175+
if k.TaskSet().IsExiting() {
176+
// This can occur when kernel is saved with control.SaveOpts.Resume=false.
177+
// We can not invoke the save/restore binary on such a kernel.
178+
return nil
179+
}
180+
if err := SaveRestoreExec(k, SaveRestoreExecResume); err != nil {
181+
return fmt.Errorf("failed to wait for save/restore binary: %w", err)
182+
}
183+
return postResumeImpl(k, timeline)
184+
}
185+
186+
// PostRestore is called after restoring the kernel.
187+
//
188+
// Precondition: The kernel should be running.
189+
func PostRestore(k *kernel.Kernel, timeline *timing.Timeline) error {
190+
if k.IsPaused() {
191+
// The kernel is still paused (double-pause can happen with Docker which
192+
// calls pause first and then checkpoint command). The final resume command
193+
// will invoke cuda-checkpoint if necessary.
194+
return nil
195+
}
196+
if k.TaskSet().IsExiting() {
197+
// This can occur when kernel is saved with control.SaveOpts.Resume=false.
198+
// We can not invoke cuda-checkpoint on such a kernel.
199+
return nil
200+
}
201+
if err := SaveRestoreExec(k, SaveRestoreExecRestore); err != nil {
202+
return fmt.Errorf("failed to wait for save/restore binary: %w", err)
203+
}
204+
return postRestoreImpl(k, timeline)
205+
}
206+
207+
// SaveRestoreExec creates a new process that executes the save/restore
208+
// binary specified by k.SaveRestoreExecConfig and waits for it to finish.
209+
//
210+
// Precondition: The kernel should be running; k.SetSaveRestoreExecConfig should
211+
// be setup with an argv, otherwise this function is a no-op.
212+
func SaveRestoreExec(k *kernel.Kernel, mode SaveRestoreExecMode) error {
213+
if k.SaveRestoreExecConfig == nil {
214+
return nil
215+
}
216+
217+
leader := k.SaveRestoreExecConfig.LeaderTask
218+
argv := k.SaveRestoreExecConfig.Argv
219+
timeout := k.SaveRestoreExecConfig.Timeout
220+
sctx := k.SupervisorContext()
221+
contID := leader.ContainerID()
222+
mntns := leader.MountNamespace()
223+
if mntns == nil || !mntns.TryIncRef() {
224+
log.Warningf("PID %d in container %q has exited, skipping CUDA checkpoint for it", leader.ThreadGroup().ID(), contID)
225+
return nil
226+
}
227+
mntns.IncRef()
228+
root := mntns.Root(sctx)
229+
cu := cleanup.Make(func() {
230+
root.DecRef(sctx)
231+
})
232+
defer cu.Clean()
233+
ctx := vfs.WithRoot(sctx, root)
234+
cu.Add(func() {
235+
mntns.DecRef(ctx)
236+
})
237+
238+
fdTable := k.NewFDTable()
239+
cu.Add(func() {
240+
fdTable.DecRef(sctx)
241+
})
242+
var execOut *fdcollector.Agent
243+
rfd, wfd, err := pipefs.NewConnectedPipeFDs(ctx, k.PipeMount(), 0 /* flags */)
244+
if err != nil {
245+
log.Warningf("Failed to create stdout/stderr pipe for %s: %v", argv[0], err)
246+
} else {
247+
if _, err := fdTable.NewFDAt(ctx, 1, wfd, kernel.FDFlags{}); err != nil {
248+
log.Warningf("Failed to make pipe stdout for %s: %v", argv[0], err)
249+
}
250+
if _, err := fdTable.NewFDAt(ctx, 2, wfd, kernel.FDFlags{}); err != nil {
251+
log.Warningf("Failed to make pipe stderr for %s: %v", argv[0], err)
252+
}
253+
wfd.DecRef(ctx)
254+
execOut = fdcollector.NewAgent(ctx, rfd, argv[0]) // transfers ownership of rfd
255+
cu.Add(execOut.Stop)
256+
}
257+
// TODO(b/419041893): Support running the save/restore binary with container
258+
// env vars without relying on the Saver().
259+
var envv []string
260+
if k.Saver() != nil {
261+
envv = k.Saver().SpecEnviron(contID)
262+
}
263+
264+
proc := Proc{
265+
Kernel: k,
266+
}
267+
execArgs := ExecArgs{
268+
Filename: argv[0],
269+
Argv: argv,
270+
Envv: append(envv, fmt.Sprintf("%s=%s", saveRestoreExecEnvVar, mode)),
271+
ContainerID: contID,
272+
MountNamespace: mntns,
273+
PIDNamespace: k.RootPIDNamespace(),
274+
Limits: limits.NewLimitSet(),
275+
FDTable: fdTable,
276+
}
277+
tg, _, _, err := ExecAsync(&proc, &execArgs)
278+
if err != nil {
279+
return fmt.Errorf("failed to exec save/restore binary: %w", err)
280+
}
281+
282+
waitC := make(chan struct{})
283+
go func() {
284+
tg.WaitExited()
285+
waitC <- struct{}{}
286+
}()
287+
select {
288+
case <-waitC:
289+
if tg.ExitStatus() != 0 {
290+
return fmt.Errorf("%v exited with non-zero status %d", argv[0], tg.ExitStatus())
291+
}
292+
case <-time.After(timeout):
293+
tg.SendSignal(&linux.SignalInfo{Signo: int32(linux.SIGKILL)})
294+
return fmt.Errorf("%s timed out after %v", argv[0], timeout)
295+
}
296+
log.Debugf("save/restore binary %s output: %s", argv[0], execOut.String())
297+
return nil
298+
}
299+
300+
// ConfigureSaveRestoreExec sets the configuration for the save/restore binary.
301+
// If containerID is empty, the global init process will be used for the
302+
// save/restore binary's leader task.
303+
func ConfigureSaveRestoreExec(k *kernel.Kernel, argv []string, timeout time.Duration, containerID string) error {
304+
if k.SaveRestoreExecConfig != nil {
305+
return fmt.Errorf("save/restore binary is already set")
306+
}
307+
k.SaveRestoreExecConfig = &kernel.SaveRestoreExecConfig{
308+
Argv: argv,
309+
Timeout: timeout,
310+
}
311+
312+
var leader *kernel.Task
313+
if containerID != "" {
314+
for _, tg := range k.RootPIDNamespace().ThreadGroups() {
315+
// Find all processes with no parent (root of execution).
316+
if tg.Leader().Parent() == nil {
317+
cid := tg.Leader().ContainerID()
318+
if cid == containerID {
319+
leader = tg.Leader()
320+
break
321+
}
322+
}
323+
}
324+
if leader == nil {
325+
return fmt.Errorf("failed to find process associated with container %s", containerID)
326+
}
327+
} else {
328+
leader = k.GlobalInit().Leader()
329+
}
330+
k.SaveRestoreExecConfig.LeaderTask = leader
331+
return nil
101332
}

pkg/sentry/control/state_impl.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// Copyright 2025 The gVisor Authors.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
//go:build !false
16+
// +build !false
17+
18+
package control
19+
20+
import (
21+
"gvisor.dev/gvisor/pkg/sentry/kernel"
22+
"gvisor.dev/gvisor/pkg/timing"
23+
)
24+
25+
func preSaveImpl(k *kernel.Kernel, o *SaveOpts) error {
26+
return nil
27+
}
28+
29+
func postRestoreImpl(k *kernel.Kernel, _ *timing.Timeline) error {
30+
return nil
31+
}
32+
33+
func postResumeImpl(k *kernel.Kernel, _ *timing.Timeline) error {
34+
return nil
35+
}

pkg/sentry/fdcollector/BUILD

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
load("//tools:defs.bzl", "go_library")
2+
3+
package(
4+
default_applicable_licenses = ["//:license"],
5+
licenses = ["notice"],
6+
)
7+
8+
# Go library target "fdcollector", built from fdcollector.go and visible only
# to the //:sandbox package group.
go_library(
9+
name = "fdcollector",
10+
srcs = ["fdcollector.go"],
11+
visibility = ["//:sandbox"],
12+
deps = [
13+
"//pkg/context",
14+
"//pkg/errors/linuxerr",
15+
"//pkg/log",
16+
"//pkg/sentry/vfs",
17+
"//pkg/sync",
18+
"//pkg/usermem",
19+
"//pkg/waiter",
20+
],
21+
)

0 commit comments

Comments
 (0)