@@ -17,14 +17,41 @@ package control
 import (
 	"errors"
 	"fmt"
+	"strings"
+	"time"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/cleanup"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/fdcollector"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/state"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
+	"gvisor.dev/gvisor/pkg/timing"
 	"gvisor.dev/gvisor/pkg/urpc"
 )
 
+// SaveRestoreExecMode is the mode for the save/restore binary.
+type SaveRestoreExecMode string
+
+const (
+	// DefaultSaveRestoreExecTimeout is the default timeout for the
+	// save/restore binary.
+	DefaultSaveRestoreExecTimeout = 10 * time.Minute
+	// SaveRestoreExecSave is the save mode for the save/restore binary.
+	SaveRestoreExecSave SaveRestoreExecMode = "save"
+	// SaveRestoreExecRestore is the restore mode for the save/restore binary.
+	SaveRestoreExecRestore SaveRestoreExecMode = "restore"
+	// SaveRestoreExecResume is the resume mode for the save/restore binary.
+	SaveRestoreExecResume SaveRestoreExecMode = "resume"
+
+	// saveRestoreExecEnvVar tells the exec'd binary which mode it is
+	// running in.
+	saveRestoreExecEnvVar = "GVISOR_SAVE_RESTORE_AUTO_EXEC_MODE"
+)
+
 // ErrInvalidFiles is returned when the urpc call to Save does not include an
 // appropriate file payload (e.g. there is no output file!).
 var ErrInvalidFiles = errors.New("exactly one file must be provided")
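
The mode is passed to the exec'd binary solely through the `GVISOR_SAVE_RESTORE_AUTO_EXEC_MODE` environment variable (see `SaveRestoreExec` below). As a minimal sketch of the guest side, with a hypothetical hook binary and hypothetical actions, a consumer might look like:

```go
// save-hook is a hypothetical in-container helper that the sentry execs
// around checkpoint/restore. It dispatches on the mode from the env var.
package main

import (
	"fmt"
	"os"
)

func main() {
	switch os.Getenv("GVISOR_SAVE_RESTORE_AUTO_EXEC_MODE") {
	case "save":
		// Quiesce application state before the snapshot (e.g. flush buffers).
	case "restore":
		// Recreate state that does not survive a snapshot (e.g. reconnect
		// to external services).
	case "resume":
		// The sandbox kept running after checkpointing; undo any quiescing
		// done in save mode.
	default:
		fmt.Fprintln(os.Stderr, "unexpected or missing mode")
		os.Exit(1)
	}
}
```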
@@ -59,6 +86,18 @@ type SaveOpts struct {
 	// Resume indicates if the sandbox process should continue running
 	// after checkpointing.
 	Resume bool
+
+	// SaveRestoreExecArgv is the argv of the save/restore binary split by
+	// spaces. The first element is the path to the binary.
+	SaveRestoreExecArgv string
+
+	// SaveRestoreExecTimeout is the timeout for waiting for the save/restore
+	// binary.
+	SaveRestoreExecTimeout time.Duration
+
+	// SaveRestoreExecContainerID is the ID of the container that the
+	// save/restore binary executes in.
+	SaveRestoreExecContainerID string
 }
 
 // Save saves the running system.
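
For illustration, a checkpoint client might populate the new `SaveOpts` fields as below; a sketch with the state-file payload elided and a hypothetical hook path:

```go
opts := control.SaveOpts{
	// ...state-file payload and metadata elided...
	Resume: true,
	// Space-separated; PreSave splits it on single spaces, so neither the
	// binary path nor its arguments may contain spaces.
	SaveRestoreExecArgv:        "/usr/local/bin/save-hook --flush",
	SaveRestoreExecTimeout:     control.DefaultSaveRestoreExecTimeout,
	SaveRestoreExecContainerID: "app",
}
```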
@@ -97,5 +136,197 @@ func (s *State) Save(o *SaveOpts, _ *struct{}) error {
 		}
 		defer saveOpts.PagesFile.Close()
 	}
-	return saveOpts.Save(s.Kernel.SupervisorContext(), s.Kernel, s.Watchdog)
+	if err := PreSave(s.Kernel, o); err != nil {
+		return err
+	}
+	if err := saveOpts.Save(s.Kernel.SupervisorContext(), s.Kernel, s.Watchdog); err != nil {
+		return err
+	}
+	if o.Resume {
+		return PostResume(s.Kernel, nil)
+	}
+	return nil
+}
+
+// PreSave is called before saving the kernel.
+func PreSave(k *kernel.Kernel, o *SaveOpts) error {
+	if o.SaveRestoreExecArgv != "" {
+		saveRestoreExecArgv := strings.Split(o.SaveRestoreExecArgv, " ")
+		if err := ConfigureSaveRestoreExec(k, saveRestoreExecArgv, o.SaveRestoreExecTimeout, o.SaveRestoreExecContainerID); err != nil {
+			return fmt.Errorf("failed to configure save/restore binary: %w", err)
+		}
+		if err := SaveRestoreExec(k, SaveRestoreExecSave); err != nil {
+			return fmt.Errorf("failed to exec save/restore binary: %w", err)
+		}
+	}
+	return preSaveImpl(k, o)
+}
+
+// PostResume is called after resuming the kernel.
+//
+// Precondition: The kernel should be running.
+func PostResume(k *kernel.Kernel, timeline *timing.Timeline) error {
+	if k.IsPaused() {
+		// The kernel is still paused (a double pause can happen with Docker,
+		// which issues the pause command before the checkpoint command). The
+		// final resume command will invoke the save/restore binary if necessary.
+		return nil
+	}
+	if k.TaskSet().IsExiting() {
+		// This can occur when the kernel is saved with control.SaveOpts.Resume=false.
+		// We cannot invoke the save/restore binary on such a kernel.
+		return nil
+	}
+	if err := SaveRestoreExec(k, SaveRestoreExecResume); err != nil {
+		return fmt.Errorf("failed to wait for save/restore binary: %w", err)
+	}
+	return postResumeImpl(k, timeline)
+}
+
+// PostRestore is called after restoring the kernel.
+//
+// Precondition: The kernel should be running.
+func PostRestore(k *kernel.Kernel, timeline *timing.Timeline) error {
+	if k.IsPaused() {
+		// The kernel is still paused (a double pause can happen with Docker,
+		// which issues the pause command before the checkpoint command). The
+		// final resume command will invoke the save/restore binary if necessary.
+		return nil
+	}
+	if k.TaskSet().IsExiting() {
+		// This can occur when the kernel is saved with control.SaveOpts.Resume=false.
+		// We cannot invoke the save/restore binary on such a kernel.
+		return nil
+	}
+	if err := SaveRestoreExec(k, SaveRestoreExecRestore); err != nil {
+		return fmt.Errorf("failed to wait for save/restore binary: %w", err)
+	}
+	return postRestoreImpl(k, timeline)
+}
+
+// SaveRestoreExec creates a new process that executes the save/restore
+// binary specified by k.SaveRestoreExecConfig and waits for it to finish.
+//
+// Precondition: The kernel should be running. k.SaveRestoreExecConfig must
+// have been set up with an argv (see ConfigureSaveRestoreExec); otherwise
+// this function is a no-op.
+func SaveRestoreExec(k *kernel.Kernel, mode SaveRestoreExecMode) error {
+	if k.SaveRestoreExecConfig == nil {
+		return nil
+	}
+
+	leader := k.SaveRestoreExecConfig.LeaderTask
+	argv := k.SaveRestoreExecConfig.Argv
+	timeout := k.SaveRestoreExecConfig.Timeout
+	sctx := k.SupervisorContext()
+	contID := leader.ContainerID()
+	mntns := leader.MountNamespace()
+	if mntns == nil || !mntns.TryIncRef() {
+		log.Warningf("PID %d in container %q has exited, skipping save/restore binary exec for it", leader.ThreadGroup().ID(), contID)
+		return nil
+	}
+	// Hold a second reference: one is dropped by the cleanup below, and the
+	// other is consumed by the exec'd process via ExecArgs.MountNamespace.
+	mntns.IncRef()
+	root := mntns.Root(sctx)
+	cu := cleanup.Make(func() {
+		root.DecRef(sctx)
+	})
+	defer cu.Clean()
+	ctx := vfs.WithRoot(sctx, root)
+	cu.Add(func() {
+		mntns.DecRef(ctx)
+	})
+
+	fdTable := k.NewFDTable()
+	cu.Add(func() {
+		fdTable.DecRef(sctx)
+	})
+	var execOut *fdcollector.Agent
+	rfd, wfd, err := pipefs.NewConnectedPipeFDs(ctx, k.PipeMount(), 0 /* flags */)
+	if err != nil {
+		log.Warningf("Failed to create stdout/stderr pipe for %s: %v", argv[0], err)
+	} else {
+		if _, err := fdTable.NewFDAt(ctx, 1, wfd, kernel.FDFlags{}); err != nil {
+			log.Warningf("Failed to make pipe stdout for %s: %v", argv[0], err)
+		}
+		if _, err := fdTable.NewFDAt(ctx, 2, wfd, kernel.FDFlags{}); err != nil {
+			log.Warningf("Failed to make pipe stderr for %s: %v", argv[0], err)
+		}
+		wfd.DecRef(ctx)
+		execOut = fdcollector.NewAgent(ctx, rfd, argv[0]) // transfers ownership of rfd
+		cu.Add(execOut.Stop)
+	}
+	// TODO(b/419041893): Support running the save/restore binary with container
+	// env vars without relying on the Saver().
+	var envv []string
+	if k.Saver() != nil {
+		envv = k.Saver().SpecEnviron(contID)
+	}
+
+	proc := Proc{
+		Kernel: k,
+	}
+	execArgs := ExecArgs{
+		Filename:       argv[0],
+		Argv:           argv,
+		Envv:           append(envv, fmt.Sprintf("%s=%s", saveRestoreExecEnvVar, mode)),
+		ContainerID:    contID,
+		MountNamespace: mntns,
+		PIDNamespace:   k.RootPIDNamespace(),
+		Limits:         limits.NewLimitSet(),
+		FDTable:        fdTable,
+	}
+	tg, _, _, err := ExecAsync(&proc, &execArgs)
+	if err != nil {
+		return fmt.Errorf("failed to exec save/restore binary: %w", err)
+	}
+
+	waitC := make(chan struct{})
+	go func() {
+		tg.WaitExited()
+		waitC <- struct{}{}
+	}()
+	select {
+	case <-waitC:
+		if tg.ExitStatus() != 0 {
+			return fmt.Errorf("%v exited with non-zero status %d", argv[0], tg.ExitStatus())
+		}
+	case <-time.After(timeout):
+		tg.SendSignal(&linux.SignalInfo{Signo: int32(linux.SIGKILL)})
+		return fmt.Errorf("%s timed out after %v", argv[0], timeout)
+	}
+	if execOut != nil {
+		log.Debugf("save/restore binary %s output: %s", argv[0], execOut.String())
+	}
+	return nil
+}
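
The wait logic above is the usual Go pattern of racing a completion channel against `time.After`. Distilled into a standalone sketch (`wait` and `kill` stand in for `tg.WaitExited` and the SIGKILL path); note that closing the channel, rather than sending on it as the code above does, lets the waiter goroutine exit even when the timeout wins:

```go
package main

import (
	"fmt"
	"time"
)

// waitWithTimeout runs wait in a goroutine and gives up after d, calling
// kill so abandoned work does not linger.
func waitWithTimeout(wait, kill func(), d time.Duration) error {
	done := make(chan struct{})
	go func() {
		wait()
		close(done) // close, not send: never blocks if the timeout won
	}()
	select {
	case <-done:
		return nil
	case <-time.After(d):
		kill()
		return fmt.Errorf("timed out after %v", d)
	}
}

func main() {
	err := waitWithTimeout(
		func() { time.Sleep(2 * time.Second) },
		func() { fmt.Println("killing stuck work") },
		time.Second,
	)
	fmt.Println(err) // timed out after 1s
}
```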
+
+// ConfigureSaveRestoreExec sets the configuration for the save/restore binary.
+// If containerID is empty, the global init process will be used for the
+// save/restore binary's leader task.
+func ConfigureSaveRestoreExec(k *kernel.Kernel, argv []string, timeout time.Duration, containerID string) error {
+	if k.SaveRestoreExecConfig != nil {
+		return fmt.Errorf("save/restore binary is already set")
+	}
+	k.SaveRestoreExecConfig = &kernel.SaveRestoreExecConfig{
+		Argv:    argv,
+		Timeout: timeout,
+	}
+
+	var leader *kernel.Task
+	if containerID != "" {
+		for _, tg := range k.RootPIDNamespace().ThreadGroups() {
+			// Find all processes with no parent (root of execution).
+			if tg.Leader().Parent() == nil {
+				cid := tg.Leader().ContainerID()
+				if cid == containerID {
+					leader = tg.Leader()
+					break
+				}
+			}
+		}
+		if leader == nil {
+			return fmt.Errorf("failed to find process associated with container %s", containerID)
+		}
+	} else {
+		leader = k.GlobalInit().Leader()
+	}
+	k.SaveRestoreExecConfig.LeaderTask = leader
+	return nil
 }
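
Taken together, PreSave's use of these helpers reduces to the pair below; a sketch assuming an external caller with a `*kernel.Kernel` k, a hypothetical hook path, and ignoring preSaveImpl:

```go
argv := strings.Split("/usr/local/bin/save-hook --flush", " ")
if err := control.ConfigureSaveRestoreExec(k, argv,
	control.DefaultSaveRestoreExecTimeout, "app"); err != nil {
	return err
}
// Execs the hook with GVISOR_SAVE_RESTORE_AUTO_EXEC_MODE=save in container
// "app" and waits for it to exit, killing it at the timeout.
if err := control.SaveRestoreExec(k, control.SaveRestoreExecSave); err != nil {
	return err
}
```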