Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions g3doc/user_guide/checkpoint_restore.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,25 @@ docker start --checkpoint <checkpoint-name> <container-name>
`--checkpoint-dir` flag but this will be required when restoring from a
checkpoint made in another container.

## Networking

Checkpoint/restore is supported with `--network=sandbox` (default),
`--network=none`, and `--network=host`.

With `--network=host`, host sockets cannot be saved, so:

- Checkpoint with `--leave-running` does not touch the running sandbox's
sockets; the sandbox keeps using them as before.
- TCP listening sockets are re-created during restore and keep accepting
new connections. Connections that were pending in the backlog at
checkpoint time are lost. If the listen address cannot be bound on the
restoring host, the restore fails.
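- All other sockets, including those that were connected at checkpoint time,
are invalidated by restore: socket operations on them fail with
`ECONNABORTED`, and `epoll_wait` on them returns `EPOLLERR | EPOLLHUP`
immediately. Applications must reconnect after restore using a new socket,
because `connect` on an invalidated socket also fails.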
- Network configuration visible inside the sandbox (interface statistics,
TCP buffer sizes) reflects the host the sandbox was restored on.

## Checkpoint & Restore with different CPU features

When restoring a state file, gVisor verifies that the target host machine
Expand Down
10 changes: 9 additions & 1 deletion pkg/sentry/socket/hostinet/BUILD
Original file line number Diff line number Diff line change
@@ -1,15 +1,23 @@
load("//tools:defs.bzl", "go_library")
load("//tools:defs.bzl", "go_library", "go_test")

package(
default_applicable_licenses = ["//:license"],
licenses = ["notice"],
)

go_test(
name = "hostinet_test",
size = "small",
srcs = ["stack_test.go"],
library = ":hostinet",
)

go_library(
name = "hostinet",
srcs = [
"hostinet.go",
"netlink.go",
"save_restore.go",
"socket.go",
"socket_unsafe.go",
"sockopt.go",
Expand Down
157 changes: 157 additions & 0 deletions pkg/sentry/socket/hostinet/save_restore.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
// Copyright 2026 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hostinet

import (
"context"
"fmt"

"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/fdnotifier"
"gvisor.dev/gvisor/pkg/sync"
)

// Host socket fds cannot cross a checkpoint boundary, so Socket.fd is not
// saved and the checkpointed sandbox retains ownership of it.
//
// Listening sockets carry no connection state, so beforeSave records what is
// needed to re-create them and Stack.Restore re-creates the host socket with
// socket/bind/listen. Connections that were pending in the backlog are lost.
//
// All other sockets are restored with fd -1, so host socket operations fail
// with ECONNABORTED and Readiness reports hangup/error to wake pollers
// immediately.

// listenerState describes a listening host socket so restore can re-create it.
//
// It is filled in by Socket.beforeSave from the live host fd and consumed by
// Socket.restoreListener. Option fields hold 0 / "" when the corresponding
// getsockopt call failed at save time, since those reads are best-effort.
//
// +stateify savable
type listenerState struct {
// addr is the local address reported by getsockname at save time; it is
// passed unchanged to bind on restore.
addr []byte
// backlog is the value of Socket.listenBacklog at save time, passed to
// listen on restore.
backlog int32
// reuseAddr is the SO_REUSEADDR value; it is only reapplied when non-zero.
reuseAddr int32
// reusePort is the SO_REUSEPORT value; it is only reapplied when non-zero.
reusePort int32
// v6Only is the IPV6_V6ONLY value. It is only read for AF_INET6 sockets
// and is always reapplied for that family on restore.
v6Only int32
// bindToDevice is the SO_BINDTODEVICE value; it is only reapplied when
// non-empty.
bindToDevice string
}

// restoredListeners collects the sockets whose afterLoad found a saved
// listenerState, so that restoreListeners can later re-create their host
// sockets. restoreListeners drains the list each time it runs.
var restoredListeners struct {
// mu guards sockets.
mu sync.Mutex
// sockets are the loaded sockets still waiting for a new host fd.
sockets []*Socket
}

// beforeSave is invoked by stateify.
//
// It discards any previously recorded listenerState and, when the host socket
// is currently accepting connections (SO_ACCEPTCONN), records the bound
// address, the listen backlog and the socket options that restoreListener
// reapplies. Sockets without a host fd, or that are not listening, are left
// with a nil savedListener. Option reads are best-effort: a failed read leaves
// the corresponding field at its zero value. A getsockname failure on a
// listening socket panics.
func (s *Socket) beforeSave() {
	s.savedListener = nil
	if s.fd < 0 {
		return
	}
	if listening, err := unix.GetsockoptInt(s.fd, unix.SOL_SOCKET, unix.SO_ACCEPTCONN); err != nil || listening == 0 {
		return
	}
	boundAddr, err := getsockname(s.fd)
	if err != nil {
		panic(fmt.Sprintf("getsockname on listening host socket failed during save: %v", err))
	}

	// intOpt reads an integer socket option, yielding 0 if it cannot be read.
	intOpt := func(level, opt int) int32 {
		val, err := unix.GetsockoptInt(s.fd, level, opt)
		if err != nil {
			return 0
		}
		return int32(val)
	}

	state := &listenerState{
		addr:      boundAddr,
		backlog:   s.listenBacklog.Load(),
		reuseAddr: intOpt(unix.SOL_SOCKET, unix.SO_REUSEADDR),
		reusePort: intOpt(unix.SOL_SOCKET, unix.SO_REUSEPORT),
	}
	if s.family == unix.AF_INET6 {
		state.v6Only = intOpt(unix.IPPROTO_IPV6, unix.IPV6_V6ONLY)
	}
	if device, err := unix.GetsockoptString(s.fd, unix.SOL_SOCKET, unix.SO_BINDTODEVICE); err == nil {
		state.bindToDevice = device
	}
	s.savedListener = state
}

// afterLoad is invoked by stateify.
//
// The host fd is never saved, so the socket always starts out with fd -1.
// Sockets that were listening at save time are additionally queued on
// restoredListeners so restoreListeners can give them a new host socket.
func (s *Socket) afterLoad(context.Context) {
	s.fd = -1
	if s.savedListener == nil {
		return
	}
	restoredListeners.mu.Lock()
	defer restoredListeners.mu.Unlock()
	restoredListeners.sockets = append(restoredListeners.sockets, s)
}

// restoreListeners re-creates the host sockets for saved listening sockets.
//
// It takes ownership of the queued sockets (leaving the queue empty) and
// restores each in the order it was queued. Any failure panics with the
// socket's family, saved address and the underlying error.
func restoreListeners() {
	restoredListeners.mu.Lock()
	pending := restoredListeners.sockets
	restoredListeners.sockets = nil
	restoredListeners.mu.Unlock()

	for i := range pending {
		sock := pending[i]
		err := sock.restoreListener()
		if err == nil {
			continue
		}
		panic(fmt.Sprintf("failed to restore listening host socket (family=%d, addr=%x): %v", sock.family, sock.savedListener.addr, err))
	}
}

// restoreListener re-creates the host socket for s from s.savedListener.
//
// A new non-blocking, close-on-exec host socket is created with the socket's
// family, type and protocol. The saved options are reapplied, then the socket
// is bound to the saved address, put into the listening state with the saved
// backlog and registered with fdnotifier. On success s.fd holds the new fd and
// s.savedListener is cleared. On failure the new fd is closed, s is left
// unchanged, and the returned error names the step that failed.
func (s *Socket) restoreListener() error {
	saved := s.savedListener
	hostFD, err := unix.Socket(s.family, int(s.stype)|unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC, s.protocol)
	if err != nil {
		return fmt.Errorf("creating socket: %w", err)
	}

	// Every early return below must release the freshly created fd.
	installed := false
	defer func() {
		if !installed {
			_ = unix.Close(hostFD)
		}
	}()

	if saved.reuseAddr != 0 {
		if err := unix.SetsockoptInt(hostFD, unix.SOL_SOCKET, unix.SO_REUSEADDR, int(saved.reuseAddr)); err != nil {
			return fmt.Errorf("setting SO_REUSEADDR: %w", err)
		}
	}
	if saved.reusePort != 0 {
		if err := unix.SetsockoptInt(hostFD, unix.SOL_SOCKET, unix.SO_REUSEPORT, int(saved.reusePort)); err != nil {
			return fmt.Errorf("setting SO_REUSEPORT: %w", err)
		}
	}
	// IPV6_V6ONLY is reapplied for every IPv6 listener, including value 0.
	if s.family == unix.AF_INET6 {
		if err := unix.SetsockoptInt(hostFD, unix.IPPROTO_IPV6, unix.IPV6_V6ONLY, int(saved.v6Only)); err != nil {
			return fmt.Errorf("setting IPV6_V6ONLY: %w", err)
		}
	}
	if saved.bindToDevice != "" {
		if err := unix.SetsockoptString(hostFD, unix.SOL_SOCKET, unix.SO_BINDTODEVICE, saved.bindToDevice); err != nil {
			return fmt.Errorf("setting SO_BINDTODEVICE to %q: %w", saved.bindToDevice, err)
		}
	}

	if err := bind(hostFD, saved.addr); err != nil {
		return fmt.Errorf("binding: %w", err)
	}
	if err := unix.Listen(hostFD, int(saved.backlog)); err != nil {
		return fmt.Errorf("listening: %w", err)
	}
	if err := fdnotifier.AddFD(int32(hostFD), &s.queue); err != nil {
		return fmt.Errorf("registering with fdnotifier: %w", err)
	}

	installed = true
	s.fd = hostFD
	s.savedListener = nil
	return nil
}
70 changes: 64 additions & 6 deletions pkg/sentry/socket/hostinet/socket.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,18 +129,24 @@ type Socket struct {
// masks, as when e.g. applications repeatedly call poll() with the same
// event mask, or blocking accept() / read() / write() / recvmsg() /
// sendmsg() / etc., on the same socket.
persistentEventMu sync.Mutex
persistentEventMu sync.Mutex `state:"nosave"`
persistentEventMask atomicbitops.Uint64
persistentEntry waiter.Entry

// fd is the host socket fd. It must have O_NONBLOCK, so that operations
// will return EWOULDBLOCK instead of blocking on the host. This allows us to
// handle blocking behavior independently in the sentry.
fd int
fd int `state:"nosave"`

// recvClosed indicates that the socket has been shutdown for reading
// (SHUT_RD or SHUT_RDWR).
recvClosed atomicbitops.Bool

// listenBacklog is the backlog passed to the most recent listen(2).
listenBacklog atomicbitops.Int32

// savedListener is set by beforeSave if this socket is listening.
savedListener *listenerState
}

var _ = socket.Socket(&Socket{})
Expand Down Expand Up @@ -175,6 +181,9 @@ func newSocket(t *kernel.Task, family int, stype linux.SockType, protocol int, f
// Release implements vfs.FileDescriptionImpl.Release.
//
// The socket is always deleted from the kernel. The host fd is unregistered
// from fdnotifier and closed only if the socket still has one; a socket whose
// fd is negative has nothing to release on the host.
func (s *Socket) Release(ctx context.Context) {
	kernel.KernelFromContext(ctx).DeleteSocket(&s.vfsfd)
	if s.fd >= 0 {
		fdnotifier.RemoveFD(int32(s.fd))
		_ = unix.Close(s.fd)
	}
}
Expand All @@ -186,6 +195,9 @@ func (s *Socket) Epollable() bool {

// Ioctl implements vfs.FileDescriptionImpl.
//
// The request is forwarded to the host socket. A socket with no host fd
// (fd < 0) fails with ECONNABORTED.
func (s *Socket) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
	if s.fd >= 0 {
		return ioctl(ctx, s.fd, uio, sysno, args)
	}
	return 0, linuxerr.ECONNABORTED
}

Expand All @@ -202,6 +214,10 @@ func (s *Socket) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.Read
return 0, linuxerr.EOPNOTSUPP
}

if s.fd < 0 {
return 0, linuxerr.ECONNABORTED
}

reader := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags)
defer hostfd.PutReadWriterAt(reader)
n, err := dst.CopyOutFrom(ctx, reader)
Expand All @@ -226,6 +242,10 @@ func (s *Socket) Write(ctx context.Context, src usermem.IOSequence, opts vfs.Wri
return 0, linuxerr.EOPNOTSUPP
}

if s.fd < 0 {
return 0, linuxerr.ECONNABORTED
}

writer := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags)
defer hostfd.PutReadWriterAt(writer)
n, err := src.CopyInTo(ctx, writer)
Expand Down Expand Up @@ -303,11 +323,18 @@ func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int

// Readiness implements waiter.Waitable.Readiness.
//
// With a live host fd the host is polled without blocking for the requested
// mask. A socket with no host fd (fd < 0) always reports hangup, error and
// read-hangup, regardless of mask.
func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask {
	if s.fd >= 0 {
		return fdnotifier.NonBlockingPoll(int32(s.fd), mask)
	}
	return waiter.EventHUp | waiter.EventErr | waiter.EventRdHUp
}

// EventRegister implements waiter.Waitable.EventRegister.
func (s *Socket) EventRegister(e *waiter.Entry) error {
if s.fd < 0 {
s.queue.EventRegister(e)
return nil
}
if em, pem := e.Mask(), waiter.EventMask(s.persistentEventMask.Load()); em&^pem != 0 {
s.persistentEventMu.Lock()
pem = waiter.EventMask(s.persistentEventMask.RacyLoad())
Expand Down Expand Up @@ -362,6 +389,9 @@ func (s *Socket) eventUnregisterTransient(e *waiter.Entry) {

// Connect implements socket.Socket.Connect.
func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
if s.fd < 0 {
return syserr.ErrConnectionAborted
}
if len(sockaddr) > sizeofSockaddr {
sockaddr = sockaddr[:sizeofSockaddr]
}
Expand Down Expand Up @@ -424,6 +454,9 @@ func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr

// Accept implements socket.Socket.Accept.
func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
if s.fd < 0 {
return 0, nil, 0, syserr.ErrConnectionAborted
}
var peerAddr linux.SockAddr
var peerAddrBuf []byte
var peerAddrlen uint32
Expand Down Expand Up @@ -484,24 +517,36 @@ func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking

// Bind implements socket.Socket.Bind.
func (s *Socket) Bind(_ *kernel.Task, sockaddr []byte) *syserr.Error {
if s.fd < 0 {
return syserr.ErrConnectionAborted
}
if len(sockaddr) > sizeofSockaddr {
sockaddr = sockaddr[:sizeofSockaddr]
}

_, _, errno := unix.Syscall(unix.SYS_BIND, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr)))
if errno != 0 {
return syserr.FromError(errno)
if err := bind(s.fd, sockaddr); err != nil {
return syserr.FromError(err)
}
return nil
}

// Listen implements socket.Socket.Listen.
//
// A socket with no host fd (fd < 0) fails with ErrConnectionAborted. On
// success the backlog is recorded in listenBacklog, which beforeSave reads
// when capturing listener state; it is not updated if the host listen fails.
func (s *Socket) Listen(_ *kernel.Task, backlog int) *syserr.Error {
	if s.fd < 0 {
		return syserr.ErrConnectionAborted
	}
	if err := unix.Listen(s.fd, backlog); err != nil {
		return syserr.FromError(err)
	}
	s.listenBacklog.Store(int32(backlog))
	return nil
}

// Shutdown implements socket.Socket.Shutdown.
func (s *Socket) Shutdown(_ *kernel.Task, how int) *syserr.Error {
if s.fd < 0 {
return syserr.ErrConnectionAborted
}
switch how {
case unix.SHUT_RD, unix.SHUT_RDWR:
// Mark the socket as closed for reading.
Expand Down Expand Up @@ -559,6 +604,9 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have
if flags&^allowedRecvMsgFlags != 0 {
return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument
}
if s.fd < 0 {
return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrConnectionAborted
}

var senderAddrBuf []byte
var controlBuf []byte
Expand Down Expand Up @@ -745,6 +793,10 @@ func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flag
return 0, syserr.ErrInvalidArgument
}

if s.fd < 0 {
return 0, syserr.ErrConnectionAborted
}

// If the src is zero-length, call SENDTO directly with a null buffer in
// order to generate poll/epoll notifications.
if src.NumBytes() == 0 {
Expand Down Expand Up @@ -840,6 +892,12 @@ func translateIOSyscallError(err error) error {

// State implements socket.Socket.State.
func (s *Socket) State() uint32 {
if s.fd < 0 {
if s.stype == linux.SOCK_STREAM {
return linux.TCP_CLOSE
}
return 0
}
info := linux.TCPInfo{}
buf := make([]byte, linux.SizeOfTCPInfo)
var err error
Expand Down
Loading