Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions pkg/abi/linux/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,14 @@ const (
MOUNT_ATTR_NODIRATIME = 0x00000080
MOUNT_ATTR_IDMAP = 0x00100000
MOUNT_ATTR_NOSYMFOLLOW = 0x00200000
AT_RECURSIVE = 0x8000
)

// Flag values accepted by open_tree(2).
const (
	// OPEN_TREE_CLONE is bit 0 of the open_tree(2) flags argument.
	OPEN_TREE_CLONE = 1 << 0
	// OPEN_TREE_NAMESPACE is bit 1 of the open_tree(2) flags argument.
	OPEN_TREE_NAMESPACE = 1 << 1
	// OPEN_TREE_CLOEXEC shares its value with O_CLOEXEC.
	OPEN_TREE_CLOEXEC = O_CLOEXEC
)

// Constants for unlinkat(2).
Expand Down
7 changes: 6 additions & 1 deletion pkg/sentry/fsimpl/mountfd/mountfd.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,12 @@ type Fd struct {
}

// New creates a new mount object file descriptor from the anonymous mount namespace anonNS
// and the mount at the root of anonNS.
// and the mount at the root of anonNS. Consumes a reference on anonNS.
func New(ctx context.Context, anonNS *vfs.MountNamespace, fileFlags uint32) (*vfs.FileDescription, error) {
if !anonNS.Anon() {
panic("mountfd created with a non-anonymous namespace")
}

fd := &Fd{
anonNS: anonNS,
}
Expand All @@ -54,6 +58,7 @@ func New(ctx context.Context, anonNS *vfs.MountNamespace, fileFlags uint32) (*vf
DenyPWrite: true,
})
if err != nil {
anonNS.DecRef(ctx)
return nil, err
}

Expand Down
4 changes: 2 additions & 2 deletions pkg/sentry/syscalls/linux/linux64.go
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,7 @@ var AMD64 = &kernel.SyscallTable{
425: syscalls.PartiallySupported("io_uring_setup", IOUringSetup, "Not all flags and functionality supported.", nil),
426: syscalls.PartiallySupported("io_uring_enter", IOUringEnter, "Not all flags and functionality supported.", nil),
427: syscalls.ErrorWithEvent("io_uring_register", linuxerr.ENOSYS, "", nil),
428: syscalls.ErrorWithEvent("open_tree", linuxerr.ENOSYS, "", nil),
428: syscalls.Supported("open_tree", OpenTree),
429: syscalls.PartiallySupported("move_mount", MoveMount, "Options MOVE_MOUNT_SET_GROUP and MOVE_MOUNT_BENEATH are not supported.", nil),
430: syscalls.PartiallySupported("fsopen", FSOpen, "Message retrieval interface not supported.", nil),
431: syscalls.PartiallySupported("fsconfig", FSConfig, "Only options FSCONFIG_SET_FLAG, FSCONFIG_SET_STRING, and FSCONFIG_CMD_CREATE are supported. All option parsing and error handling happens upon FSCONFIG_CMD_CREATE.", nil),
Expand Down Expand Up @@ -708,7 +708,7 @@ var ARM64 = &kernel.SyscallTable{
425: syscalls.PartiallySupported("io_uring_setup", IOUringSetup, "Not all flags and functionality supported.", nil),
426: syscalls.PartiallySupported("io_uring_enter", IOUringEnter, "Not all flags and functionality supported.", nil),
427: syscalls.ErrorWithEvent("io_uring_register", linuxerr.ENOSYS, "", nil),
428: syscalls.ErrorWithEvent("open_tree", linuxerr.ENOSYS, "", nil),
428: syscalls.Supported("open_tree", OpenTree),
429: syscalls.PartiallySupported("move_mount", MoveMount, "Options MOVE_MOUNT_SET_GROUP and MOVE_MOUNT_BENEATH are not supported.", nil),
430: syscalls.PartiallySupported("fsopen", FSOpen, "Message retrieval interface not supported.", nil),
431: syscalls.PartiallySupported("fsconfig", FSConfig, "Only options FSCONFIG_SET_FLAG, FSCONFIG_SET_STRING, and FSCONFIG_CMD_CREATE are supported. All option parsing and error handling happens upon FSCONFIG_CMD_CREATE.", nil),
Expand Down
14 changes: 13 additions & 1 deletion pkg/sentry/syscalls/linux/sys_mount.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ func Mount(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr,
flags = flags &^ linux.MS_MGC_MSK
}

const unsupported = linux.MS_UNBINDABLE | linux.MS_MOVE | linux.MS_NODIRATIME
const unsupported = linux.MS_UNBINDABLE | linux.MS_NODIRATIME

// Linux just allows passing any flags to mount(2) - it won't fail when
// unknown or unsupported flags are passed. Since we don't implement
Expand Down Expand Up @@ -112,6 +112,18 @@ func Mount(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr,
return 0, nil, t.Kernel().VFS().BindAt(t, creds, &sourceTpop.pop, &target.pop, flags&linux.MS_REC != 0)
case flags&(linux.MS_SHARED|linux.MS_PRIVATE|linux.MS_SLAVE|linux.MS_UNBINDABLE) != 0:
return 0, nil, t.Kernel().VFS().SetMountPropagationAt(t, creds, &target.pop, uint32(flags))
case flags&linux.MS_MOVE != 0:
sourcePath, err := copyInPath(t, sourceAddr)
if err != nil {
return 0, nil, err
}
var sourceTpop taskPathOperation
sourceTpop, err = getTaskPathOperation(t, linux.AT_FDCWD, sourcePath, disallowEmptyPath, followFinalSymlink)
if err != nil {
return 0, nil, err
}
defer sourceTpop.Release(t)
return 0, nil, t.Kernel().VFS().MoveMountAt(t, creds, t.MountNamespace(), &sourceTpop.pop, &target.pop)
}

// Only copy in source, fstype, and data if we are doing a normal mount.
Expand Down
97 changes: 95 additions & 2 deletions pkg/sentry/syscalls/linux/sys_mount_fd.go
Original file line number Diff line number Diff line change
Expand Up @@ -284,10 +284,103 @@ func MoveMount(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintp

// Re-attach the mount to the destination mountpoint
vfsObj := t.Kernel().VFS()
err = vfsObj.MoveMountAt(t, creds, &from.pop, &to.pop)
err = vfsObj.MoveMountAt(t, creds, t.MountNamespace(), &from.pop, &to.pop)
if err != nil {
return 0, nil, err
}

return 0, nil, nil
}

// OpenTree implements Linux syscall open_tree(2).
//
// The syscall arguments are (dirfd, path, flags).
//
// Without OPEN_TREE_CLONE, the path is resolved and opened with O_PATH (plus
// O_NOFOLLOW when AT_SYMLINK_NOFOLLOW is given), and the resulting file
// descriptor is returned.
//
// With OPEN_TREE_CLONE, the mount containing the resolved path (or the whole
// mount tree below it when AT_RECURSIVE is also given) is cloned into a new
// anonymous mount namespace, and a mountfd file descriptor referring to that
// namespace is returned.
//
// Errors:
//   - EINVAL if flags contains an unsupported bit, or if AT_RECURSIVE is
//     given without OPEN_TREE_CLONE.
//   - EPERM if OPEN_TREE_CLONE is given and the caller lacks CAP_SYS_ADMIN in
//     the user namespace owning the task's mount namespace.
//   - Any error produced by path copy-in, path resolution, cloning, or file
//     descriptor allocation.
func OpenTree(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirfd := args[0].Int()
	fromAddr := args[1].Pointer()
	flags := args[2].Uint()

	// TODO(b/270247637): gVisor does not yet support automount, so
	// AT_NO_AUTOMOUNT flag is a no-op.
	flags &^= uint32(linux.AT_NO_AUTOMOUNT)

	// Reject every flag bit that is not handled below.
	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.OPEN_TREE_CLOEXEC|linux.OPEN_TREE_CLONE|linux.AT_RECURSIVE) != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	recursive := flags&linux.AT_RECURSIVE != 0
	clone := flags&linux.OPEN_TREE_CLONE != 0
	noFollow := flags&linux.AT_SYMLINK_NOFOLLOW != 0
	emptyPath := flags&linux.AT_EMPTY_PATH != 0
	fdFlags := kernel.FDFlags{
		CloseOnExec: flags&linux.OPEN_TREE_CLOEXEC != 0,
	}

	// AT_RECURSIVE requires OPEN_TREE_CLONE.
	if recursive && !clone {
		return 0, nil, linuxerr.EINVAL
	}

	// OPEN_TREE_CLONE: Must have CAP_SYS_ADMIN in the current mount namespace's
	// associated user namespace. The same credentials snapshot is used for
	// every lookup below.
	creds := t.Credentials()
	if clone && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().Owner) {
		return 0, nil, linuxerr.EPERM
	}

	// Look up the specified path.
	fromPath, err := copyInPath(t, fromAddr)
	if err != nil {
		return 0, nil, err
	}
	from, err := getTaskPathOperation(t, dirfd, fromPath, shouldAllowEmptyPath(emptyPath), shouldFollowFinalSymlink(!noFollow))
	if err != nil {
		return 0, nil, err
	}
	defer from.Release(t)

	vfsObj := t.Kernel().VFS()

	if clone {
		// OPEN_TREE_CLONE: clone the mount tree into an anonymous mount ns.

		// Fetch the path's mount vd.
		fromVd, err := vfsObj.GetDentryAt(t, creds, &from.pop, &vfs.GetDentryOptions{CheckSearchable: true})
		if err != nil {
			return 0, nil, err
		}
		defer fromVd.DecRef(t)

		// Clone the mount (or the mount tree, depending on AT_RECURSIVE) into a
		// new anonymous NS.
		anonNS, err := vfsObj.CloneTreeToAnonNS(t, t.MountNamespace(), fromVd, t.Kernel(), recursive)
		if err != nil {
			return 0, nil, err
		}

		// Construct a mountfd object. mountfd.New is documented to consume the
		// reference on anonNS, so anonNS is not released here.
		mountFile, err := mountfd.New(t, anonNS, linux.O_RDONLY)
		if err != nil {
			return 0, nil, err
		}
		defer mountFile.DecRef(t)

		mountFd, err := t.NewFDFrom(0, mountFile, fdFlags)
		if err != nil {
			return 0, nil, err
		}
		return uintptr(mountFd), nil, nil
	}

	// No OPEN_TREE_CLONE: just return a normal O_PATH fd.
	openatFlags := uint32(linux.O_PATH)
	if noFollow {
		openatFlags |= linux.O_NOFOLLOW
	}
	file, err := vfsObj.OpenAt(t, creds, &from.pop, &vfs.OpenOptions{Flags: openatFlags})
	if err != nil {
		return 0, nil, err
	}
	defer file.DecRef(t)

	fd, err := t.NewFDFrom(0, file, fdFlags)
	if err != nil {
		return 0, nil, err
	}
	return uintptr(fd), nil, nil
}
8 changes: 3 additions & 5 deletions pkg/sentry/vfs/mount.go
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr
// The path lookups for source and target checks traversal permissions against creds.
//
// Roughly analogous to Linux fs/namespace.c:do_move_mount().
func (vfs *VirtualFilesystem) MoveMountAt(ctx context.Context, creds *auth.Credentials, source *PathOperation, target *PathOperation) error {
func (vfs *VirtualFilesystem) MoveMountAt(ctx context.Context, creds *auth.Credentials, taskMountNs *MountNamespace, source *PathOperation, target *PathOperation) error {
// Lookup the source path
sourceVd, err := vfs.GetDentryAt(ctx, creds, source, &GetDentryOptions{CheckSearchable: true})
if err != nil {
Expand Down Expand Up @@ -504,13 +504,11 @@ func (vfs *VirtualFilesystem) MoveMountAt(ctx context.Context, creds *auth.Crede
}
// And the destination, if not in our mount ns, must be:
// - Mounted
// - In an anonymous mount ns
// - In an appropriate anonymous mount ns
if !vfs.validInMountNS(ctx, mp.mount) {
if mp.mount.umounted || !mp.mount.ns.anon {
if mp.mount.umounted || !mp.mount.ns.anonCanBeOperatedOn(taskMountNs) {
return linuxerr.EINVAL
}
// TODO(b/513024543): when open_tree(2) is implemented, we may need to start tracking
// and checking the mount namespace's "owner"
}
}

Expand Down
122 changes: 110 additions & 12 deletions pkg/sentry/vfs/namespace.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package vfs

import (
"gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/refs"
Expand All @@ -29,6 +30,9 @@ import (
//
// +stateify savable
type MountNamespace struct {
// ID is the immutable mount namespace ID.
ID uint64

// Refs is the reference count for this mount namespace.
Refs refs.TryRefCounter

Expand Down Expand Up @@ -64,6 +68,14 @@ type MountNamespace struct {

// anon indicates whether the mount namespace is anonymous.
anon bool

// For anonymous mount namespaces, originatorID is the ID of the mount
// namespace where the tree originated from. Used for permission checks.
// 0 is a special value that indicates "no permission checks required."
// All non-anonymous mount namespaces will have originatorID == 0. Some
// anonymous mount namespaces may have originatorID == 0 (such as "fresh"
// trees created using fsmount(2)).
originatorID uint64
}

// Namespace is the namespace interface.
Expand All @@ -73,6 +85,19 @@ type Namespace interface {
UserNamespace() *auth.UserNamespace
}

// newMountNamespace allocates a MountNamespace owned by owner and marks it
// anonymous when anon is true. The namespace ID is taken from the next value
// of vfs.lastMountNamespaceID. Neither Refs nor the root mount are set here;
// callers should go through one of the NewMountNamespace*() methods (or
// otherwise finish the setup themselves) rather than using the result as-is.
func (vfs *VirtualFilesystem) newMountNamespace(owner *auth.UserNamespace, anon bool) *MountNamespace {
	mntns := new(MountNamespace)
	mntns.ID = vfs.lastMountNamespaceID.Add(1)
	mntns.vfs = vfs
	mntns.Owner = owner
	mntns.mountpoints = make(map[*Dentry]uint32)
	mntns.anon = anon
	return mntns
}

// NewMountNamespace returns a new mount namespace with a root filesystem
// configured by the given arguments. A reference is taken on the returned
// MountNamespace.
Expand Down Expand Up @@ -122,12 +147,7 @@ func (vfs *VirtualFilesystem) NewMountNamespaceFrom(
nsfs NamespaceInodeGetter,
anon bool,
) *MountNamespace {
mntns := &MountNamespace{
vfs: vfs,
Owner: creds.UserNamespace,
mountpoints: make(map[*Dentry]uint32),
anon: anon,
}
mntns := vfs.newMountNamespace(creds.UserNamespace, anon)
if nsfs == nil {
refs := &namespaceDefaultRefs{destroy: mntns.Destroy}
refs.InitRefs()
Expand Down Expand Up @@ -175,12 +195,7 @@ func (vfs *VirtualFilesystem) CloneMountNamespace(
cwd *VirtualDentry,
nsfs NamespaceInodeGetter,
) (*MountNamespace, error) {
newns := &MountNamespace{
vfs: vfs,
Owner: uns,
mountpoints: make(map[*Dentry]uint32),
}

newns := vfs.newMountNamespace(uns, false)
newns.Refs = nsfs.GetNamespaceInode(ctx, newns)
vfs.lockMounts()
defer vfs.unlockMounts(ctx)
Expand All @@ -206,6 +221,72 @@ func (vfs *VirtualFilesystem) CloneMountNamespace(
return newns, nil
}

// CloneTreeToAnonNS implements open_tree(2)'s OPEN_TREE_CLONE. It copies the
// mount at fromVd (together with its descendants when recursive is true) and
// installs the copy as the root of a freshly created anonymous mount
// namespace, which is returned to the caller.
//
// taskMountNs is the calling task's mount namespace; it supplies the owner of
// the new namespace and is used for the permission checks. EINVAL is returned
// when the source mount may not be cloned by the caller, or when a
// non-recursive clone is requested on a mount with locked children.
func (vfs *VirtualFilesystem) CloneTreeToAnonNS(
	ctx context.Context,
	taskMountNs *MountNamespace,
	fromVd VirtualDentry,
	nsfs NamespaceInodeGetter,
	recursive bool,
) (*MountNamespace, error) {
	anonNS := vfs.newMountNamespace(taskMountNs.Owner, true)
	anonNS.Refs = nsfs.GetNamespaceInode(ctx, anonNS)

	// This deferred cleanup is registered before the deferred unlock below, so
	// on failure the reference is dropped only after the unlock has run.
	dropNS := cleanup.Make(func() {
		anonNS.DecRef(ctx)
	})
	defer dropNS.Clean()

	srcMnt := fromVd.mount

	vfs.lockMounts()
	defer vfs.unlockMounts(ctx)

	srcNS := srcMnt.ns

	// Record where this tree came from, for later permission checking. A
	// clone of an anonymous tree inherits that tree's originator.
	switch {
	case srcNS == nil:
		// No source namespace: originatorID stays 0.
	case srcNS.anon:
		anonNS.originatorID = srcNS.originatorID
	default:
		anonNS.originatorID = srcNS.ID
	}

	// Sanity checks

	// TODO(b/305893463): When MS_UNBINDABLE is added,
	// MS_UNBINDABLE mounts should be rejected here.

	fsName := srcMnt.Filesystem().FilesystemType().Name()
	// Unless it lives on nsfs, the source mount has to be either in the
	// task's own mount namespace or in an anonymous mount namespace that the
	// task is allowed to operate on.
	// TODO(b/513023394): when pidfd-fs is implemented, it will also be exempted.
	if srcNS != taskMountNs && fsName != nsfsName {
		if srcNS == nil || !srcNS.anonCanBeOperatedOn(taskMountNs) {
			return nil, linuxerr.EINVAL
		}
	}

	if !recursive && vfs.mountHasLockedChildren(srcMnt, fromVd) {
		return nil, linuxerr.EINVAL
	}

	var (
		clonedRoot *Mount
		err        error
	)
	if recursive {
		clonedRoot, err = vfs.cloneMountTree(ctx, srcMnt, fromVd.dentry, 0, nil)
	} else {
		clonedRoot, err = vfs.cloneMount(srcMnt, fromVd.dentry, nil, 0)
	}
	if err != nil {
		return nil, err
	}

	anonNS.root = clonedRoot
	clonedRoot.ns = anonNS
	vfs.commitChildren(ctx, clonedRoot)

	// Success: ownership of the reference passes to the caller.
	dropNS.Release()
	return anonNS, nil
}

// Destroy implements nsfs.Namespace.Destroy.
func (mntns *MountNamespace) Destroy(ctx context.Context) {
vfs := mntns.vfs
Expand Down Expand Up @@ -243,6 +324,11 @@ func (mntns *MountNamespace) TryIncRef() bool {
return mntns.Refs.TryIncRef()
}

// Anon reports whether mntns is an anonymous mount namespace, i.e. it returns
// the value of the anon field.
func (mntns *MountNamespace) Anon() bool {
	return mntns.anon
}

// Root returns mntns' root. If the root is over-mounted, it returns the top
// mount.
// May return an empty virtual dentry if mntns is an anonymous mount namespace and its root
Expand Down Expand Up @@ -285,3 +371,15 @@ func (mntns *MountNamespace) checkMountCount(ctx context.Context, mnt *Mount) er
mntns.pending += mnts
return nil
}

// anonCanBeOperatedOn reports whether mntns is an anonymous mount namespace
// that the mount namespace `by` is permitted to operate on.
//
// A non-anonymous namespace never qualifies. An anonymous namespace qualifies
// when its originatorID is 0 (no permission check required) or equals by.ID.
// `by` is only dereferenced when originatorID is non-zero.
//
// It is analogous to fs/namespace.c:check_anonymous_mnt() in Linux.
func (mntns *MountNamespace) anonCanBeOperatedOn(by *MountNamespace) bool {
	switch {
	case !mntns.anon:
		return false
	case mntns.originatorID == 0:
		return true
	default:
		return mntns.originatorID == by.ID
	}
}
Loading
Loading