diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 6f788c5eff..4ebcf89562 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -1548,7 +1548,7 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo opts = "ro" } if mnt.flags.NoATime { - opts = ",noatime" + opts += ",noatime" } if mnt.flags.NoExec { opts += ",noexec" diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 24583a5bbd..41cb9ede00 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -465,9 +465,13 @@ func lisafsNeededForDirectFSSuppression(spec *specs.Spec, mountHints *boot.PodMo // makeRPCMountOpener returns a MountOpener that opens mount sources via the // gofer-to-host RPC channel. func makeRPCMountOpener(goferToHostRPC *urpc.Client) sandboxsetup.MountOpener { - return func(m *specs.Mount) (*os.File, error) { + return func(m *specs.Mount, flags uint32) (*os.File, error) { + req := container.OpenMountArgs{ + Mount: m, + Flags: flags, + } var res container.OpenMountResult - if err := goferToHostRPC.Call("goferToHostRPC.OpenMount", m, &res); err != nil { + if err := goferToHostRPC.Call("goferToHostRPC.OpenMount", &req, &res); err != nil { return nil, fmt.Errorf("opening %s: %w", m.Source, err) } return res.Files[0], nil diff --git a/runsc/cmd/sandboxsetup/gofer_mount.go b/runsc/cmd/sandboxsetup/gofer_mount.go index 36ed7ca87c..6d9938bb9b 100644 --- a/runsc/cmd/sandboxsetup/gofer_mount.go +++ b/runsc/cmd/sandboxsetup/gofer_mount.go @@ -39,12 +39,18 @@ const ProcFDBindMount = "/proc/fs" // vfioPathDir is the directory containing VFIO device nodes. const vfioPathDir = "/dev/vfio" -// MountOpener opens a mount source when the gofer process cannot access it -// directly (e.g. due to permission restrictions in a user namespace). It -// returns the opened file for the mount source. The caller is responsible -// for closing the returned file. It may be nil if all mounts are directly -// accessible. 
-type MountOpener func(m *specs.Mount) (*os.File, error) +// MountOpener is used in two cases: +// +// (1) when the gofer process cannot access a mount source directly (e.g. +// due to permission restrictions in a user namespace). In this case, the +// returned file is just the opened mount source. +// +// (2) when the mount is id-mapped, which the gofer process cannot configure +// on its own. In this case, the returned file is an open_tree() detached mount +// point which has had the requested id mapping configured using mount_setattr(). +// +// The caller is responsible for closing the returned file. +type MountOpener func(m *specs.Mount, flags uint32) (*os.File, error) // NewSocket creates a unet.Socket from a file descriptor. // It fatally exits if the socket cannot be created. @@ -257,6 +263,82 @@ func SetupRootFS(spec *specs.Spec, conf *config.Config, mountConfs []specutils.G return nil } +func msFlagsToMountAttr(flags uint32) (attrSet uint64, attrClr uint64) { + if flags&unix.MS_RDONLY != 0 { + attrSet |= unix.MOUNT_ATTR_RDONLY + } + if flags&unix.MS_NOSUID != 0 { + attrSet |= unix.MOUNT_ATTR_NOSUID + } + if flags&unix.MS_NODEV != 0 { + attrSet |= unix.MOUNT_ATTR_NODEV + } + if flags&unix.MS_NOEXEC != 0 { + attrSet |= unix.MOUNT_ATTR_NOEXEC + } + if flags&unix.MS_NODIRATIME != 0 { + attrSet |= unix.MOUNT_ATTR_NODIRATIME + } + + if flags&unix.MS_NOATIME != 0 { + attrSet |= unix.MOUNT_ATTR_NOATIME + attrClr |= unix.MOUNT_ATTR__ATIME + } else if flags&unix.MS_RELATIME != 0 { + attrSet |= unix.MOUNT_ATTR_RELATIME + attrClr |= unix.MOUNT_ATTR__ATIME + } else if flags&unix.MS_STRICTATIME != 0 { + attrSet |= unix.MOUNT_ATTR_STRICTATIME + attrClr |= unix.MOUNT_ATTR__ATIME + } + + return attrSet, attrClr +} + +func safeSetupAndMoveMount(srcFileFD int, src, dst, procPath string) error { + fi, err := os.Stat(src) + if err != nil { + return fmt.Errorf("stat(%q) failed: %v", src, err) + } + if fi.IsDir() { + if err := os.MkdirAll(dst, 0777); err != nil { + return 
fmt.Errorf("mkdir(%q) failed: %v", dst, err) + } + } else { + parent := filepath.Dir(dst) + if err := os.MkdirAll(parent, 0777); err != nil { + return fmt.Errorf("mkdir(%q) failed: %v", parent, err) + } + f, err := os.OpenFile(dst, unix.O_CREAT, 0777) + if err != nil { + return fmt.Errorf("open(%q) failed: %v", dst, err) + } + f.Close() + } + + fd, err := unix.Open(dst, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("failed to safely move mount: Open(%s, _, _): %w", dst, err) + } + defer unix.Close(fd) + + if procPath == "" { + procPath = "/proc" + } + fdPath := fmt.Sprintf("%s/self/fd/%d", procPath, fd) + target, err := os.Readlink(fdPath) + if err != nil { + return fmt.Errorf("failed to safely move mount: Readlink(%s): %w", fdPath, err) + } + if dst != target { + return fmt.Errorf("failed to safely move mount: expected to open %s, but found %s", dst, target) + } + + if err := unix.MoveMount(srcFileFD, "", fd, "", unix.MOVE_MOUNT_F_EMPTY_PATH|unix.MOVE_MOUNT_T_EMPTY_PATH); err != nil { + return fmt.Errorf("MoveMount(%d, %q) failed: %w", srcFileFD, dst, err) + } + return nil +} + // SetupMounts bind-mounts all mounts specified in the spec in their correct // location inside root. It resolves relative paths and symlinks, and creates // directories as needed. 
@@ -288,23 +370,28 @@ func SetupMounts(conf *config.Config, mounts []specs.Mount, root, procPath strin flags |= unix.MS_RDONLY } - log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags) + log.Infof("Mounting src: %q, dst: %q, flags: %#x, idMapped: %t", m.Source, dst, flags, specutils.IsIDMappedMount(m)) src := m.Source var srcFile *os.File - if err := unix.Access(src, unix.R_OK); err != nil { + if specutils.IsIDMappedMount(m) || unix.Access(src, unix.R_OK) != nil { if mountOpener == nil { - return fmt.Errorf("cannot access mount source %q and no mount opener provided: %v", src, err) + return fmt.Errorf("cannot access mount source %q (or id mapped) and no mount opener provided", src) } // The current process doesn't have enough permissions - // to open the mount, so let's try to open it via the + // to open the mount (or it is ID mapped), so let's try to open it via the // caller-provided opener. - srcFile, err = mountOpener(&m) + srcFile, err = mountOpener(&m, flags) if err != nil { return fmt.Errorf("opening %s: %w", m.Source, err) } src = fmt.Sprintf("%s/self/fd/%d", procPath, srcFile.Fd()) } - err = specutils.SafeSetupAndMount(src, dst, m.Type, flags, procPath) + + if specutils.IsIDMappedMount(m) { + err = safeSetupAndMoveMount(int(srcFile.Fd()), src, dst, procPath) + } else { + err = specutils.SafeSetupAndMount(src, dst, m.Type, flags, procPath) + } if srcFile != nil { srcFile.Close() } @@ -319,7 +406,7 @@ func SetupMounts(conf *config.Config, mounts []specs.Mount, root, procPath strin defer unix.Close(dstFD) // Apply mount options after creating all mount points. // Otherwise they can be remounted into read-only. - defer func(dstFD int, flags uint32, dst string) { + defer func(dstFD int, flags uint32, dst string, isIDMapped bool) { path := fmt.Sprintf("/proc/self/fd/%d", dstFD) // The gofer process doesn't execute anything natively. 
flags |= unix.MS_NOSUID @@ -345,18 +432,34 @@ func SetupMounts(conf *config.Config, mounts []specs.Mount, root, procPath strin lockedFlags |= uint32(f.ms) } } - if lockedFlags&unix.MS_NOATIME|unix.MS_RELATIME == 0 { + if lockedFlags&(unix.MS_NOATIME|unix.MS_RELATIME) == 0 { lockedFlags |= unix.MS_STRICTATIME } - // The previous SafeSetupAndMount creates a new bind-mount, but - // it doesn't change mount flags. A separate MS_BIND|MS_REMOUNT - // has to be done to apply the mount options. - if err := unix.Mount("", path, "", uintptr(flags|lockedFlags|unix.MS_REMOUNT), ""); err != nil { - retErr = fmt.Errorf("mount dst: %q, flags: %#x, err: %v", dst, flags, err) - return + if isIDMapped { + attrSet, attrClr := msFlagsToMountAttr(flags | lockedFlags) + setattrFlags := uint(unix.AT_EMPTY_PATH) + if flags&unix.MS_REC != 0 { + setattrFlags |= unix.AT_RECURSIVE + } + attr := &unix.MountAttr{ + Attr_set: attrSet, + Attr_clr: attrClr, + } + if err := unix.MountSetattr(dstFD, "", setattrFlags, attr); err != nil { + retErr = fmt.Errorf("mount_setattr dst: %q, flags: %#x, err: %v", dst, flags, err) + return + } + } else { + // The previous SafeSetupAndMount creates a new bind-mount, but + // it doesn't change mount flags. A separate MS_BIND|MS_REMOUNT + // has to be done to apply the mount options. + if err := unix.Mount("", path, "", uintptr(flags|lockedFlags|unix.MS_REMOUNT), ""); err != nil { + retErr = fmt.Errorf("mount dst: %q, flags: %#x, err: %v", dst, flags, err) + return + } } - }(dstFD, flags, dst) + }(dstFD, flags, dst, specutils.IsIDMappedMount(m)) // Set propagation options that cannot be set together with other options. 
flags = specutils.PropOptionsToFlags(m.Options) diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 62aa9d72d6..44165034a6 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -76,6 +76,7 @@ go_test( deps = [ "//pkg/abi/linux", "//pkg/cleanup", + "//pkg/hostos", "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/fsimpl/erofs", diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index cc435019ce..fadffd0313 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -38,6 +38,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cleanup" + "gvisor.dev/gvisor/pkg/hostos" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fsimpl/erofs" @@ -81,10 +82,10 @@ func execute(conf *config.Config, cont *Container, name string, arg ...string) ( // executeCombinedOutput executes a process in the container and captures // stdout and stderr. If execFile is supplied, a host file will be executed. // Otherwise, the name argument is used to resolve the executable in the guest. 
-func executeCombinedOutput(conf *config.Config, cont *Container, execFile *os.File, name string, arg ...string) ([]byte, error) { +func executeCombinedOutputWithStatus(conf *config.Config, cont *Container, execFile *os.File, name string, arg ...string) ([]byte, int, error) { r, w, err := os.Pipe() if err != nil { - return nil, err + return nil, 0, err } defer r.Close() @@ -102,14 +103,29 @@ func executeCombinedOutput(conf *config.Config, cont *Container, execFile *os.Fi ws, err := cont.executeSync(conf, args) w.Close() if err != nil { - return nil, err + return nil, 0, err } + if !ws.Exited() { + return nil, 0, fmt.Errorf("process did not exit properly") + } + status := ws.ExitStatus() out, err := io.ReadAll(r) - switch { - case ws != 0 && err != nil: - err = fmt.Errorf("exec failed, status: %v, io.ReadAll failed: %v", ws, err) - case ws != 0: - err = fmt.Errorf("exec failed, status: %v", ws) + if err != nil { + return nil, status, err + } + + return out, status, err +} + +// executeCombinedOutput executes a process in the container and captures +// stdout and stderr. If execFile is supplied, a host file will be executed. +// Otherwise, the name argument is used to resolve the executable in the guest. +func executeCombinedOutput(conf *config.Config, cont *Container, execFile *os.File, name string, arg ...string) ([]byte, error) { + out, status, err := executeCombinedOutputWithStatus(conf, cont, execFile, name, arg...) 
+ if status != 0 && err != nil { + err = fmt.Errorf("exec failed, status: %v, err: %v", status, err) + } else if status != 0 { + err = fmt.Errorf("exec failed, status: %v", status) } return out, err } @@ -2042,6 +2058,617 @@ func TestReadonlyMount(t *testing.T) { } } +func TestIDMappedMount(t *testing.T) { + for name, conf := range configs(t, false /* noOverlay */) { + t.Run(name, func(t *testing.T) { + kernelVersion, err := hostos.KernelVersion() + if err != nil { + t.Fatalf("Failed to check kernel version: %v", err) + } + if !kernelVersion.AtLeast(6, 3) { + t.Skipf("Skipping as kernel >=6.3 is required for id mapped mount tests") + } + + dir, err := os.MkdirTemp(testutil.TmpDir(), "id-mapped-mount") + if err != nil { + t.Fatalf("os.MkdirTemp() failed: %v", err) + } + if err := unix.Mount("tmpfs", dir, "tmpfs", 0, ""); err != nil { + t.Fatalf("mount tmpfs failed: %v", err) + } + defer unix.Unmount(dir, unix.MNT_DETACH) + + // Create a file owned by 0:0 + testFilePath := filepath.Join(dir, "test-file") + f, err := os.OpenFile(testFilePath, os.O_RDONLY|os.O_CREATE, 0644) + if err != nil { + t.Fatalf("os.OpenFile() failed: %+v", err) + } + err = f.Chown(0, 0) + if err != nil { + t.Fatalf("chown failed: %v", err) + } + f.Close() + + spec, _ := sleepSpecConf(t) + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: dir, + Source: dir, + Type: "bind", + Options: []string{"rbind", "idmap", "rw"}, + // Note that in a mount id-mapping context, the meaning of ContainerID + // and HostID is flipped from what the names would suggest. 
+ UIDMappings: []specs.LinuxIDMapping{{ + ContainerID: 100000, + HostID: 0, + Size: 1, + }}, + GIDMappings: []specs.LinuxIDMapping{{ + ContainerID: 100000, + HostID: 0, + Size: 1, + }}, + }) + spec.Root.Readonly = false + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Verify that [dir]/test-file appears to be owned by nobody:nogroup + // in the container, since 0:0 on the host is not mapped in the mount's + // id mapping. + // Unfortunately, we cannot test with a valid mapping, since the lack of + // CAP_SETUID and CAP_SETGID in the test environment means we only have + // one uid-gid pair available to us. 
+ expectedOwner := "65534:65534" + cmd := fmt.Sprintf("stat -c '%%u:%%g' '%s'", testFilePath) + out, err := executeCombinedOutput(conf, c, nil, "/bin/sh", "-c", cmd) + if err != nil { + t.Fatalf("exec failed, out: %v, err: %v", string(out), err) + } + outStr := strings.TrimSpace(string(out)) + if outStr != expectedOwner { + t.Fatalf("file should have been owned by %s, instead owned by %v", expectedOwner, outStr) + } + }) + } +} + +func TestIDMappedSubMount(t *testing.T) { + for name, conf := range configs(t, false /* noOverlay */) { + t.Run(name, func(t *testing.T) { + kernelVersion, err := hostos.KernelVersion() + if err != nil { + t.Fatalf("Failed to check kernel version: %v", err) + } + if !kernelVersion.AtLeast(6, 3) { + t.Skipf("Skipping as kernel >=6.3 is required for id mapped mount tests") + } + + dir, err := os.MkdirTemp(testutil.TmpDir(), "id-mapped-mount") + if err != nil { + t.Fatalf("os.MkdirTemp() failed: %v", err) + } + err = unix.Mount("tmpfs", dir, "tmpfs", 0, "") + if err != nil { + t.Fatalf("mount tmpfs failed: %v", err) + } + defer unix.Unmount(dir, unix.MNT_DETACH) + + // Make a submount + subDir := filepath.Join(dir, "sub-mount") + err = os.Mkdir(subDir, 0777) + if err != nil { + t.Fatalf("os.Mkdir() failed: %v", err) + } + err = unix.Mount("tmpfs", subDir, "tmpfs", 0, "") + if err != nil { + t.Fatalf("mount tmpfs failed: %v", err) + } + defer unix.Unmount(subDir, unix.MNT_DETACH) + + // Create a file owned by 0:0 + testFilePath := filepath.Join(subDir, "test-file") + f, err := os.OpenFile(testFilePath, os.O_RDONLY|os.O_CREATE, 0644) + if err != nil { + t.Fatalf("os.OpenFile() failed: %+v", err) + } + err = f.Chown(0, 0) + if err != nil { + t.Fatalf("chown failed: %v", err) + } + f.Close() + + spec, _ := sleepSpecConf(t) + ridmapPath := filepath.Join(dir, "ridmap") + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: ridmapPath, + Source: dir, + Type: "bind", + // ridmap: recursively apply id mapping + Options: []string{"rbind", 
"ridmap", "rw"}, + UIDMappings: []specs.LinuxIDMapping{{ + ContainerID: 100000, + HostID: 0, + Size: 1, + }}, + GIDMappings: []specs.LinuxIDMapping{{ + ContainerID: 100000, + HostID: 0, + Size: 1, + }}, + }) + idmapPath := filepath.Join(dir, "idmap") + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: idmapPath, + Source: dir, + Type: "bind", + // idmap: only apply id mapping at the top level + Options: []string{"rbind", "idmap", "rw"}, + UIDMappings: []specs.LinuxIDMapping{{ + ContainerID: 100000, + HostID: 0, + Size: 1, + }}, + GIDMappings: []specs.LinuxIDMapping{{ + ContainerID: 100000, + HostID: 0, + Size: 1, + }}, + }) + spec.Root.Readonly = false + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Verify that [dir]/ridmap/sub-mount/test-file appears to be owned by + // nobody:nogroup in the container. + expectedOwner := "65534:65534" + cmd := fmt.Sprintf("stat -c '%%u:%%g' '%s'", filepath.Join(ridmapPath, "sub-mount", "test-file")) + out, err := executeCombinedOutput(conf, c, nil, "/bin/sh", "-c", cmd) + if err != nil { + t.Fatalf("exec failed, out: %v, err: %v", string(out), err) + } + outStr := strings.TrimSpace(string(out)) + if outStr != expectedOwner { + t.Fatalf("file should have been owned by %s, instead owned by %v", expectedOwner, outStr) + } + + // Verify that [dir]/idmap/sub-mount/test-file appears to be owned by + // 0:0 in the container (since id mapping shouldn't apply to the submount). 
+ expectedOwner = "0:0" + cmd = fmt.Sprintf("stat -c '%%u:%%g' '%s'", filepath.Join(idmapPath, "sub-mount", "test-file")) + out, err = executeCombinedOutput(conf, c, nil, "/bin/sh", "-c", cmd) + if err != nil { + t.Fatalf("exec failed, out: %v, err: %v", string(out), err) + } + outStr = strings.TrimSpace(string(out)) + if outStr != expectedOwner { + t.Fatalf("file should have been owned by %s, instead owned by %v", expectedOwner, outStr) + } + }) + } +} + +func TestIDMappedMountFlags(t *testing.T) { + for name, conf := range configs(t, false /* noOverlay */) { + t.Run(name, func(t *testing.T) { + kernelVersion, err := hostos.KernelVersion() + if err != nil { + t.Fatalf("Failed to check kernel version: %v", err) + } + if !kernelVersion.AtLeast(6, 3) { + t.Skipf("Skipping as kernel >=6.3 is required for id mapped mount tests") + } + + dir, err := os.MkdirTemp(testutil.TmpDir(), "id-mapped-mount-flags") + if err != nil { + t.Fatalf("os.MkdirTemp() failed: %v", err) + } + if err := unix.Mount("tmpfs", dir, "tmpfs", 0, ""); err != nil { + t.Fatalf("mount tmpfs failed: %v", err) + } + defer unix.Unmount(dir, unix.MNT_DETACH) + + dir1 := filepath.Join(dir, "mnt1") + dir2 := filepath.Join(dir, "mnt2") + if err := os.MkdirAll(dir1, 0777); err != nil { + t.Fatalf("os.MkdirAll() failed: %v", err) + } + if err := os.MkdirAll(dir2, 0777); err != nil { + t.Fatalf("os.MkdirAll() failed: %v", err) + } + + // Create a file in dir1 owned by 0:0 and a script/binary + testFilePath1 := filepath.Join(dir1, "test-file") + f, err := os.OpenFile(testFilePath1, os.O_RDONLY|os.O_CREATE, 0644) + if err != nil { + t.Fatalf("os.OpenFile() failed: %+v", err) + } + err = f.Chown(0, 0) + if err != nil { + t.Fatalf("chown failed: %v", err) + } + f.Close() + + execFilePath1 := filepath.Join(dir1, "test-exec") + f, err = os.OpenFile(execFilePath1, os.O_WRONLY|os.O_CREATE, 0755) + if err != nil { + t.Fatalf("os.OpenFile() failed: %+v", err) + } + if _, err := f.WriteString("#!/bin/sh\necho hello\n"); 
err != nil { + t.Fatalf("write failed: %v", err) + } + err = f.Chown(0, 0) + if err != nil { + t.Fatalf("chown failed: %v", err) + } + f.Close() + + spec, _ := sleepSpecConf(t) + // Test with two mounts, since noatime and relatime are mutually exclusive. + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: dir1, + Source: dir1, + Type: "bind", + Options: []string{"rbind", "idmap", "ro", "noexec", "noatime", "nodiratime"}, + UIDMappings: []specs.LinuxIDMapping{{ + ContainerID: 100000, + HostID: 0, + Size: 1, + }}, + GIDMappings: []specs.LinuxIDMapping{{ + ContainerID: 100000, + HostID: 0, + Size: 1, + }}, + }) + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: dir2, + Source: dir2, + Type: "bind", + Options: []string{"rbind", "idmap", "rw", "relatime"}, + UIDMappings: []specs.LinuxIDMapping{{ + ContainerID: 100000, + HostID: 0, + Size: 1, + }}, + GIDMappings: []specs.LinuxIDMapping{{ + ContainerID: 100000, + HostID: 0, + Size: 1, + }}, + }) + spec.Root.Readonly = false + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Verify ro inside container on dir1 + _, status, err := executeCombinedOutputWithStatus(conf, c, nil, "/bin/touch", filepath.Join(dir1, "test-ro")) + if err != nil { + t.Fatalf("execSync touch failed: %v", err) + } + if status == 0 { + t.Fatalf("touch on ro mount got exit status 0, want != 0") + } + + // Verify noexec inside container on dir1 + _, _, err = executeCombinedOutputWithStatus(conf, c, nil, execFilePath1) + if err == nil || !strings.Contains(err.Error(), "permission denied") { + t.Fatalf("execution on 
noexec mount got err: %v, want 'permission denied'", err) + } + + // Verify mount options in gofer mountinfo + mountinfo, err := os.ReadFile(fmt.Sprintf("/proc/%d/mountinfo", c.GoferPid.Load())) + if err != nil { + t.Fatalf("failed to read gofer mountinfo: %v", err) + } + + verifyMountinfo := func(dest string, wantOpts []string) { + var foundLine string + for _, line := range strings.Split(string(mountinfo), "\n") { + fields := strings.Fields(line) + if len(fields) >= 6 && fields[4] == dest { + foundLine = line + break + } + } + if foundLine == "" { + t.Fatalf("mount point ending with %s not found in gofer mountinfo:\n%s", dest, string(mountinfo)) + } + fields := strings.Fields(foundLine) + optsMap := make(map[string]bool) + for _, opt := range strings.Split(fields[5], ",") { + optsMap[opt] = true + } + for _, want := range wantOpts { + if !optsMap[want] { + t.Fatalf("mount point ending with %s (line: %q) missing expected option %q", dest, foundLine, want) + } + } + } + + verifyMountinfo(dir1, []string{"ro", "noexec", "noatime", "nodiratime"}) + mnt2Want := []string{"rw", "relatime"} + if strings.Contains(name, "-overlay") { + mnt2Want = []string{"ro", "relatime"} + } + verifyMountinfo(dir2, mnt2Want) + }) + } +} + +func TestIDMappedMountPropagation(t *testing.T) { + for name, conf := range configs(t, false /* noOverlay */) { + t.Run(name, func(t *testing.T) { + kernelVersion, err := hostos.KernelVersion() + if err != nil { + t.Fatalf("Failed to check kernel version: %v", err) + } + if !kernelVersion.AtLeast(6, 3) { + t.Skipf("Skipping as kernel >=6.3 is required for id mapped mount tests") + } + + dir, err := os.MkdirTemp(testutil.TmpDir(), "id-mapped-mount-prop") + if err != nil { + t.Fatalf("os.MkdirTemp() failed: %v", err) + } + if err := unix.Mount("tmpfs", dir, "tmpfs", 0, ""); err != nil { + t.Fatalf("mount tmpfs failed: %v", err) + } + defer unix.Unmount(dir, unix.MNT_DETACH) + + if err := unix.Mount("", dir, "", unix.MS_SHARED, ""); err != nil { + 
t.Fatalf("make-shared failed: %v", err) + } + + subDir := filepath.Join(dir, "sub") + if err := os.MkdirAll(filepath.Join(subDir, "new-tmpfs"), 0777); err != nil { + t.Fatalf("mkdir failed: %v", err) + } + if err := os.MkdirAll(filepath.Join(subDir, "private-tmpfs"), 0777); err != nil { + t.Fatalf("mkdir failed: %v", err) + } + + slavePath := filepath.Join(dir, "slave") + privatePath := filepath.Join(dir, "private") + + spec, _ := sleepSpecConf(t) + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: slavePath, + Source: dir, + Type: "bind", + Options: []string{"rbind", "ridmap", "rslave", "rw"}, + UIDMappings: []specs.LinuxIDMapping{{ + ContainerID: 100000, + HostID: 0, + Size: 1, + }}, + GIDMappings: []specs.LinuxIDMapping{{ + ContainerID: 100000, + HostID: 0, + Size: 1, + }}, + }) + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: privatePath, + Source: dir, + Type: "bind", + Options: []string{"rbind", "ridmap", "rprivate", "rw"}, + UIDMappings: []specs.LinuxIDMapping{{ + ContainerID: 100000, + HostID: 0, + Size: 1, + }}, + GIDMappings: []specs.LinuxIDMapping{{ + ContainerID: 100000, + HostID: 0, + Size: 1, + }}, + }) + spec.Root.Readonly = false + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Mount tmpfs on host + newTmpfsPath := filepath.Join(subDir, "new-tmpfs") + if err := unix.Mount("tmpfs", newTmpfsPath, "tmpfs", 0, ""); err != nil { + t.Fatalf("mount tmpfs failed: %v", err) + } + defer unix.Unmount(newTmpfsPath, unix.MNT_DETACH) + + privateTmpfsPath := filepath.Join(subDir, "private-tmpfs") + if err := 
unix.Mount("tmpfs", privateTmpfsPath, "tmpfs", 0, ""); err != nil { + t.Fatalf("mount tmpfs failed: %v", err) + } + defer unix.Unmount(privateTmpfsPath, unix.MNT_DETACH) + + // Create a file in each tmpfs to verify propagation + if err := os.WriteFile(filepath.Join(newTmpfsPath, "prop-file"), []byte("data"), 0644); err != nil { + t.Fatalf("write file failed: %v", err) + } + if err := os.WriteFile(filepath.Join(privateTmpfsPath, "prop-file"), []byte("data"), 0644); err != nil { + t.Fatalf("write file failed: %v", err) + } + + // Verify slavePath sees prop-file + cmd := fmt.Sprintf("test -f '%s'", filepath.Join(slavePath, "sub", "new-tmpfs", "prop-file")) + _, status, err := executeCombinedOutputWithStatus(conf, c, nil, "/bin/sh", "-c", cmd) + if err != nil || status != 0 { + t.Fatalf("execSync stat failed (expected prop-file to exist in rslave mount), ws: %v, err: %v", status, err) + } + + // Verify privatePath does NOT see prop-file + cmd = fmt.Sprintf("test -f '%s'", filepath.Join(privatePath, "sub", "private-tmpfs", "prop-file")) + _, status, err = executeCombinedOutputWithStatus(conf, c, nil, "/bin/sh", "-c", cmd) + if err != nil { + t.Fatalf("execSync stat failed unexpectedly: %v", err) + } + if status == 0 { + t.Fatalf("stat on rprivate mount got exit status %d, want != 0", status) + } + }) + } +} + +func TestIDMappedMountFile(t *testing.T) { + for name, conf := range configs(t, false /* noOverlay */) { + t.Run(name, func(t *testing.T) { + kernelVersion, err := hostos.KernelVersion() + if err != nil { + t.Fatalf("Failed to check kernel version: %v", err) + } + if !kernelVersion.AtLeast(6, 3) { + t.Skipf("Skipping as kernel >=6.3 is required for id mapped mount tests") + } + + dir, err := os.MkdirTemp(testutil.TmpDir(), "id-mapped-mount-file") + if err != nil { + t.Fatalf("os.MkdirTemp() failed: %v", err) + } + if err := unix.Mount("tmpfs", dir, "tmpfs", 0, ""); err != nil { + t.Fatalf("mount tmpfs failed: %v", err) + } + defer unix.Unmount(dir, 
unix.MNT_DETACH) + + // Create a file owned by 0:0 + hostFilePath := filepath.Join(dir, "host-file") + f, err := os.OpenFile(hostFilePath, os.O_RDONLY|os.O_CREATE, 0644) + if err != nil { + t.Fatalf("os.OpenFile() failed: %+v", err) + } + err = f.Chown(0, 0) + if err != nil { + t.Fatalf("chown failed: %v", err) + } + f.Close() + + targetFilePath := filepath.Join(dir, "target-file") + + spec, _ := sleepSpecConf(t) + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: targetFilePath, + Source: hostFilePath, + Type: "bind", + Options: []string{"rbind", "idmap", "rw"}, + UIDMappings: []specs.LinuxIDMapping{{ + ContainerID: 100000, + HostID: 0, + Size: 1, + }}, + GIDMappings: []specs.LinuxIDMapping{{ + ContainerID: 100000, + HostID: 0, + Size: 1, + }}, + }) + spec.Root.Readonly = false + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Verify that targetFilePath appears to be owned by nobody:nogroup + expectedOwner := "65534:65534" + cmd := fmt.Sprintf("stat -c '%%u:%%g' '%s'", targetFilePath) + out, err := executeCombinedOutput(conf, c, nil, "/bin/sh", "-c", cmd) + if err != nil { + t.Fatalf("exec failed, out: %v, err: %v", string(out), err) + } + outStr := strings.TrimSpace(string(out)) + if outStr != expectedOwner { + t.Fatalf("file should have been owned by %s, instead owned by %v", expectedOwner, outStr) + } + }) + } +} + func TestUIDMap(t *testing.T) { for name, conf := range configs(t, true /* noOverlay */) { t.Run(name, func(t *testing.T) { diff --git a/runsc/container/gofer_to_host_rpc.go b/runsc/container/gofer_to_host_rpc.go index 
9873c3be5b..2480757f60 100644 --- a/runsc/container/gofer_to_host_rpc.go +++ b/runsc/container/gofer_to_host_rpc.go @@ -18,15 +18,24 @@ import ( "fmt" "os" "runtime" + "slices" "sync" + "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/runsc/specutils" ) +// OpenMountArgs represents a mount to be opened along with its flags. +type OpenMountArgs struct { + Mount *specs.Mount + Flags uint32 +} + type openMountRequest struct { - mount *specs.Mount + args *OpenMountArgs result *OpenMountResult done chan error } @@ -45,13 +54,122 @@ type OpenMountResult struct { urpc.FilePayload } +func createIDMappedUserNS(uidMappings, gidMappings []specs.LinuxIDMapping) (*os.File, error) { + var sysUIDMaps []syscall.SysProcIDMap + for _, m := range uidMappings { + sysUIDMaps = append(sysUIDMaps, syscall.SysProcIDMap{ + ContainerID: int(m.ContainerID), + HostID: int(m.HostID), + Size: int(m.Size), + }) + } + + var sysGIDMaps []syscall.SysProcIDMap + for _, m := range gidMappings { + sysGIDMaps = append(sysGIDMaps, syscall.SysProcIDMap{ + ContainerID: int(m.ContainerID), + HostID: int(m.HostID), + Size: int(m.Size), + }) + } + + proc, err := os.StartProcess("/proc/self/exe", []string{"runsc[getUsernsFD]"}, &os.ProcAttr{ + Sys: &syscall.SysProcAttr{ + Cloneflags: unix.CLONE_NEWUSER, + UidMappings: sysUIDMaps, + GidMappings: sysGIDMaps, + Ptrace: true, + Pdeathsig: syscall.SIGKILL, + }, + }) + if err != nil { + return nil, fmt.Errorf("failed to start helper process for userns unshare: %w", err) + } + + defer func() { + proc.Kill() + proc.Wait() + }() + + // Ensure the Go runtime is using pidfds, which are required for the + // proc.Signal() call to correctly guarantee we opened the usernsfd + // for the right process. + // + // Technically, we could fetch the handle from the pidfd directly + // using the PIDFD_GET_USER_NAMESPACE ioctl, but this was only added + // in kernel 6.11. 
+ pidfdUsed := false + proc.WithHandle(func(pidfd uintptr) { + pidfdUsed = true + }) + if !pidfdUsed { + return nil, fmt.Errorf("failed to refer to userns helper process as pidfds are not supported") + } + + usernsFD, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid)) + if err != nil { + return nil, fmt.Errorf("failed to open user namespace descriptor for child PID %d: %w", proc.Pid, err) + } + + if err := proc.Signal(syscall.Signal(0)); err != nil { + usernsFD.Close() + return nil, fmt.Errorf("failed to verify userns helper process validity: %w", err) + } + + return usernsFD, nil +} + +func openIDMappedMount(req *OpenMountArgs) (*os.File, error) { + usernsFD, err := createIDMappedUserNS(req.Mount.UIDMappings, req.Mount.GIDMappings) + if err != nil { + return nil, err + } + defer usernsFD.Close() + + openTreeFlags := uint(unix.OPEN_TREE_CLONE | unix.OPEN_TREE_CLOEXEC) + if req.Flags&unix.MS_REC != 0 { + openTreeFlags |= unix.AT_RECURSIVE + } + + fd, err := unix.OpenTree(unix.AT_FDCWD, req.Mount.Source, openTreeFlags) + if err != nil { + return nil, fmt.Errorf("open_tree(%q) failed: %w", req.Mount.Source, err) + } + + setattrFlags := uint(unix.AT_EMPTY_PATH) + if slices.Contains(req.Mount.Options, "ridmap") { + setattrFlags |= unix.AT_RECURSIVE + } + + attr := &unix.MountAttr{ + Attr_set: unix.MOUNT_ATTR_IDMAP, + Userns_fd: uint64(usernsFD.Fd()), + } + + if err := unix.MountSetattr(fd, "", setattrFlags, attr); err != nil { + unix.Close(fd) + return nil, fmt.Errorf("mount_setattr(%q) failed: %w", req.Mount.Source, err) + } + + return os.NewFile(uintptr(fd), req.Mount.Source), nil +} + func (rpc *goferToHostRPC) handleRequest(req *openMountRequest) { defer close(req.done) - fd, err := os.OpenFile(req.mount.Source, unix.O_PATH|unix.O_CLOEXEC, 0) + + var fd *os.File + var err error + + if specutils.IsIDMappedMount(*req.args.Mount) { + fd, err = openIDMappedMount(req.args) + } else { + fd, err = os.OpenFile(req.args.Mount.Source, unix.O_PATH|unix.O_CLOEXEC, 0) + } 
if err != nil { req.done <- err return } + req.result.Files = []*os.File{fd} } @@ -75,7 +193,7 @@ func (rpc *goferToHostRPC) openMountThread() error { // OpenMount opens a specified mount and returns a file descriptor to it. It is // used when the mount isn't accessible from the gofer user namespace. -func (rpc *goferToHostRPC) OpenMount(m *specs.Mount, res *OpenMountResult) error { +func (rpc *goferToHostRPC) OpenMount(reqArg *OpenMountArgs, res *OpenMountResult) error { rpc.mu.Lock() defer rpc.mu.Unlock() @@ -96,7 +214,7 @@ func (rpc *goferToHostRPC) OpenMount(m *specs.Mount, res *OpenMountResult) error }() } req := openMountRequest{ - mount: m, + args: reqArg, result: res, done: make(chan error), } diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 72e30f916f..de7c345108 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -549,6 +549,11 @@ func HasMountConfig(m specs.Mount) bool { return IsGoferMount(m) || IsErofsMount(m) } +// IsIDMappedMount returns true if the given mount has a UID and GID mapping. +func IsIDMappedMount(m specs.Mount) bool { + return len(m.UIDMappings) > 0 && len(m.GIDMappings) > 0 +} + // MaybeConvertToBindMount converts mount type to "bind" in case any of the // mount options are either "bind" or "rbind" as required by the OCI spec. //