diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index faeeb0ee2b..6678f92ca5 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -5,6 +5,17 @@ package(default_applicable_licenses = ["//:license"]) licenses(["notice"]) +go_template_instance( + name = "inode_refs", + out = "inode_refs.go", + package = "gofer", + prefix = "inode", + template = "//pkg/refs:refs_template", + types = { + "T": "inode", + }, +) + go_template_instance( name = "string_list", out = "string_list.go", @@ -56,16 +67,17 @@ go_template_instance( go_library( name = "gofer", srcs = [ - "dentry_impl.go", "dentry_list.go", - "directfs_dentry.go", + "directfs_inode.go", "directory.go", "filesystem.go", "fstree.go", "gofer.go", "handle.go", "host_named_pipe.go", - "lisafs_dentry.go", + "inode_impl.go", + "inode_refs.go", + "lisafs_inode.go", "regular_file.go", "revalidate.go", "save_restore.go", @@ -112,6 +124,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/memxattr", "//pkg/sync", "//pkg/syserr", "//pkg/unet", diff --git a/pkg/sentry/fsimpl/gofer/directfs_dentry.go b/pkg/sentry/fsimpl/gofer/directfs_inode.go similarity index 60% rename from pkg/sentry/fsimpl/gofer/directfs_dentry.go rename to pkg/sentry/fsimpl/gofer/directfs_inode.go index bdec2f196f..8a571083cf 100644 --- a/pkg/sentry/fsimpl/gofer/directfs_dentry.go +++ b/pkg/sentry/fsimpl/gofer/directfs_inode.go @@ -22,7 +22,6 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fsutil" "gvisor.dev/gvisor/pkg/lisafs" @@ -67,29 +66,29 @@ func tryOpen(open func(int) (int, error)) (int, error) { return -1, err } -// getDirectfsRootDentry creates a new dentry representing the root dentry for -// this mountpoint. 
+// newDirectfsInode creates a new inode representing the given file.
// Initialize the inode before use and before setting fields.
ret.size.Store(uint64(stat.Size)) + ret.blockSize.Store(uint32(stat.Blksize)) + ret.atime.Store(dentryTimestampFromUnix(stat.Atim)) + ret.mtime.Store(dentryTimestampFromUnix(stat.Mtim)) + ret.ctime.Store(dentryTimestampFromUnix(stat.Ctim)) + ret.inoKey = inoKey + } + return ret, nil } // Precondition: fs.renameMu is locked. -func (d *directfsDentry) openHandle(ctx context.Context, flags uint32) (handle, error) { +func (i *directfsInode) openHandle(ctx context.Context, flags uint32, d *dentry) (handle, error) { parent := d.parent.Load() if parent == nil { // This is a mount point. We don't have parent. Fallback to using lisafs. - if !d.controlFDLisa.Ok() { - panic("directfsDentry.controlFDLisa is not set for mount point dentry") + if !i.controlFDLisa.Ok() { + panic("directfsInode.controlFDLisa is not set for mount point dentry") } - openFD, hostFD, err := d.controlFDLisa.OpenAt(ctx, flags) + openFD, hostFD, err := i.controlFDLisa.OpenAt(ctx, flags) if err != nil { return noHandle, err } - d.fs.client.CloseFD(ctx, openFD, true /* flush */) + i.inode.fs.client.CloseFD(ctx, openFD, true /* flush */) if hostFD < 0 { log.Warningf("gofer did not donate an FD for mount point") return noHandle, unix.EIO @@ -170,7 +174,7 @@ func (d *directfsDentry) openHandle(ctx context.Context, flags uint32) (handle, // The only way to re-open an FD with different flags is via procfs or // openat(2) from the parent. Procfs does not exist here. So use parent. flags |= hostOpenFlags - openFD, err := unix.Openat(parent.impl.(*directfsDentry).controlFD, d.name, int(flags), 0) + openFD, err := unix.Openat(parent.inode.impl.(*directfsInode).controlFD, d.name, int(flags), 0) if err != nil { return noHandle, err } @@ -178,10 +182,10 @@ func (d *directfsDentry) openHandle(ctx context.Context, flags uint32) (handle, } // Precondition: fs.renameMu is locked. 
-func (d *directfsDentry) ensureLisafsControlFD(ctx context.Context) error { - d.handleMu.Lock() - defer d.handleMu.Unlock() - if d.controlFDLisa.Ok() { +func (i *directfsInode) ensureLisafsControlFD(ctx context.Context, d *dentry) error { + i.handleMu.Lock() + defer d.inode.handleMu.Unlock() + if d.inode.impl.(*directfsInode).controlFDLisa.Ok() { return nil } @@ -189,9 +193,9 @@ func (d *directfsDentry) ensureLisafsControlFD(ctx context.Context) error { root := d for root.parent.Load() != nil { names = append(names, root.name) - root = root.parent.Load().impl.(*directfsDentry) + root = root.parent.Load() } - if !root.controlFDLisa.Ok() { + if !root.inode.impl.(*directfsInode).controlFDLisa.Ok() { panic("controlFDLisa is not set for mount point dentry") } if len(names) == 0 { @@ -202,7 +206,7 @@ func (d *directfsDentry) ensureLisafsControlFD(ctx context.Context) error { for i := 0; i < len(names)/2; i++ { names[i], names[last-i] = names[last-i], names[i] } - status, inodes, err := root.controlFDLisa.WalkMultiple(ctx, names) + status, inodes, err := root.inode.impl.(*directfsInode).controlFDLisa.WalkMultiple(ctx, names) if err != nil { return err } @@ -210,7 +214,7 @@ func (d *directfsDentry) ensureLisafsControlFD(ctx context.Context) error { // Close everything except for inodes[last] if it exists. for i := 0; i < len(inodes) && i < last; i++ { flush := i == last-1 || i == len(inodes)-1 - d.fs.client.CloseFD(ctx, inodes[i].ControlFD, flush) + d.inode.fs.client.CloseFD(ctx, inodes[i].ControlFD, flush) } }() switch status { @@ -220,16 +224,16 @@ func (d *directfsDentry) ensureLisafsControlFD(ctx context.Context) error { log.Warningf("intermediate path component was a symlink? 
names = %v, inodes = %+v", names, inodes) return unix.ELOOP case lisafs.WalkSuccess: - d.controlFDLisa = d.fs.client.NewFD(inodes[last].ControlFD) + d.inode.impl.(*directfsInode).controlFDLisa = d.inode.fs.client.NewFD(inodes[last].ControlFD) return nil } panic("unreachable") } -// Precondition: d.metadataMu must be locked. +// Precondition: i.inode.metadataMu must be locked. // -// +checklocks:d.metadataMu -func (d *directfsDentry) updateMetadataLocked(h handle) error { +// +checklocks:i.inode.metadataMu +func (i *directfsInode) updateMetadataLocked(h handle) error { handleMuRLocked := false if h.fd < 0 { // Use open FDs in preferenece to the control FD. Control FDs may be opened @@ -238,17 +242,17 @@ func (d *directfsDentry) updateMetadataLocked(h handle) error { // filesystem implementations may update a writable FD's metadata after // writes, without making metadata updates immediately visible to read-only // FDs representing the same file. - d.handleMu.RLock() + i.inode.handleMu.RLock() switch { - case d.writeFD.RacyLoad() >= 0: - h.fd = d.writeFD.RacyLoad() + case i.inode.writeFD.RacyLoad() >= 0: + h.fd = i.inode.writeFD.RacyLoad() handleMuRLocked = true - case d.readFD.RacyLoad() >= 0: - h.fd = d.readFD.RacyLoad() + case i.inode.readFD.RacyLoad() >= 0: + h.fd = i.inode.readFD.RacyLoad() handleMuRLocked = true default: - h.fd = int32(d.controlFD) - d.handleMu.RUnlock() + h.fd = int32(i.controlFD) + i.inode.handleMu.RUnlock() } } @@ -256,42 +260,42 @@ func (d *directfsDentry) updateMetadataLocked(h handle) error { err := unix.Fstat(int(h.fd), &stat) if handleMuRLocked { // handleMu must be released before updateMetadataFromStatLocked(). - d.handleMu.RUnlock() // +checklocksforce: complex case. + i.inode.handleMu.RUnlock() // +checklocksforce: complex case. } if err != nil { return err } - return d.updateMetadataFromStatLocked(&stat) + return i.updateMetadataFromStatLocked(&stat) } // Precondition: fs.renameMu is locked if d is a socket. 
-func (d *directfsDentry) chmod(ctx context.Context, mode uint16) error { - if d.isSymlink() { +func (i *directfsInode) chmod(ctx context.Context, mode uint16, d *dentry) error { + if i.isSymlink() { // Linux does not support changing the mode of symlinks. See // fs/attr.c:notify_change(). return unix.EOPNOTSUPP } - if !d.isSocket() { - return unix.Fchmod(d.controlFD, uint32(mode)) + if !i.isSocket() { + return unix.Fchmod(i.controlFD, uint32(mode)) } // Sockets use O_PATH control FDs. However, fchmod(2) fails with EBADF for // O_PATH FDs. Try to fchmodat(2) it from its parent. if parent := d.parent.Load(); parent != nil { - return unix.Fchmodat(parent.impl.(*directfsDentry).controlFD, d.name, uint32(mode), 0 /* flags */) + return unix.Fchmodat(parent.inode.impl.(*directfsInode).controlFD, d.name, uint32(mode), 0 /* flags */) } // This is a mount point socket (no parent). Fallback to using lisafs. - if err := d.ensureLisafsControlFD(ctx); err != nil { + if err := i.ensureLisafsControlFD(ctx, d); err != nil { return err } - return chmod(ctx, d.controlFDLisa, mode) + return chmod(ctx, i.controlFDLisa, mode) } // Preconditions: -// - d.handleMu is locked if d is a regular file. +// - i.inode.handleMu is locked if d is a regular file. // - fs.renameMu is locked if d is a symlink. -func (d *directfsDentry) utimensat(ctx context.Context, stat *linux.Statx) error { +func (i *directfsInode) utimensat(ctx context.Context, stat *linux.Statx, d *dentry) error { if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME) == 0 { return nil } @@ -309,12 +313,12 @@ func (d *directfsDentry) utimensat(ctx context.Context, stat *linux.Statx) error utimes[1].Nsec = int64(stat.Mtime.Nsec) } - if !d.isSymlink() { - hostFD := d.controlFD - if d.isRegularFile() { + if !i.isSymlink() { + hostFD := i.controlFD + if i.isRegularFile() { // utimensat(2) requires a writable FD for regular files. See BUGS // section. dentry.prepareSetStat() should have acquired a writable FD. 
- hostFD = int(d.writeFD.RacyLoad()) + hostFD = int(i.inode.writeFD.RacyLoad()) } // Non-symlinks can operate directly on the fd using an empty name. return fsutil.Utimensat(hostFD, "", utimes, 0) @@ -324,13 +328,13 @@ func (d *directfsDentry) utimensat(ctx context.Context, stat *linux.Statx) error // symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty // name. if parent := d.parent.Load(); parent != nil { - return fsutil.Utimensat(parent.impl.(*directfsDentry).controlFD, d.name, utimes, unix.AT_SYMLINK_NOFOLLOW) + return fsutil.Utimensat(parent.inode.impl.(*directfsInode).controlFD, d.name, utimes, unix.AT_SYMLINK_NOFOLLOW) } // This is a mount point symlink. We don't have a parent FD. Fallback to // using lisafs. - if !d.controlFDLisa.Ok() { - panic("directfsDentry.controlFDLisa is not set for mount point symlink") + if !i.controlFDLisa.Ok() { + panic("directfsInode.controlFDLisa is not set for mount point symlink") } setStat := linux.Statx{ @@ -338,7 +342,7 @@ func (d *directfsDentry) utimensat(ctx context.Context, stat *linux.Statx) error Atime: stat.Atime, Mtime: stat.Mtime, } - _, failureErr, err := d.controlFDLisa.SetStat(ctx, &setStat) + _, failureErr, err := i.controlFDLisa.SetStat(ctx, &setStat) if err != nil { return err } @@ -346,9 +350,9 @@ func (d *directfsDentry) utimensat(ctx context.Context, stat *linux.Statx) error } // Precondition: fs.renameMu is locked. -func (d *directfsDentry) prepareSetStat(ctx context.Context, stat *linux.Statx) error { +func (i *directfsInode) prepareSetStat(ctx context.Context, stat *linux.Statx, d *dentry) error { if stat.Mask&unix.STATX_SIZE != 0 || - (stat.Mask&(unix.STATX_ATIME|unix.STATX_MTIME) != 0 && d.isRegularFile()) { + (stat.Mask&(unix.STATX_ATIME|unix.STATX_MTIME) != 0 && i.isRegularFile()) { // Need to ensure a writable FD is available. See setStatLocked() to // understand why. 
return d.ensureSharedHandle(ctx, false /* read */, true /* write */, false /* trunc */) @@ -357,11 +361,11 @@ func (d *directfsDentry) prepareSetStat(ctx context.Context, stat *linux.Statx) } // Preconditions: -// - d.handleMu is locked. +// - i.inode.handleMu is locked. // - fs.renameMu is locked. -func (d *directfsDentry) setStatLocked(ctx context.Context, stat *linux.Statx) (failureMask uint32, failureErr error) { +func (i *directfsInode) setStatLocked(ctx context.Context, stat *linux.Statx, d *dentry) (failureMask uint32, failureErr error) { if stat.Mask&unix.STATX_MODE != 0 { - if err := d.chmod(ctx, stat.Mode&^unix.S_IFMT); err != nil { + if err := i.chmod(ctx, stat.Mode&^unix.S_IFMT, d); err != nil { failureMask |= unix.STATX_MODE failureErr = err } @@ -369,13 +373,13 @@ func (d *directfsDentry) setStatLocked(ctx context.Context, stat *linux.Statx) ( if stat.Mask&unix.STATX_SIZE != 0 { // ftruncate(2) requires a writable FD. - if err := unix.Ftruncate(int(d.writeFD.RacyLoad()), int64(stat.Size)); err != nil { + if err := unix.Ftruncate(int(i.inode.writeFD.RacyLoad()), int64(stat.Size)); err != nil { failureMask |= unix.STATX_SIZE failureErr = err } } - if err := d.utimensat(ctx, stat); err != nil { + if err := i.utimensat(ctx, stat, d); err != nil { failureMask |= (stat.Mask & (unix.STATX_ATIME | unix.STATX_MTIME)) failureErr = err } @@ -389,7 +393,7 @@ func (d *directfsDentry) setStatLocked(ctx context.Context, stat *linux.Statx) ( if stat.Mask&unix.STATX_GID != 0 { gid = auth.KGID(stat.GID) } - if err := fchown(d.controlFD, uid, gid); err != nil { + if err := fchown(i.controlFD, uid, gid); err != nil { failureMask |= stat.Mask & (unix.STATX_UID | unix.STATX_GID) failureErr = err } @@ -414,38 +418,42 @@ func fchown(fd int, uid auth.KUID, gid auth.KGID) error { return unix.Fchownat(fd, "", u, g, unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) } -// Precondition: d.handleMu must be locked. 
-func (d *directfsDentry) destroy(ctx context.Context) { - if d.controlFD >= 0 { - _ = unix.Close(d.controlFD) - d.controlFD = -1 +// Precondition: i.inode.handleMu must be locked. +func (i *directfsInode) destroy(ctx context.Context) { + if i.controlFD >= 0 { + _ = unix.Close(i.controlFD) + i.controlFD = -1 } - if d.controlFDLisa.Ok() { - d.controlFDLisa.Close(ctx, true /* flush */) + if i.controlFDLisa.Ok() { + i.controlFDLisa.Close(ctx, true /* flush */) } } -func (d *directfsDentry) getHostChild(name string) (*dentry, error) { +func (i *directfsInode) getHostChild(name string) (*dentry, error) { childFD, err := tryOpen(func(flags int) (int, error) { - return unix.Openat(d.controlFD, name, flags, 0) + return unix.Openat(i.controlFD, name, flags, 0) }) if err != nil { return nil, err } - return d.fs.newDirectfsDentry(childFD) + childInode, err := i.inode.fs.newDirectfsInode(childFD) + if err != nil { + return nil, err + } + return i.inode.fs.newDentry(childInode) } -func (d *directfsDentry) getXattr(ctx context.Context, name string, size uint64) (string, error) { - if ftype := d.fileType(); ftype == linux.S_IFSOCK || ftype == linux.S_IFLNK { +func (i *directfsInode) getXattr(ctx context.Context, name string, size uint64, d *dentry) (string, error) { + if ftype := d.inode.fileType(); ftype == linux.S_IFSOCK || ftype == linux.S_IFLNK { // Sockets and symlinks use O_PATH control FDs. However, fgetxattr(2) fails // with EBADF for O_PATH FDs. Fallback to lisafs. 
- if err := d.ensureLisafsControlFD(ctx); err != nil { + if err := i.ensureLisafsControlFD(ctx, d); err != nil { return "", err } - return d.controlFDLisa.GetXattr(ctx, name, size) + return i.controlFDLisa.GetXattr(ctx, name, size) } data := make([]byte, size) - n, err := unix.Fgetxattr(d.controlFD, name, data) + n, err := unix.Fgetxattr(i.controlFD, name, data) if err != nil { return "", err } @@ -454,7 +462,7 @@ func (d *directfsDentry) getXattr(ctx context.Context, name string, size uint64) // getCreatedChild opens the newly created child, sets its uid/gid, constructs // a disconnected dentry and returns it. -func (d *directfsDentry) getCreatedChild(name string, uid auth.KUID, gid auth.KGID, isDir bool, createDentry bool) (*dentry, error) { +func (i *directfsInode) getCreatedChild(name string, uid auth.KUID, gid auth.KGID, isDir bool, createDentry bool, d *dentry) (*dentry, error) { unlinkFlags := 0 extraOpenFlags := 0 if isDir { @@ -463,13 +471,13 @@ func (d *directfsDentry) getCreatedChild(name string, uid auth.KUID, gid auth.KG } deleteChild := func() { // Best effort attempt to remove the newly created child on failure. 
- if err := unix.Unlinkat(d.controlFD, name, unlinkFlags); err != nil { - log.Warningf("error unlinking newly created child %q after failure: %v", filepath.Join(genericDebugPathname(d.fs, &d.dentry), name), err) + if err := unix.Unlinkat(i.controlFD, name, unlinkFlags); err != nil { + log.Warningf("error unlinking newly created child %q after failure: %v", filepath.Join(genericDebugPathname(i.inode.fs, d), name), err) } } childFD, err := tryOpen(func(flags int) (int, error) { - return unix.Openat(d.controlFD, name, flags|extraOpenFlags, 0) + return unix.Openat(i.controlFD, name, flags|extraOpenFlags, 0) }) if err != nil { deleteChild() @@ -483,8 +491,14 @@ func (d *directfsDentry) getCreatedChild(name string, uid auth.KUID, gid auth.KG } var child *dentry + var childInode *inode if createDentry { - child, err = d.fs.newDirectfsDentry(childFD) + childInode, err = i.fs.newDirectfsInode(childFD) + if err != nil { + deleteChild() + return nil, err + } + child, err = i.fs.newDentry(childInode) if err != nil { // Ownership of childFD was passed to newDirectDentry(), so no need to // clean that up. 
@@ -495,9 +509,9 @@ func (d *directfsDentry) getCreatedChild(name string, uid auth.KUID, gid auth.KG return child, nil } -func (d *directfsDentry) mknod(ctx context.Context, name string, creds *auth.Credentials, opts *vfs.MknodOptions) (*dentry, error) { +func (i *directfsInode) mknod(ctx context.Context, name string, creds *auth.Credentials, opts *vfs.MknodOptions, d *dentry) (*dentry, error) { if _, ok := opts.Endpoint.(transport.HostBoundEndpoint); ok { - return d.bindAt(ctx, name, creds, opts) + return i.bindAt(ctx, name, creds, opts, d) } // From mknod(2) man page: @@ -507,48 +521,48 @@ func (d *directfsDentry) mknod(ctx context.Context, name string, creds *auth.Cre return nil, unix.EPERM } - if err := unix.Mknodat(d.controlFD, name, uint32(opts.Mode), 0); err != nil { + if err := unix.Mknodat(i.controlFD, name, uint32(opts.Mode), 0); err != nil { return nil, err } - return d.getCreatedChild(name, creds.EffectiveKUID, creds.EffectiveKGID, false /* isDir */, true /* createDentry */) + return i.getCreatedChild(name, creds.EffectiveKUID, creds.EffectiveKGID, false /* isDir */, true /* createDentry */, d) } // Precondition: opts.Endpoint != nil and is transport.HostBoundEndpoint type. -func (d *directfsDentry) bindAt(ctx context.Context, name string, creds *auth.Credentials, opts *vfs.MknodOptions) (*dentry, error) { +func (i *directfsInode) bindAt(ctx context.Context, name string, creds *auth.Credentials, opts *vfs.MknodOptions, d *dentry) (*dentry, error) { // There are no filesystems mounted in the sandbox process's mount namespace. // So we can't perform absolute path traversals. So fallback to using lisafs. 
- if err := d.ensureLisafsControlFD(ctx); err != nil { + if err := i.ensureLisafsControlFD(ctx, d); err != nil { return nil, err } sockType := opts.Endpoint.(transport.Endpoint).Type() - childInode, boundSocketFD, err := d.controlFDLisa.BindAt(ctx, sockType, name, opts.Mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID)) + childInode, boundSocketFD, err := i.controlFDLisa.BindAt(ctx, sockType, name, opts.Mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID)) if err != nil { return nil, err } - d.fs.client.CloseFD(ctx, childInode.ControlFD, true /* flush */) + i.inode.fs.client.CloseFD(ctx, childInode.ControlFD, true /* flush */) // Update opts.Endpoint that it is bound. hbep := opts.Endpoint.(transport.HostBoundEndpoint) if err := hbep.SetBoundSocketFD(ctx, boundSocketFD); err != nil { - if err := unix.Unlinkat(d.controlFD, name, 0); err != nil { - log.Warningf("error unlinking newly created socket %q after failure: %v", filepath.Join(genericDebugPathname(d.fs, &d.dentry), name), err) + if err := unix.Unlinkat(i.controlFD, name, 0); err != nil { + log.Warningf("error unlinking newly created socket %q after failure: %v", filepath.Join(genericDebugPathname(i.inode.fs, d), name), err) } return nil, err } // Socket already has the right UID/GID set, so use uid = gid = -1. - child, err := d.getCreatedChild(name, auth.NoID /* uid */, auth.NoID /* gid */, false /* isDir */, true /* createDentry */) + child, err := i.getCreatedChild(name, auth.NoID /* uid */, auth.NoID /* gid */, false /* isDir */, true /* createDentry */, d) if err != nil { hbep.ResetBoundSocketFD(ctx) return nil, err } // Set the endpoint on the newly created child dentry, and take the // corresponding extra dentry reference. - child.endpoint = opts.Endpoint + child.inode.endpoint = opts.Endpoint child.IncRef() return child, nil } -// Precondition: d.fs.renameMu must be locked. 
-func (d *directfsDentry) link(target *directfsDentry, name string) (*dentry, error) { +// Precondition: i.inode.fs.renameMu must be locked. +func (i *directfsInode) link(target *dentry, name string, d *dentry) (*dentry, error) { // Using linkat(targetFD, "", newdirfd, name, AT_EMPTY_PATH) requires // CAP_DAC_READ_SEARCH in the *root* userns. With directfs, the sandbox // process has CAP_DAC_READ_SEARCH in its own userns. But the sandbox is @@ -556,38 +570,39 @@ func (d *directfsDentry) link(target *directfsDentry, name string) (*dentry, err // using olddirfd to call linkat(2). // Also note that d and target are from the same mount. Given target is a // non-directory and d is a directory, target.parent must exist. - if err := unix.Linkat(target.parent.Load().impl.(*directfsDentry).controlFD, target.name, d.controlFD, name, 0); err != nil { + if err := unix.Linkat(target.parent.Load().inode.impl.(*directfsInode).controlFD, target.name, i.controlFD, name, 0); err != nil { return nil, err } // Note that we don't need to set uid/gid for the new child. This is a hard // link. The original file already has the right owner. // TODO(gvisor.dev/issue/6739): Hard linked dentries should share the same // inode fields. 
- return d.getCreatedChild(name, auth.NoID /* uid */, auth.NoID /* gid */, false /* isDir */, true /* createDentry */) + dentry, err := i.getCreatedChild(name, auth.NoID /* uid */, auth.NoID /* gid */, false /* isDir */, true /* createDentry */, d) + return dentry, err } -func (d *directfsDentry) mkdir(name string, mode linux.FileMode, uid auth.KUID, gid auth.KGID, createDentry bool) (*dentry, error) { - if err := unix.Mkdirat(d.controlFD, name, uint32(mode)); err != nil { +func (i *directfsInode) mkdir(name string, mode linux.FileMode, uid auth.KUID, gid auth.KGID, createDentry bool, d *dentry) (*dentry, error) { + if err := unix.Mkdirat(i.controlFD, name, uint32(mode)); err != nil { return nil, err } - return d.getCreatedChild(name, uid, gid, true /* isDir */, createDentry) + return i.getCreatedChild(name, uid, gid, true /* isDir */, createDentry, d) } -func (d *directfsDentry) symlink(name, target string, creds *auth.Credentials) (*dentry, error) { - if err := unix.Symlinkat(target, d.controlFD, name); err != nil { +func (i *directfsInode) symlink(name, target string, creds *auth.Credentials, d *dentry) (*dentry, error) { + if err := unix.Symlinkat(target, i.controlFD, name); err != nil { return nil, err } - return d.getCreatedChild(name, creds.EffectiveKUID, creds.EffectiveKGID, false /* isDir */, true /* createDentry */) + return i.getCreatedChild(name, creds.EffectiveKUID, creds.EffectiveKGID, false /* isDir */, true /* createDentry */, d) } -func (d *directfsDentry) openCreate(name string, accessFlags uint32, mode linux.FileMode, uid auth.KUID, gid auth.KGID, createDentry bool) (*dentry, handle, error) { +func (i *directfsInode) openCreate(name string, accessFlags uint32, mode linux.FileMode, uid auth.KUID, gid auth.KGID, createDentry bool, d *dentry) (*dentry, handle, error) { createFlags := unix.O_CREAT | unix.O_EXCL | int(accessFlags) | hostOpenFlags - childHandleFD, err := unix.Openat(d.controlFD, name, createFlags, uint32(mode&^linux.FileTypeMask)) + 
childHandleFD, err := unix.Openat(i.controlFD, name, createFlags, uint32(mode&^linux.FileTypeMask)) if err != nil { return nil, noHandle, err } - child, err := d.getCreatedChild(name, uid, gid, false /* isDir */, createDentry) + child, err := i.getCreatedChild(name, uid, gid, false /* isDir */, createDentry, d) if err != nil { _ = unix.Close(childHandleFD) return nil, noHandle, err @@ -595,8 +610,8 @@ func (d *directfsDentry) openCreate(name string, accessFlags uint32, mode linux. return child, handle{fd: int32(childHandleFD)}, nil } -func (d *directfsDentry) getDirentsLocked(recordDirent func(name string, key inoKey, dType uint8)) error { - readFD := int(d.readFD.RacyLoad()) +func (i *directfsInode) getDirentsLocked(recordDirent func(name string, key inoKey, dType uint8), d *dentry) error { + readFD := int(i.inode.readFD.RacyLoad()) if _, err := unix.Seek(readFD, 0, 0); err != nil { return err } @@ -605,9 +620,9 @@ func (d *directfsDentry) getDirentsLocked(recordDirent func(name string, key ino // We also want the device ID, which annoyingly incurs an additional // syscall per dirent. // TODO(gvisor.dev/issue/6665): Get rid of per-dirent stat. - stat, err := fsutil.StatAt(d.controlFD, name) + stat, err := fsutil.StatAt(i.controlFD, name) if err != nil { - log.Warningf("Getdent64: skipping file %q with failed stat, err: %v", path.Join(genericDebugPathname(d.fs, &d.dentry), name), err) + log.Warningf("Getdent64: skipping file %q with failed stat, err: %v", path.Join(genericDebugPathname(i.inode.fs, d), name), err) return } recordDirent(name, inoKeyFromStat(&stat), ftype) @@ -615,20 +630,20 @@ func (d *directfsDentry) getDirentsLocked(recordDirent func(name string, key ino } // Precondition: fs.renameMu is locked. 
-func (d *directfsDentry) connect(ctx context.Context, sockType linux.SockType, euid lisafs.UID, egid lisafs.GID) (int, error) { +func (i *directfsInode) connect(ctx context.Context, sockType linux.SockType, euid lisafs.UID, egid lisafs.GID, d *dentry) (int, error) { // There are no filesystems mounted in the sandbox process's mount namespace. // So we can't perform absolute path traversals. So fallback to using lisafs. - if err := d.ensureLisafsControlFD(ctx); err != nil { + if err := i.ensureLisafsControlFD(ctx, d); err != nil { return -1, err } - return d.controlFDLisa.Connect(ctx, sockType, euid, egid) + return i.controlFDLisa.Connect(ctx, sockType, euid, egid) } -func (d *directfsDentry) readlink() (string, error) { +func (i *directfsInode) readlink() (string, error) { // This is similar to what os.Readlink does. for linkLen := 128; linkLen < math.MaxUint16; linkLen *= 2 { b := make([]byte, linkLen) - n, err := unix.Readlinkat(d.controlFD, "", b) + n, err := unix.Readlinkat(i.controlFD, "", b) if err != nil { return "", err @@ -640,9 +655,9 @@ func (d *directfsDentry) readlink() (string, error) { return "", unix.ENOMEM } -func (d *directfsDentry) statfs() (linux.Statfs, error) { +func (i *directfsInode) statfs() (linux.Statfs, error) { var statFS unix.Statfs_t - if err := unix.Fstatfs(d.controlFD, &statFS); err != nil { + if err := unix.Fstatfs(i.controlFD, &statFS); err != nil { return linux.Statfs{}, err } return linux.Statfs{ @@ -657,17 +672,16 @@ func (d *directfsDentry) statfs() (linux.Statfs, error) { }, nil } -func (d *directfsDentry) restoreFile(ctx context.Context, controlFD int, opts *vfs.CompleteRestoreOptions) error { +func (i *directfsInode) restoreFile(ctx context.Context, controlFD int, opts *vfs.CompleteRestoreOptions, d *dentry) error { if controlFD < 0 { - return fmt.Errorf("directfsDentry.restoreFile called with invalid controlFD") + return fmt.Errorf("directfsInode.restoreFile called with invalid controlFD") } var stat unix.Stat_t if err := 
unix.Fstat(controlFD, &stat); err != nil { _ = unix.Close(controlFD) - return fmt.Errorf("failed to stat %q: %w", genericDebugPathname(d.fs, &d.dentry), err) + return fmt.Errorf("failed to stat %q: %w", genericDebugPathname(i.inode.fs, d), err) } - - d.controlFD = controlFD + i.controlFD = controlFD // We do not preserve inoKey across checkpoint/restore, so: // // - We must assume that the host filesystem did not change in a way that @@ -675,33 +689,36 @@ func (d *directfsDentry) restoreFile(ctx context.Context, controlFD int, opts *v // checking inoKey. // // - We need to associate the new inoKey with the existing d.ino. - d.inoKey = inoKeyFromStat(&stat) - d.fs.inoMu.Lock() - d.fs.inoByKey[d.inoKey] = d.ino - d.fs.inoMu.Unlock() + i.inoKey = inoKeyFromStat(&stat) + i.fs.inoMu.Lock() + i.fs.inoByKey[i.inode.inoKey] = i.ino + i.fs.inoMu.Unlock() + i.fs.inodeByInoMu.Lock() + i.fs.inodeByIno[i.inoKey] = &i.inode + i.fs.inodeByInoMu.Unlock() // Check metadata stability before updating metadata. 
- d.metadataMu.Lock() - defer d.metadataMu.Unlock() - if d.isRegularFile() { + i.inode.metadataMu.Lock() + defer i.inode.metadataMu.Unlock() + if i.isRegularFile() { if opts.ValidateFileSizes { - if d.size.RacyLoad() != uint64(stat.Size) { - return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(d.fs, &d.dentry), d.size.Load(), stat.Size)} + if i.inode.size.RacyLoad() != uint64(stat.Size) { + return vfs.ErrCorruption{Err: fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(i.inode.fs, d), i.inode.size.Load(), stat.Size)} } } if opts.ValidateFileModificationTimestamps { - if want := dentryTimestampFromUnix(stat.Mtim); d.mtime.RacyLoad() != want { - return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(d.fs, &d.dentry), linux.NsecToStatxTimestamp(d.mtime.RacyLoad()), linux.NsecToStatxTimestamp(want))} + if want := dentryTimestampFromUnix(stat.Mtim); i.inode.mtime.RacyLoad() != want { + return vfs.ErrCorruption{Err: fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(i.inode.fs, d), linux.NsecToStatxTimestamp(i.inode.mtime.RacyLoad()), linux.NsecToStatxTimestamp(want))} } } } - if !d.cachedMetadataAuthoritative() { - d.updateMetadataFromStatLocked(&stat) + if !i.cachedMetadataAuthoritative() { + i.updateMetadataFromStatLocked(&stat) } - if rw, ok := d.fs.savedDentryRW[&d.dentry]; ok { + if rw, ok := i.inode.fs.savedDentryRW[d]; ok { if err := d.ensureSharedHandle(ctx, rw.read, rw.write, false /* trunc */); err != nil { - return fmt.Errorf("failed to restore file handles (read=%t, write=%t) for %q: %w", rw.read, rw.write, genericDebugPathname(d.fs, &d.dentry), err) + return fmt.Errorf("failed to restore file handles (read=%t, write=%t) for %q: %w", 
rw.read, rw.write, genericDebugPathname(i.inode.fs, d), err)
 		}
 	}
 
@@ -719,7 +736,7 @@ func doRevalidationDirectfs(ctx context.Context, vfsObj *vfs.VirtualFilesystem,
 	// The function receiver has to be named `d` (to be consistent with other
 	// receivers). But `d` variable is also used below in various places. This
 	// helps with readability and makes code less error prone.
-	start := state.start.impl.(*directfsDentry)
+	start := state.start.inode.impl.(*directfsInode)
 	if state.refreshStart {
 		start.updateMetadata(ctx)
 	}
@@ -733,21 +750,21 @@ func doRevalidationDirectfs(ctx context.Context, vfsObj *vfs.VirtualFilesystem,
 	var stat unix.Stat_t
 
 	// Lock metadata *before* getting attributes for d.
-	d.metadataMu.Lock()
+	d.inode.metadataMu.Lock()
 	found := err == nil
 	if found {
 		err = unix.Fstat(childFD, &stat)
 		_ = unix.Close(childFD)
 		if err != nil {
-			d.metadataMu.Unlock()
+			d.inode.metadataMu.Unlock()
 			return err
 		}
 	}
 
 	// Note that synthetic dentries will always fail this comparison check.
-	if !found || d.inoKey != inoKeyFromStat(&stat) {
-		d.metadataMu.Unlock()
-		if !found && d.isSynthetic() {
+	if !found || d.inode.inoKey != inoKeyFromStat(&stat) {
+		d.inode.metadataMu.Unlock()
+		if !found && d.inode.isSynthetic() {
 			// We have a synthetic file, and no remote file has arisen to replace
 			// it.
 			return nil
@@ -759,11 +776,11 @@ func doRevalidationDirectfs(ctx context.Context, vfsObj *vfs.VirtualFilesystem,
 	}
 
 	// The file at this path hasn't changed. Just update cached metadata.
-	d.impl.(*directfsDentry).updateMetadataFromStatLocked(&stat) // +checklocksforce: d.metadataMu is locked above.
-	d.metadataMu.Unlock()
+	d.inode.impl.(*directfsInode).updateMetadataFromStatLocked(&stat) // +checklocksforce: d.inode.metadataMu is locked above.
+	d.inode.metadataMu.Unlock()
 
 	// Advance parent.
- parent = d.impl.(*directfsDentry) + parent = d.inode.impl.(*directfsInode) } return nil } diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 31c7d9c62a..6398802067 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -30,7 +30,7 @@ import ( ) func (d *dentry) isDir() bool { - return d.fileType() == linux.S_IFDIR + return d.inode.fileType() == linux.S_IFDIR } // cacheNewChildLocked will cache the new child dentry, and will panic if a @@ -51,7 +51,7 @@ func (d *dentry) isDir() bool { // +checklocks:d.childrenMu func (d *dentry) cacheNewChildLocked(child *dentry, name string) { d.IncRef() // reference held by child on its parent - genericSetParentAndName(d.fs, child, d, name) + genericSetParentAndName(d.inode.fs, child, d, name) if d.children == nil { d.children = make(map[string]*dentry) } else if c, ok := d.children[name]; ok { @@ -74,10 +74,10 @@ func (d *dentry) cacheNewChildLocked(child *dentry, name string) { // +checklocks:d.childrenMu func (d *dentry) cacheNegativeLookupLocked(name string) { // Don't cache negative lookups if InteropModeShared is in effect (since - // this makes remote lookup unavoidable), or if d.isSynthetic() (in which + // this makes remote lookup unavoidable), or if d.inode.isSynthetic() (in which // case the only files in the directory are those for which a dentry exists // in d.children). Instead, just delete any previously-cached dentry. - if d.fs.opts.interop == InteropModeShared || d.isSynthetic() { + if d.inode.fs.opts.interop == InteropModeShared || d.inode.isSynthetic() { delete(d.children, name) return } @@ -124,34 +124,38 @@ type createSyntheticOpts struct { // newSyntheticDentry creates a synthetic file with the given name. 
func (fs *filesystem) newSyntheticDentry(opts *createSyntheticOpts) *dentry { now := fs.clock.Now().Nanoseconds() + ino := fs.nextIno() + inodePtr := new(inode) + inodePtr.init(fs, ino, nil, nil) + child := &dentry{ - refs: atomicbitops.FromInt64(1), // held by parent. - fs: fs, - ino: fs.nextIno(), - mode: atomicbitops.FromUint32(uint32(opts.mode)), - uid: atomicbitops.FromUint32(uint32(opts.kuid)), - gid: atomicbitops.FromUint32(uint32(opts.kgid)), - blockSize: atomicbitops.FromUint32(hostarch.PageSize), // arbitrary - atime: atomicbitops.FromInt64(now), - mtime: atomicbitops.FromInt64(now), - ctime: atomicbitops.FromInt64(now), - btime: atomicbitops.FromInt64(now), - readFD: atomicbitops.FromInt32(-1), - writeFD: atomicbitops.FromInt32(-1), - mmapFD: atomicbitops.FromInt32(-1), - nlink: atomicbitops.FromUint32(2), + refs: atomicbitops.FromInt64(1), // held by parent. + inode: inodePtr, } + + child.inode.mode.Store(uint32(opts.mode)) + child.inode.uid.Store(uint32(opts.kuid)) + child.inode.gid.Store(uint32(opts.kgid)) + child.inode.blockSize.Store(hostarch.PageSize) + child.inode.atime.Store(now) + child.inode.mtime.Store(now) + child.inode.ctime.Store(now) + child.inode.btime.Store(now) + child.inode.nlink.Store(2) + child.inode.readFD.Store(-1) + child.inode.writeFD.Store(-1) + child.inode.mmapFD.Store(-1) switch opts.mode.FileType() { case linux.S_IFDIR: // Nothing else needs to be done. 
case linux.S_IFSOCK: - child.endpoint = opts.endpoint + child.inode.endpoint = opts.endpoint case linux.S_IFIFO: - child.pipe = opts.pipe + child.inode.pipe = opts.pipe default: panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType())) } - child.init(nil /* impl */) + child.init() return child } @@ -192,7 +196,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.dirents = ds } - if d.cachedMetadataAuthoritative() { + if d.inode.cachedMetadataAuthoritative() { d.touchAtime(fd.vfsfd.Mount()) } @@ -224,8 +228,8 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { // filesystem.renameMu is needed for d.parent, and must be locked before // d.opMu. - d.fs.renameMu.RLock() - defer d.fs.renameMu.RUnlock() + d.inode.fs.renameMu.RLock() + defer d.inode.fs.renameMu.RUnlock() d.opMu.RLock() defer d.opMu.RUnlock() @@ -248,25 +252,25 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { { Name: ".", Type: linux.DT_DIR, - Ino: uint64(d.ino), + Ino: uint64(d.inode.ino), NextOff: 1, }, { Name: "..", - Type: uint8(parent.mode.Load() >> 12), - Ino: uint64(parent.ino), + Type: uint8(parent.inode.mode.Load() >> 12), + Ino: uint64(parent.inode.ino), NextOff: 2, }, } var realChildren map[string]struct{} - if !d.isSynthetic() { - if d.syntheticChildren != 0 && d.fs.opts.interop == InteropModeShared { + if !d.inode.isSynthetic() { + if d.syntheticChildren != 0 && d.inode.fs.opts.interop == InteropModeShared { // Record the set of children d actually has so that we don't emit // duplicate entries for synthetic children. realChildren = make(map[string]struct{}) } - d.handleMu.RLock() - if !d.isReadHandleOk() { + d.inode.handleMu.RLock() + if !d.inode.isReadHandleOk() { // This should not be possible because a readable handle should // have been opened when the calling directoryFD was opened. 
panic("gofer.dentry.getDirents called without a readable handle") @@ -274,7 +278,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { err := d.getDirentsLocked(ctx, func(name string, key inoKey, dType uint8) { dirent := vfs.Dirent{ Name: name, - Ino: d.fs.inoFromKey(key), + Ino: d.inode.fs.inoFromKey(key), NextOff: int64(len(dirents) + 1), Type: dType, } @@ -283,7 +287,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { realChildren[name] = struct{}{} } }) - d.handleMu.RUnlock() + d.inode.handleMu.RUnlock() if err != nil { return nil, err } @@ -292,7 +296,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { // Emit entries for synthetic children. if d.syntheticChildren != 0 { for _, child := range d.children { - if child == nil || !child.isSynthetic() { + if child == nil || !child.inode.isSynthetic() { continue } if _, ok := realChildren[child.name]; ok { @@ -300,14 +304,14 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { } dirents = append(dirents, vfs.Dirent{ Name: child.name, - Type: uint8(child.mode.Load() >> 12), - Ino: uint64(child.ino), + Type: uint8(child.inode.mode.Load() >> 12), + Ino: uint64(child.inode.ino), NextOff: int64(len(dirents) + 1), }) } } // Cache dirents for future directoryFDs if permitted. - if d.cachedMetadataAuthoritative() { + if d.inode.cachedMetadataAuthoritative() { d.dirents = dirents d.childrenSet = make(map[string]struct{}, len(dirents)) for _, dirent := range d.dirents { diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 2f5e905607..c2975aef1a 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -110,7 +110,7 @@ func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry { return ds } -// Precondition: !parent.isSynthetic() && !child.isSynthetic(). +// Precondition: !parent.inode.isSynthetic() && !child.inode.isSynthetic(). 
func appendNewChildDentry(ds **[]*dentry, parent *dentry, child *dentry) { // The new child was added to parent and took a ref on the parent (hence // parent can be removed from cache). A new child has 0 refs for now. So @@ -175,7 +175,7 @@ func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[] // - fs.renameMu must be locked. // - d.opMu must be locked for reading. // - !rp.Done(). -// - If !d.cachedMetadataAuthoritative(), then d and all children that are +// - If !d.inode.cachedMetadataAuthoritative(), then d and all children that are // part of rp must have been revalidated. // // +checklocksread:d.opMu @@ -211,7 +211,7 @@ func (fs *filesystem) stepLocked(ctx context.Context, rp resolvingPath, d *dentr if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, false, err } - if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { + if child.inode.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { target, err := child.readlink(ctx, rp.Mount()) if err != nil { return nil, false, err @@ -299,8 +299,8 @@ func (fs *filesystem) getChildAndWalkPathLocked(ctx context.Context, parent *den if child, err := parent.getCachedChildLocked(rp.Component()); child != nil || err != nil { return child, err } - // dentry.getRemoteChildAndWalkPathLocked already handles dentry caching. - return parent.getRemoteChildAndWalkPathLocked(ctx, rp, ds) + // dentry.inode.getRemoteChildAndWalkPathLocked already handles dentry caching. + return parent.inode.getRemoteChildAndWalkPathLocked(ctx, rp, ds, parent) } // getCachedChildLocked returns a child dentry if it was cached earlier. 
If no @@ -320,7 +320,7 @@ func (d *dentry) getCachedChildLocked(name string) (*dentry, error) { } d.childrenMu.Lock() defer d.childrenMu.Unlock() - if child, ok := d.children[name]; ok || d.isSynthetic() { + if child, ok := d.children[name]; ok || d.inode.isSynthetic() { if child == nil { return nil, linuxerr.ENOENT } @@ -344,7 +344,7 @@ func (d *dentry) getCachedChildLocked(name string) (*dentry, error) { // Preconditions: // - fs.renameMu must be locked. // - !rp.Done(). -// - If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up +// - If !d.inode.cachedMetadataAuthoritative(), then d's cached metadata must be up // to date. func (fs *filesystem) walkParentDirLocked(ctx context.Context, vfsRP *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { rp := resolvingPathParent(vfsRP) @@ -484,7 +484,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if !dir && rp.MustBeDir() { return linuxerr.ENOENT } - if parent.isSynthetic() { + if parent.inode.isSynthetic() { if createInSyntheticDir == nil { return linuxerr.EPERM } @@ -502,7 +502,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if dir { ev |= linux.IN_ISDIR } - parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + parent.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } // No cached dentry exists; however, in InteropModeShared there might still be @@ -514,7 +514,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir } parent.childrenMu.Lock() parent.cacheNewChildLocked(child, name) - if child.isSynthetic() { + if child.inode.isSynthetic() { parent.syntheticChildren++ ds = appendDentry(ds, parent) } else { @@ -534,7 +534,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if dir { ev |= linux.IN_ISDIR } - parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, 
false /* unlinked */) + parent.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } @@ -602,7 +602,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // Load child if sticky bit is set because we need to determine whether // deletion is allowed. var child *dentry - if parent.mode.Load()&linux.ModeSticky == 0 { + if parent.inode.mode.Load()&linux.ModeSticky == 0 { var ok bool parent.childrenMu.Lock() child, ok = parent.children[name] @@ -657,7 +657,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // revalidated (so we can't expect its file type to be correct) and // individually revalidating its children (to confirm that they // still exist) would be a waste of time. - if child.cachedMetadataAuthoritative() { + if child.inode.cachedMetadataAuthoritative() { if !child.isDir() { vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. return linuxerr.ENOTDIR @@ -684,12 +684,12 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b return linuxerr.ENOTDIR } } - if parent.isSynthetic() { + if parent.inode.isSynthetic() { if child == nil { return linuxerr.ENOENT } - } else if child == nil || !child.isSynthetic() { - if err := parent.unlink(ctx, name, flags); err != nil { + } else if child == nil || !child.inode.isSynthetic() { + if err := parent.inode.unlink(ctx, name, flags); err != nil { if child != nil { vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. } @@ -699,13 +699,13 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // Generate inotify events for rmdir or unlink. 
if dir { - parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) + parent.inode.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) } else { var cw *vfs.Watches if child != nil { - cw = &child.watches + cw = &child.inode.watches } - vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name) + vfs.InotifyRemoveChild(ctx, cw, &parent.inode.watches, name) } parent.childrenMu.Lock() @@ -720,16 +720,17 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // writing since before we obtained child, and we can't race with // fs.RenameAt() since fs.renameMu has been locked since before we // obtained child. - if child.isSynthetic() { + if child.inode.isSynthetic() { parent.syntheticChildren-- child.decRefNoCaching() - } else if child.endpoint != nil { + } else if child.inode.endpoint != nil { child.decRefNoCaching() } + child.decLinks() ds = appendDentry(ds, child) } parent.cacheNegativeLookupLocked(name) - if parent.cachedMetadataAuthoritative() { + if parent.inode.cachedMetadataAuthoritative() { parent.clearDirentsLocked() parent.touchCMtime() if dir { @@ -806,19 +807,19 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. if d.isDir() { return nil, linuxerr.EPERM } - gid := auth.KGID(d.gid.Load()) - uid := auth.KUID(d.uid.Load()) - mode := linux.FileMode(d.mode.Load()) + gid := auth.KGID(d.inode.gid.Load()) + uid := auth.KUID(d.inode.uid.Load()) + mode := linux.FileMode(d.inode.mode.Load()) if err := vfs.MayLink(rp.Credentials(), mode, uid, gid); err != nil { return nil, err } - if d.nlink.Load() == 0 { + if d.inode.nlink.Load() == 0 { return nil, linuxerr.ENOENT } - if d.nlink.Load() == math.MaxUint32 { + if d.inode.nlink.Load() == math.MaxUint32 { return nil, linuxerr.EMLINK } - if d.isSynthetic() { + if d.inode.isSynthetic() { // TODO(gvisor.dev/issue/6739): Add synthetic file hard link support. 
return nil, linuxerr.EOPNOTSUPP } @@ -829,6 +830,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. // Success! vd.Dentry().Impl().(*dentry).incLinks() } + return err } @@ -840,8 +842,8 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // rather than the caller's and enable setgid. kgid := creds.EffectiveKGID mode := opts.Mode - if parent.mode.Load()&linux.S_ISGID != 0 { - kgid = auth.KGID(parent.gid.Load()) + if parent.inode.mode.Load()&linux.S_ISGID != 0 { + kgid = auth.KGID(parent.inode.gid.Load()) mode |= linux.S_ISGID } @@ -963,9 +965,9 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if mustCreate { return nil, linuxerr.EEXIST } - if !start.cachedMetadataAuthoritative() { + if !start.inode.cachedMetadataAuthoritative() { // Refresh dentry's attributes before opening. - if err := start.updateMetadata(ctx); err != nil { + if err := start.inode.updateMetadata(ctx); err != nil { return nil, err } } @@ -1014,7 +1016,7 @@ afterTrailingSymlink: goto afterTrailingSymlink } if linuxerr.Equals(linuxerr.ENOENT, err) && mayCreate { - if parent.isSynthetic() { + if parent.inode.isSynthetic() { return nil, linuxerr.EPERM } @@ -1065,34 +1067,45 @@ var logRejectedFifoOpenOnce sync.Once // indefinitely). 
func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) + + if strings.Contains(d.name, "gvisor_test_temp") { + log.Infof("d name %s", d.name) + log.Infof("rp.Credentials() %v", rp.Credentials()) + log.Infof("d inode mode %x parent inode uid %x parent inode gid %x", d.inode.mode.Load(), d.inode.uid.Load(), d.inode.gid.Load()) + log.Infof("rp.Credentials() effective kuid %v effective kgid %v", rp.Credentials().EffectiveKUID, rp.Credentials().EffectiveKGID) + log.Infof("d inode number of references %v", d.inode.refs.ReadRefs()) + log.Infof("d inode key %v", d.inode.inoKey) + log.Infof("d inode ino %v", d.inode.ino) + } + log.Infof("open[jtoantran] d name %s", d.name) + if err := d.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } - - if !d.isSynthetic() { + if !d.inode.isSynthetic() { // renameMu is locked here because it is required by d.openHandle(), which // is called by d.ensureSharedHandle() and d.openSpecialFile() below. It is // also required by d.connect() which is called by // d.openSocketByConnecting(). Note that opening non-synthetic pipes may // block, renameMu is unlocked separately in d.openSpecialFile() for pipes. - d.fs.renameMu.RLock() - defer d.fs.renameMu.RUnlock() + d.inode.fs.renameMu.RLock() + defer d.inode.fs.renameMu.RUnlock() } - trunc := opts.Flags&linux.O_TRUNC != 0 && d.fileType() == linux.S_IFREG + trunc := opts.Flags&linux.O_TRUNC != 0 && d.inode.fileType() == linux.S_IFREG if trunc { // Lock metadataMu *while* we open a regular file with O_TRUNC because // open(2) will change the file size on server. 
- d.metadataMu.Lock() - defer d.metadataMu.Unlock() + d.inode.metadataMu.Lock() + defer d.inode.metadataMu.Unlock() } var vfd *vfs.FileDescription var err error mnt := rp.Mount() - switch d.fileType() { + switch d.inode.fileType() { case linux.S_IFREG: - if !d.fs.opts.regularFilesUseSpecialFileFD { + if !d.inode.fs.opts.regularFilesUseSpecialFileFD { if err := d.ensureSharedHandle(ctx, ats.MayRead(), ats.MayWrite(), trunc); err != nil { return nil, err } @@ -1114,17 +1127,17 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open if opts.Flags&linux.O_DIRECT != 0 { return nil, linuxerr.EINVAL } - if !d.isSynthetic() { + if !d.inode.isSynthetic() { if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { return nil, err } } fd := &directoryFD{} - fd.LockFD.Init(&d.locks) + fd.LockFD.Init(&d.inode.locks) if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } - if d.readFD.Load() >= 0 { + if d.inode.readFD.Load() >= 0 { fsmetric.GoferOpensHost.Increment() } else { fsmetric.GoferOpens9P.Increment() @@ -1134,17 +1147,17 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open // Can't open symlinks without O_PATH, which is handled at the VFS layer. return nil, linuxerr.ELOOP case linux.S_IFSOCK: - if d.isSynthetic() { + if d.inode.isSynthetic() { return nil, linuxerr.ENXIO } - if d.fs.iopts.OpenSocketsByConnecting { + if d.inode.fs.iopts.OpenSocketsByConnecting { return d.openSocketByConnecting(ctx, opts) } case linux.S_IFIFO: - if d.isSynthetic() { - return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks) + if d.inode.isSynthetic() { + return d.inode.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.inode.locks) } - if d.fs.opts.disableFifoOpen { + if d.inode.fs.opts.disableFifoOpen { logRejectedFifoOpenOnce.Do(func() { log.Warningf("Rejecting attempt to open fifo/pipe from host filesystem: %q. 
If you want to allow this, set flag --host-fifo=open", d.name) }) @@ -1160,12 +1173,12 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open if trunc { // If no errors occurred so far then update file size in memory. This - // step is required even if !d.cachedMetadataAuthoritative() because + // step is required even if !d.inode.cachedMetadataAuthoritative() because // d.mappings has to be updated. // d.metadataMu has already been acquired if trunc == true. - d.updateSizeLocked(0) + d.inode.updateSizeLocked(0) - if d.cachedMetadataAuthoritative() { + if d.inode.cachedMetadataAuthoritative() { d.touchCMtimeLocked() } } @@ -1196,12 +1209,12 @@ func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptio } // Preconditions: -// - !d.isSynthetic(). +// - !d.inode.isSynthetic(). // - fs.renameMu is locked. It may be released temporarily while pipe blocks. // - If d is a pipe, no other locks (other than fs.renameMu) should be held. func (d *dentry) openSpecialFile(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) - if opts.Flags&linux.O_DIRECT != 0 && !d.isRegularFile() { + if opts.Flags&linux.O_DIRECT != 0 && !d.inode.isRegularFile() { return nil, linuxerr.EINVAL } // We assume that the server silently inserts O_NONBLOCK in the open flags @@ -1212,7 +1225,7 @@ func (d *dentry) openSpecialFile(ctx context.Context, mnt *vfs.Mount, opts *vfs. // the instantaneous presence of a peer holding the other end of the pipe // open, not whether the pipe was *previously* opened by a peer that has // since closed its end. 
- isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0 + isBlockingOpenOfNamedPipe := d.inode.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0 retry: h, err := d.openHandle(ctx, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0) if err != nil { @@ -1221,9 +1234,9 @@ retry: // with ENXIO if opening the same named pipe with O_WRONLY would // block because there are no readers of the pipe. Release renameMu // while blocking. - d.fs.renameMu.RUnlock() + d.inode.fs.renameMu.RUnlock() err := sleepBetweenNamedPipeOpenChecks(ctx) - d.fs.renameMu.RLock() + d.inode.fs.renameMu.RLock() if err != nil { return nil, err } @@ -1233,9 +1246,9 @@ retry: } if isBlockingOpenOfNamedPipe && ats == vfs.MayRead && h.fd >= 0 { // Release renameMu while blocking. - d.fs.renameMu.RUnlock() + d.inode.fs.renameMu.RUnlock() err := blockUntilNonblockingPipeHasWriter(ctx, h.fd) - d.fs.renameMu.RLock() + d.inode.fs.renameMu.RLock() if err != nil { h.close(ctx) return nil, err @@ -1250,9 +1263,9 @@ retry: } // Preconditions: -// - d.fs.renameMu must be locked. +// - d.inode.fs.renameMu must be locked. // - d.opMu must be locked for writing. -// - !d.isSynthetic(). +// - !d.inode.isSynthetic(). // // +checklocks:d.opMu func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) { @@ -1273,8 +1286,8 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving // If the parent is a setgid directory, use the parent's GID rather // than the caller's. 
kgid := creds.EffectiveKGID - if d.mode.Load()&linux.S_ISGID != 0 { - kgid = auth.KGID(d.gid.Load()) + if d.inode.mode.Load()&linux.S_ISGID != 0 { + kgid = auth.KGID(d.inode.gid.Load()) } child, h, err := d.openCreate(ctx, name, opts.Flags&linux.O_ACCMODE, opts.Mode, creds.EffectiveKUID, kgid, true /* createDentry */) @@ -1283,23 +1296,23 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } // Incorporate the fid that was opened by lcreate. - useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD + useRegularFileFD := child.inode.fileType() == linux.S_IFREG && !d.inode.fs.opts.regularFilesUseSpecialFileFD if useRegularFileFD { var readable, writable bool - child.handleMu.Lock() + child.inode.handleMu.Lock() if vfs.MayReadFileWithOpenFlags(opts.Flags) { readable = true if h.fd != -1 { - child.readFD = atomicbitops.FromInt32(h.fd) - child.mmapFD = atomicbitops.FromInt32(h.fd) + child.inode.readFD = atomicbitops.FromInt32(h.fd) + child.inode.mmapFD = atomicbitops.FromInt32(h.fd) } } if vfs.MayWriteFileWithOpenFlags(opts.Flags) { writable = true - child.writeFD = atomicbitops.FromInt32(h.fd) + child.inode.writeFD = atomicbitops.FromInt32(h.fd) } - child.updateHandles(ctx, h, readable, writable) - child.handleMu.Unlock() + child.inode.updateHandles(ctx, h, readable, writable) + child.inode.handleMu.Unlock() } // Insert the dentry into the tree. d.childrenMu.Lock() @@ -1307,7 +1320,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving // this name. We could not have raced. 
d.cacheNewChildLocked(child, name) appendNewChildDentry(ds, d, child) - if d.cachedMetadataAuthoritative() { + if d.inode.cachedMetadataAuthoritative() { d.touchCMtime() d.clearDirentsLocked() } @@ -1329,7 +1342,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } - d.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) + d.inode.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) return childVFSFD, nil } @@ -1342,7 +1355,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st if err != nil { return "", err } - if !d.isSymlink() { + if !d.inode.isSymlink() { return "", linuxerr.EINVAL } return d.readlink(ctx, rp.Mount()) @@ -1396,8 +1409,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer mnt.EndWrite() oldParent := oldParentVD.Dentry().Impl().(*dentry) - if !oldParent.cachedMetadataAuthoritative() { - if err := oldParent.updateMetadata(ctx); err != nil { + if !oldParent.inode.cachedMetadataAuthoritative() { + if err := oldParent.inode.updateMetadata(ctx); err != nil { return err } } @@ -1484,19 +1497,19 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } // Update the remote filesystem. - if !renamed.isSynthetic() { - if err := oldParent.rename(ctx, oldName, newParent, newName); err != nil { + if !renamed.inode.isSynthetic() { + if err := oldParent.inode.rename(ctx, oldName, newParent, newName); err != nil { vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) return err } - } else if replaced != nil && !replaced.isSynthetic() { + } else if replaced != nil && !replaced.inode.isSynthetic() { // We are replacing an existing real file with a synthetic one, so we // need to unlink the former. 
flags := uint32(0) if replaced.isDir() { flags = linux.AT_REMOVEDIR } - if err := newParent.unlink(ctx, newName, flags); err != nil { + if err := newParent.inode.unlink(ctx, newName, flags); err != nil { vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) return err } @@ -1517,10 +1530,10 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // comment for dentry.refs, drop that reference now. We can't race with // fs.unlinkAt() or invalidation since fs.renameMu has been locked for // writing since before we obtained replaced. - if replaced.isSynthetic() { + if replaced.inode.isSynthetic() { newParent.syntheticChildren-- replaced.decRefNoCaching() - } else if replaced.endpoint != nil { + } else if replaced.inode.endpoint != nil { replaced.decRefNoCaching() } ds = appendDentry(ds, replaced) @@ -1528,7 +1541,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa delete(newParent.children, newName) } oldParent.cacheNegativeLookupLocked(oldName) // +checklocksforce: oldParent.childrenMu is held if oldParent != newParent. - if renamed.isSynthetic() { + if renamed.inode.isSynthetic() { oldParent.syntheticChildren-- newParent.syntheticChildren++ } @@ -1542,17 +1555,17 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } // Update metadata. 
- if renamed.cachedMetadataAuthoritative() { + if renamed.inode.cachedMetadataAuthoritative() { renamed.touchCtime() } - if oldParent.cachedMetadataAuthoritative() { + if oldParent.inode.cachedMetadataAuthoritative() { oldParent.clearDirentsLocked() oldParent.touchCMtime() if renamed.isDir() { oldParent.decLinks() } } - if newParent.cachedMetadataAuthoritative() { + if newParent.inode.cachedMetadataAuthoritative() { newParent.clearDirentsLocked() newParent.touchCMtime() if renamed.isDir() && (replaced == nil || !replaced.isDir()) { @@ -1560,7 +1573,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa newParent.incLinks() } } - vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir()) + vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParent.inode.watches, &newParent.inode.watches, oldName, newName, renamed.isDir()) return nil } @@ -1623,10 +1636,10 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu return linux.Statfs{}, err } // If d is synthetic, invoke statfs on the first ancestor of d that isn't. - for d.isSynthetic() { + for d.inode.isSynthetic() { d = d.parent.Load() } - statfs, err := d.statfs(ctx) + statfs, err := d.inode.statfs(ctx) if err != nil { return linux.Statfs{}, err } @@ -1647,14 +1660,14 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ if err != nil { return nil, err } - if parent.fs.opts.interop != InteropModeShared { + if parent.inode.fs.opts.interop != InteropModeShared { // Cache the symlink target on creation. In practice, this helps avoid a // lot of ReadLink RPCs. Note that when InteropModeShared is in effect, // we are forced to make Readlink RPCs. Because in this mode, we use host // timestamps, not timestamps based on our internal clock. And readlink // updates the atime on the host. 
- child.haveTarget = true - child.target = target + child.inode.haveTarget = true + child.inode.target = target } return child, nil }, nil) @@ -1677,13 +1690,13 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } - if !d.isSocket() { + if !d.inode.isSocket() { return nil, linuxerr.ECONNREFUSED } - if d.endpoint != nil { - return d.endpoint, nil + if d.inode.endpoint != nil { + return d.inode.endpoint, nil } - if !d.isSynthetic() { + if !d.inode.isSynthetic() { d.IncRef() ds = appendDentry(ds, d) return &endpoint{ diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 76fe786938..9b768ccd58 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -88,6 +88,7 @@ const ( moptOverlayfsStaleRead = "overlayfs_stale_read" moptDisableFileHandleSharing = "disable_file_handle_sharing" moptDisableFifoOpen = "disable_fifo_open" + moptEnableInodeSharing = "enable_inode_sharing" // Directfs options. moptDirectfs = "directfs" @@ -236,8 +237,10 @@ type filesystem struct { // across checkpoint/restore because inode numbers may be reused between // different gofer processes, so inode numbers may be repeated for different // files across checkpoint/restore. inoByKey is protected by inoMu. - inoMu sync.Mutex `state:"nosave"` - inoByKey map[inoKey]uint64 `state:"nosave"` + inoMu sync.Mutex `state:"nosave"` + inoByKey map[inoKey]uint64 `state:"nosave"` + inodeByInoMu sync.Mutex `state:"nosave"` + inodeByIno map[inoKey]*inode `state:"nosave"` // lastIno is the last inode number assigned to a file. lastIno is accessed // using atomic memory operations. 
@@ -254,6 +257,16 @@ type filesystem struct {
 	released atomicbitops.Int32
 }
 
+func (fs *filesystem) findInode(inoKey inoKey) *inode {
+	fs.inodeByInoMu.Lock()
+	defer fs.inodeByInoMu.Unlock()
+	if inode, ok := fs.inodeByIno[inoKey]; ok {
+		inode.incRef()
+		return inode
+	}
+	return nil
+}
+
 // +stateify savable
 type filesystemOptions struct {
 	fd int
@@ -302,6 +315,10 @@ type filesystemOptions struct {
 
 	// directfs holds options for directfs mode.
 	directfs directfsOpts
+
+	// If enableInodeSharing is true, inode sharing across dentries is enabled.
+	// This is disabled by default and is only used by runsc.
+	enableInodeSharing bool
 }
 
 // +stateify savable
@@ -521,6 +538,10 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		delete(mopts, moptDirectfs)
 		fsopts.directfs.enabled = true
 	}
+	if _, ok := mopts[moptEnableInodeSharing]; ok {
+		delete(mopts, moptEnableInodeSharing)
+		fsopts.enableInodeSharing = true
+	}
 
 	// fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying
 	// "cache=none".
@@ -554,12 +575,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		return nil, nil, err
 	}
 	fs := &filesystem{
-		mf:       mf,
-		opts:     fsopts,
-		iopts:    iopts,
-		clock:    ktime.RealtimeClockFromContext(ctx),
-		devMinor: devMinor,
-		inoByKey: make(map[inoKey]uint64),
+		mf:         mf,
+		opts:       fsopts,
+		iopts:      iopts,
+		clock:      ktime.RealtimeClockFromContext(ctx),
+		devMinor:   devMinor,
+		inoByKey:   make(map[inoKey]uint64),
+		inodeByIno: make(map[inoKey]*inode),
 	}
 
 	// Did the user configure a global dentry cache?
@@ -576,11 +598,18 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs.vfsfs.DecRef(ctx) return nil, nil, err } + var inode *inode if fs.opts.directfs.enabled { - fs.root, err = fs.getDirectfsRootDentry(ctx, rootHostFD, fs.client.NewFD(rootInode.ControlFD)) + inode, err = fs.getDirectfsRootInode(ctx, rootHostFD, fs.client.NewFD(rootInode.ControlFD)) } else { - fs.root, err = fs.newLisafsDentry(ctx, &rootInode) + inode, err = fs.newLisafsInode(ctx, &rootInode) + } + if err != nil { + fs.vfsfs.DecRef(ctx) + return nil, nil, err } + // Create the root dentry + fs.root, err = fs.newDentry(inode) if err != nil { fs.vfsfs.DecRef(ctx) return nil, nil, err @@ -592,6 +621,17 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return &fs.vfsfs, &fs.root.vfsd, nil } +func (fs *filesystem) newDentry(inode *inode) (*dentry, error) { + d := &dentry{ + inode: inode, + } + d.init() + fs.syncMu.Lock() + fs.syncableDentries.PushBack(&d.syncableListEntry) + fs.syncMu.Unlock() + return d, nil +} + // initClientAndGetRoot initializes fs.client and returns the root inode for // this mount point. It handles the attach point (fs.opts.aname) resolution. func (fs *filesystem) initClientAndGetRoot(ctx context.Context) (lisafs.Inode, int, error) { @@ -696,23 +736,23 @@ func (fs *filesystem) Release(ctx context.Context) { fs.syncMu.Lock() for elem := fs.syncableDentries.Front(); elem != nil; elem = elem.Next() { d := elem.d - d.handleMu.Lock() - d.dataMu.Lock() - if d.isWriteHandleOk() { + d.inode.handleMu.Lock() + d.inode.dataMu.Lock() + if d.inode.isWriteHandleOk() { // Write dirty cached data to the remote file. 
- h := d.writeHandle() - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil { + h := d.inode.writeHandle() + if err := fsutil.SyncDirtyAll(ctx, &d.inode.cache, &d.inode.dirty, d.inode.size.Load(), mf, h.writeFromBlocksAt); err != nil { log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err) } // TODO(jamieliu): Do we need to flushf/fsync d? } // Discard cached pages. - d.cache.DropAll(mf) - d.dirty.RemoveAll() - d.dataMu.Unlock() + d.inode.cache.DropAll(mf) + d.inode.dirty.RemoveAll() + d.inode.dataMu.Unlock() // Close host FDs if they exist. - d.closeHostFDs() - d.handleMu.Unlock() + d.inode.closeHostFDs() + d.inode.handleMu.Unlock() } // There can't be any specialFileFDs still using fs, since each such // FileDescription would hold a reference on a Mount holding a reference on @@ -750,9 +790,9 @@ func (fs *filesystem) Release(ctx context.Context) { // endpoint != nil. Such dentries have one reference for existence that should // be dropped during filesystem.Release. // -// Precondition: d.fs.renameMu is locked for writing. +// Precondition: d.inode.fs.renameMu is locked for writing. func (d *dentry) releaseExtraRefsRecursiveLocked(ctx context.Context) { - if d.isSynthetic() || d.endpoint != nil { + if d.inode.isSynthetic() || d.inode.endpoint != nil { d.decRefNoCaching() d.checkCachingLocked(ctx, true /* renameMuWriteLocked */) } @@ -771,7 +811,8 @@ func (d *dentry) releaseExtraRefsRecursiveLocked(ctx context.Context) { } } -// inoKey is the key used to identify the inode backed by this dentry. +// inoKey is the key used to identify the inode backing the dentry. +// host inode major and minor numbers are used to identify the file. // // +stateify savable type inoKey struct { @@ -796,6 +837,221 @@ func inoKeyFromStat(stat *unix.Stat_t) inoKey { } } +// inode represents a filesystem object. +// +// +stateify savable +type inode struct { + // fs is the filesystem that this inode belongs to. 
+	fs *filesystem
+
+	// A reference is held on this inode as long as it is reachable in the
+	// filesystem tree, i.e. a dentry points to the inode.
+	refs inodeRefs
+
+	// inoKey is the key identifying this inode.
+	inoKey inoKey
+
+	// File size, which differs from other metadata in two ways:
+	//
+	// - We make a best-effort attempt to keep it up to date even if
+	// !dentry.inode.cachedMetadataAuthoritative() for the sake of O_APPEND writes.
+	//
+	// - size is protected by both metadataMu and dataMu (i.e. both must be
+	// locked to mutate it; locking either is sufficient to access it).
+	size atomicbitops.Uint64
+
+	// inode metadata. Writing multiple fields atomically requires holding
+	// metadataMu; otherwise atomic operations can be used.
+	metadataMu sync.Mutex          `state:"nosave"`
+	mode       atomicbitops.Uint32 // type is immutable, perms are mutable
+	uid        atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid        atomicbitops.Uint32 // auth.KGID, but ...
+	blockSize  atomicbitops.Uint32 // 0 if unknown
+	ino        uint64              // virtual inode number, immutable
+	// Timestamps, all nsecs from the Unix epoch.
+	atime atomicbitops.Int64
+	mtime atomicbitops.Int64
+	ctime atomicbitops.Int64
+	btime atomicbitops.Int64
+
+	nlink atomicbitops.Uint32 // protected by filesystem.mu instead of inode.mu
+
+	// If this inode does not represent a synthetic file, deleted is 0, and
+	// atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the
+	// remote file's timestamps, which should be updated when this inode is
+	// evicted.
+	atimeDirty atomicbitops.Uint32 `state:"nosave"`
+	mtimeDirty atomicbitops.Uint32
+
+	mapsMu sync.Mutex `state:"nosave"`
+
+	// If this inode represents a regular file, mappings tracks mappings of
+	// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
+ mappings memmap.MappingSet + + dataMu sync.RWMutex `state:"nosave"` + + // If this inode represents a regular file that is client-cached, cache + // maps offsets into the cached file to offsets into + // filesystem.mfp.MemoryFile() that store the file's data. cache is + // protected by dataMu. + cache fsutil.FileRangeSet + + // If this inode represents a regular file that is client-cached, dirty + // tracks dirty segments in cache. dirty is protected by dataMu. + dirty fsutil.DirtySet + + // If this inode represents a deleted regular file, savedDeletedData is used + // to store file data for save/restore. + savedDeletedData []byte + + locks vfs.FileLocks + + // Inotify watches for this inode. + watches vfs.Watches + + // - If this inode represents a regular file or directory, readFD (if not + // -1) is a host FD used for reads by all regularFileFDs/directoryFDs + // representing this inode. + // + // - If this inode represents a regular file, writeFD (if not -1) is a host + // FD used for writes by all regularFileFDs representing this inode. + // + // - If this inode represents a regular file, mmapFD is the host FD used + // for memory mappings. If mmapFD is -1, no such FD is available, and the + // internal page cache implementation is used for memory mappings instead. + // + // These fields are protected by handleMu. readFD, writeFD, and mmapFD are + // additionally written using atomic memory operations, allowing them to be + // read (albeit racily) with atomic.LoadInt32() without locking handleMu. + // + // readFD and writeFD may or may not be the same file descriptor. Once either + // transitions from closed (-1) to open, it may be mutated with handleMu + // locked, but cannot be closed until the inode is destroyed. + // + // readFD and writeFD may or may not be the same file descriptor. mmapFD is + // always either -1 or equal to readFD; if the file has been opened for + // writing, it is additionally either -1 or equal to writeFD. 
+ handleMu sync.RWMutex `state:"nosave"` + readFD atomicbitops.Int32 `state:"nosave"` + writeFD atomicbitops.Int32 `state:"nosave"` + mmapFD atomicbitops.Int32 `state:"nosave"` + + // pf implements memmap.File for mappings of hostFD. + pf inodePlatformFile + + // If this inode represents a symbolic link, InteropModeShared is not in + // effect, and haveTarget is true, target is the symlink target. haveTarget + // and target are protected by dataMu. + haveTarget bool + target string + + // If this inode represents a socket file, endpoint is the transport + // endpoint bound to this file. + // + // endpoint often originates from vfs.MknodOptions.Endpoint, in which case + // it can't be recovered if the inode is evicted from the inode cache. + // Consequently, an extra reference is held on inode for which endpoint + // is non-nil to prevent eviction. + endpoint transport.BoundEndpoint + + // If this inode represents a synthetic named pipe, pipe is the pipe + // endpoint bound to this file. + pipe *pipe.VFSPipe + + // impl is the specific inode implementation for non-synthetic dentries. + // impl is immutable. + // + // If impl is nil, this inode represents a synthetic file, i.e. a + // file that does not exist on the host filesystem. As of this writing, the + // only files that can be synthetic are sockets, pipes, and directories. + impl any // immutable +} + +func (i *inode) init(fs *filesystem, ino uint64, inoKey *inoKey, impl any) { + i.refs.InitRefs() + i.refs.LogRefs() + + i.ino = ino + i.fs = fs + i.readFD.Store(-1) + i.writeFD.Store(-1) + i.mmapFD.Store(-1) + + if inoKey != nil { + i.inoKey = *inoKey + fs.inodeByInoMu.Lock() + fs.inodeByIno[*inoKey] = i + fs.inodeByInoMu.Unlock() + } + + i.pf.inode = i + // Nested impl-inheritance pattern. In memory it looks like: + // [[ inode ] inodeImpl ] + // Inode has a pointer to the next level of implementation. 
+ i.impl = impl +} + +func (i *inode) incRef() { + i.refs.IncRef() +} + +func (i *inode) tryIncRef() bool { + return i.refs.TryIncRef() +} + +func (i *inode) isSynthetic() bool { + return i.impl == nil +} + +func (i *inode) cachedMetadataAuthoritative() bool { + return i.fs.opts.interop != InteropModeShared || i.isSynthetic() +} + +func (i *inode) decRef(ctx context.Context, d *dentry) { + i.refs.DecRef(func() { + i.handleMu.Lock() + i.dataMu.Lock() + + // Close any resources held by the implementation. + i.destroyImpl(ctx, d) + + mf := i.fs.mf + if i.isWriteHandleOk() { + // Write dirty pages back to the remote filesystem. + h := i.writeHandle() + if err := fsutil.SyncDirtyAll(ctx, &i.cache, &i.dirty, i.size.Load(), mf, h.writeFromBlocksAt); err != nil { + log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err) + } + } + // Discard cached data. + if !i.cache.IsEmpty() { + mf.MarkAllUnevictable(i) + i.cache.DropAll(mf) + i.dirty.RemoveAll() + } + + i.dataMu.Unlock() + + // Can use RacyLoad() because handleMu is locked. + if i.readFD.RacyLoad() >= 0 { + _ = unix.Close(int(i.readFD.RacyLoad())) + } + if i.writeFD.RacyLoad() >= 0 && i.readFD.RacyLoad() != i.writeFD.RacyLoad() { + _ = unix.Close(int(i.writeFD.RacyLoad())) + } + i.readFD.Store(-1) + i.writeFD.Store(-1) + i.mmapFD.Store(-1) + i.handleMu.Unlock() + + // Remove the inode from the inode map. + i.fs.inodeByInoMu.Lock() + delete(i.fs.inodeByIno, i.inoKey) + i.fs.inodeByInoMu.Unlock() + }) +} + // dentry implements vfs.DentryImpl. // // +stateify savable @@ -812,9 +1068,6 @@ type dentry struct { // using atomic memory operations. refs atomicbitops.Int64 - // fs is the owning filesystem. fs is immutable. - fs *filesystem - // parent is this dentry's parent directory. Each dentry holds a reference // on its parent. If this dentry is a filesystem root, parent is nil. // parent is protected by filesystem.renameMu. @@ -825,10 +1078,12 @@ type dentry struct { // filesystem.renameMu. 
 	name string
 
-	// inoKey is used to identify this dentry's inode.
-	inoKey inoKey
+	// inode is the inode represented by this dentry. Multiple dentries may
+	// share a single non-directory inode (with hard links). inode is
+	// immutable.
+	inode *inode
 
-	// If deleted is non-zero, the file represented by this dentry has been
+	// If deleted is non-zero, the file has been deleted.
 	// deleted is accessed using atomic memory operations.
 	deleted atomicbitops.Uint32
 
@@ -887,7 +1142,7 @@ type dentry struct {
 	syntheticChildren int
 
 	// If this dentry represents a directory,
-	// dentry.cachedMetadataAuthoritative() == true, and dirents is not
+	// dentry.inode.cachedMetadataAuthoritative() == true, and dirents is not
 	// nil, then dirents is a cache of all entries in the directory, in the
 	// order they were returned by the server. childrenSet just stores the
 	// `Name` field of all dirents in a set for fast query. dirents and
@@ -898,140 +1153,10 @@ type dentry struct {
 	// +checklocks:childrenMu
 	childrenSet map[string]struct{} `state:"nosave"`
 
-	// Cached metadata; protected by metadataMu.
-	// To access:
-	//   - In situations where consistency is not required (like stat), these
-	//     can be accessed using atomic operations only (without locking).
-	//   - Lock metadataMu and can access without atomic operations.
-	// To mutate:
-	//   - Lock metadataMu and use atomic operations to update because we might
-	//     have atomic readers that don't hold the lock.
-	metadataMu sync.Mutex          `state:"nosave"`
-	ino        uint64              // immutable
-	mode       atomicbitops.Uint32 // type is immutable, perms are mutable
-	uid        atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
-	gid        atomicbitops.Uint32 // auth.KGID, but ...
-	blockSize  atomicbitops.Uint32 // 0 if unknown
-	// Timestamps, all nsecs from the Unix epoch.
- atime atomicbitops.Int64 - mtime atomicbitops.Int64 - ctime atomicbitops.Int64 - btime atomicbitops.Int64 - // File size, which differs from other metadata in two ways: - // - // - We make a best-effort attempt to keep it up to date even if - // !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes. - // - // - size is protected by both metadataMu and dataMu (i.e. both must be - // locked to mutate it; locking either is sufficient to access it). - size atomicbitops.Uint64 - // If this dentry does not represent a synthetic file, deleted is 0, and - // atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the - // remote file's timestamps, which should be updated when this dentry is - // evicted. - atimeDirty atomicbitops.Uint32 - mtimeDirty atomicbitops.Uint32 - - // nlink counts the number of hard links to this dentry. It's updated and - // accessed using atomic operations. It's not protected by metadataMu like the - // other metadata fields. - nlink atomicbitops.Uint32 - - mapsMu sync.Mutex `state:"nosave"` - - // If this dentry represents a regular file, mappings tracks mappings of - // the file into memmap.MappingSpaces. mappings is protected by mapsMu. - mappings memmap.MappingSet - - // - If this dentry represents a regular file or directory, readFD (if not - // -1) is a host FD used for reads by all regularFileFDs/directoryFDs - // representing this dentry. - // - // - If this dentry represents a regular file, writeFD (if not -1) is a host - // FD used for writes by all regularFileFDs representing this dentry. - // - // - If this dentry represents a regular file, mmapFD is the host FD used - // for memory mappings. If mmapFD is -1, no such FD is available, and the - // internal page cache implementation is used for memory mappings instead. - // - // These fields are protected by handleMu. 
readFD, writeFD, and mmapFD are - // additionally written using atomic memory operations, allowing them to be - // read (albeit racily) with atomic.LoadInt32() without locking handleMu. - // - // readFD and writeFD may or may not be the same file descriptor. Once either - // transitions from closed (-1) to open, it may be mutated with handleMu - // locked, but cannot be closed until the dentry is destroyed. - // - // readFD and writeFD may or may not be the same file descriptor. mmapFD is - // always either -1 or equal to readFD; if the file has been opened for - // writing, it is additionally either -1 or equal to writeFD. - handleMu sync.RWMutex `state:"nosave"` - readFD atomicbitops.Int32 `state:"nosave"` - writeFD atomicbitops.Int32 `state:"nosave"` - mmapFD atomicbitops.Int32 `state:"nosave"` - - dataMu sync.RWMutex `state:"nosave"` - - // If this dentry represents a regular file that is client-cached, cache - // maps offsets into the cached file to offsets into - // filesystem.mfp.MemoryFile() that store the file's data. cache is - // protected by dataMu. - cache fsutil.FileRangeSet - - // If this dentry represents a regular file that is client-cached, dirty - // tracks dirty segments in cache. dirty is protected by dataMu. - dirty fsutil.DirtySet - - // If this dentry represents a deleted regular file, savedDeletedData is used - // to store file data for save/restore. - savedDeletedData []byte - - // pf implements memmap.File for mappings of hostFD. - pf dentryPlatformFile - - // If this dentry represents a symbolic link, InteropModeShared is not in - // effect, and haveTarget is true, target is the symlink target. haveTarget - // and target are protected by dataMu. - haveTarget bool - target string - - // If this dentry represents a socket file, endpoint is the transport - // endpoint bound to this file. 
- // - // endpoint often originates from vfs.MknodOptions.Endpoint, in which case - // it can't be recovered if the dentry is evicted from the dentry cache. - // Consequently, an extra reference is held on dentries for which endpoint - // is non-nil to prevent eviction. - endpoint transport.BoundEndpoint - - // If this dentry represents a synthetic named pipe, pipe is the pipe - // endpoint bound to this file. - pipe *pipe.VFSPipe - - locks vfs.FileLocks - - // Inotify watches for this dentry. - // - // Note that inotify may behave unexpectedly in the presence of hard links, - // because dentries corresponding to the same file have separate inotify - // watches when they should share the same set. This is the case because it is - // impossible for us to know for sure whether two dentries correspond to the - // same underlying file (see the gofer filesystem section fo vfs/inotify.md for - // a more in-depth discussion on this matter). - watches vfs.Watches - // forMountpoint marks directories that were created for mount points during // container startup. This is used during restore, in case these mount points // need to be recreated. forMountpoint bool - - // impl is the specific dentry implementation for non-synthetic dentries. - // impl is immutable. - // - // If impl is nil, this dentry represents a synthetic file, i.e. a - // file that does not exist on the host filesystem. As of this writing, the - // only files that can be synthetic are sockets, pipes, and directories. - impl any } // +stateify savable @@ -1065,141 +1190,125 @@ func (fs *filesystem) nextIno() uint64 { } // init must be called before first use of d. -func (d *dentry) init(impl any) { - d.pf.dentry = d +func (d *dentry) init() { d.cacheEntry.d = d d.syncableListEntry.d = d - // Nested impl-inheritance pattern. In memory it looks like: - // [[[ vfs.Dentry ] dentry ] dentryImpl ] - // All 3 abstractions are allocated in one allocation. 
We achieve this by - // making each outer dentry implementation hold the inner dentry by value. - // Then the outer most dentry is allocated and we initialize fields inward. - // Each inner dentry has a pointer to the next level of implementation. - d.impl = impl d.vfsd.Init(d) refs.Register(d) } -func (d *dentry) isSynthetic() bool { - return d.impl == nil -} - -func (d *dentry) cachedMetadataAuthoritative() bool { - return d.fs.opts.interop != InteropModeShared || d.isSynthetic() -} - // updateMetadataFromStatxLocked is called to update d's metadata after an update // from the remote filesystem. -// Precondition: d.metadataMu must be locked. -// +checklocks:d.metadataMu -func (d *lisafsDentry) updateMetadataFromStatxLocked(stat *linux.Statx) { +// Precondition: i.inode.metadataMu must be locked. +// +checklocks:i.inode.metadataMu +func (i *lisafsInode) updateMetadataFromStatxLocked(stat *linux.Statx) { if stat.Mask&linux.STATX_TYPE != 0 { - if got, want := stat.Mode&linux.FileTypeMask, d.fileType(); uint32(got) != want { + if got, want := stat.Mode&linux.FileTypeMask, i.inode.fileType(); uint32(got) != want { panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got)) } } if stat.Mask&linux.STATX_MODE != 0 { - d.mode.Store(uint32(stat.Mode)) + i.inode.mode.Store(uint32(stat.Mode)) } if stat.Mask&linux.STATX_UID != 0 { - d.uid.Store(dentryUID(lisafs.UID(stat.UID))) + i.inode.uid.Store(dentryUID(lisafs.UID(stat.UID))) } if stat.Mask&linux.STATX_GID != 0 { - d.gid.Store(dentryGID(lisafs.GID(stat.GID))) + i.inode.gid.Store(dentryGID(lisafs.GID(stat.GID))) } if stat.Blksize != 0 { - d.blockSize.Store(stat.Blksize) + i.inode.blockSize.Store(stat.Blksize) } // Don't override newer client-defined timestamps with old server-defined // ones. 
- if stat.Mask&linux.STATX_ATIME != 0 && d.atimeDirty.Load() == 0 { - d.atime.Store(dentryTimestamp(stat.Atime)) + if stat.Mask&linux.STATX_ATIME != 0 && i.inode.atimeDirty.Load() == 0 { + i.inode.atime.Store(dentryTimestamp(stat.Atime)) } - if stat.Mask&linux.STATX_MTIME != 0 && d.mtimeDirty.Load() == 0 { - d.mtime.Store(dentryTimestamp(stat.Mtime)) + if stat.Mask&linux.STATX_MTIME != 0 && i.inode.mtimeDirty.Load() == 0 { + i.inode.mtime.Store(dentryTimestamp(stat.Mtime)) } if stat.Mask&linux.STATX_CTIME != 0 { - d.ctime.Store(dentryTimestamp(stat.Ctime)) + i.inode.ctime.Store(dentryTimestamp(stat.Ctime)) } if stat.Mask&linux.STATX_BTIME != 0 { - d.btime.Store(dentryTimestamp(stat.Btime)) + i.inode.btime.Store(dentryTimestamp(stat.Btime)) } if stat.Mask&linux.STATX_NLINK != 0 { - d.nlink.Store(stat.Nlink) + i.inode.nlink.Store(stat.Nlink) } if stat.Mask&linux.STATX_SIZE != 0 { - d.updateSizeLocked(stat.Size) + i.updateSizeLocked(stat.Size) } } // updateMetadataFromStatLocked is similar to updateMetadataFromStatxLocked, // except that it takes a unix.Stat_t argument. -// Precondition: d.metadataMu must be locked. -// +checklocks:d.metadataMu -func (d *directfsDentry) updateMetadataFromStatLocked(stat *unix.Stat_t) error { - if got, want := stat.Mode&unix.S_IFMT, d.fileType(); got != want { +// Precondition: i.inode.metadataMu must be locked. +// +checklocks:i.inode.metadataMu +func (i *directfsInode) updateMetadataFromStatLocked(stat *unix.Stat_t) error { + if got, want := stat.Mode&unix.S_IFMT, i.inode.fileType(); got != want { panic(fmt.Sprintf("direct.dentry file type changed from %#o to %#o", want, got)) } - d.mode.Store(stat.Mode) - d.uid.Store(stat.Uid) - d.gid.Store(stat.Gid) - d.blockSize.Store(uint32(stat.Blksize)) + i.inode.mode.Store(stat.Mode) + i.inode.uid.Store(stat.Uid) + i.inode.gid.Store(stat.Gid) + i.inode.blockSize.Store(uint32(stat.Blksize)) // Don't override newer client-defined timestamps with old host-defined // ones. 
- if d.atimeDirty.Load() == 0 { - d.atime.Store(dentryTimestampFromUnix(stat.Atim)) + if i.inode.atimeDirty.Load() == 0 { + i.inode.atime.Store(dentryTimestampFromUnix(stat.Atim)) } - if d.mtimeDirty.Load() == 0 { - d.mtime.Store(dentryTimestampFromUnix(stat.Mtim)) + if i.inode.mtimeDirty.Load() == 0 { + i.inode.mtime.Store(dentryTimestampFromUnix(stat.Mtim)) } - d.ctime.Store(dentryTimestampFromUnix(stat.Ctim)) - d.nlink.Store(uint32(stat.Nlink)) - d.updateSizeLocked(uint64(stat.Size)) + i.inode.ctime.Store(dentryTimestampFromUnix(stat.Ctim)) + i.inode.nlink.Store(uint32(stat.Nlink)) + i.inode.updateSizeLocked(uint64(stat.Size)) return nil } -// Preconditions: !d.isSynthetic(). -// Preconditions: d.metadataMu is locked. -// +checklocks:d.metadataMu +// Preconditions: !d.inode.isSynthetic(). +// Preconditions: d.inode.metadataMu is locked. +// +checklocks:d.inode.metadataMu func (d *dentry) refreshSizeLocked(ctx context.Context) error { - d.handleMu.RLock() + d.inode.handleMu.RLock() // Can use RacyLoad() because handleMu is locked. - if d.writeFD.RacyLoad() < 0 { - d.handleMu.RUnlock() + if d.inode.writeFD.RacyLoad() < 0 { + d.inode.handleMu.RUnlock() // Use a suitable FD if we don't have a writable host FD. - return d.updateMetadataLocked(ctx, noHandle) + return d.inode.updateMetadataLocked(ctx, noHandle) } // Using statx(2) with a minimal mask is faster than fstat(2). var stat unix.Statx_t // Can use RacyLoad() because handleMu is locked. - err := unix.Statx(int(d.writeFD.RacyLoad()), "", unix.AT_EMPTY_PATH, unix.STATX_SIZE, &stat) - d.handleMu.RUnlock() // must be released before updateSizeLocked() + err := unix.Statx(int(d.inode.writeFD.RacyLoad()), "", unix.AT_EMPTY_PATH, unix.STATX_SIZE, &stat) + d.inode.handleMu.RUnlock() // must be released before updateSizeLocked() if err != nil { return err } - d.updateSizeLocked(stat.Size) + d.inode.updateSizeLocked(stat.Size) return nil } -// Preconditions: !d.isSynthetic(). 
-func (d *dentry) updateMetadata(ctx context.Context) error { - // d.metadataMu must be locked *before* we stat so that we do not end up +// Preconditions: !i.isSynthetic(). +func (i *inode) updateMetadata(ctx context.Context) error { + // d.inode.metadataMu must be locked *before* we stat so that we do not end up // updating stale attributes in d.updateMetadataFromStatLocked(). - d.metadataMu.Lock() - defer d.metadataMu.Unlock() - return d.updateMetadataLocked(ctx, noHandle) + i.metadataMu.Lock() + defer i.metadataMu.Unlock() + return i.updateMetadataLocked(ctx, noHandle) } -func (d *dentry) fileType() uint32 { - return d.mode.Load() & linux.S_IFMT +func (i *inode) fileType() uint32 { + return i.mode.Load() & linux.S_IFMT } func (d *dentry) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME - stat.Blksize = d.blockSize.Load() - stat.Nlink = d.nlink.Load() + stat.Blksize = d.inode.blockSize.Load() + stat.Nlink = d.inode.nlink.Load() if stat.Nlink == 0 { // The remote filesystem doesn't support link count; just make // something up. This is consistent with Linux, where @@ -1208,20 +1317,20 @@ func (d *dentry) statTo(stat *linux.Statx) { // it's not provided by the remote filesystem. stat.Nlink = 1 } - stat.UID = d.uid.Load() - stat.GID = d.gid.Load() - stat.Mode = uint16(d.mode.Load()) - stat.Ino = uint64(d.ino) - stat.Size = d.size.Load() + stat.UID = d.inode.uid.Load() + stat.GID = d.inode.gid.Load() + stat.Mode = uint16(d.inode.mode.Load()) + stat.Ino = uint64(d.inode.ino) + stat.Size = d.inode.size.Load() // This is consistent with regularFileFD.Seek(), which treats regular files // as having no holes. 
stat.Blocks = (stat.Size + 511) / 512 - stat.Atime = linux.NsecToStatxTimestamp(d.atime.Load()) - stat.Btime = linux.NsecToStatxTimestamp(d.btime.Load()) - stat.Ctime = linux.NsecToStatxTimestamp(d.ctime.Load()) - stat.Mtime = linux.NsecToStatxTimestamp(d.mtime.Load()) + stat.Atime = linux.NsecToStatxTimestamp(d.inode.atime.Load()) + stat.Btime = linux.NsecToStatxTimestamp(d.inode.btime.Load()) + stat.Ctime = linux.NsecToStatxTimestamp(d.inode.ctime.Load()) + stat.Mtime = linux.NsecToStatxTimestamp(d.inode.mtime.Load()) stat.DevMajor = linux.UNNAMED_MAJOR - stat.DevMinor = d.fs.devMinor + stat.DevMinor = d.inode.fs.devMinor } // Precondition: fs.renameMu is locked. @@ -1233,8 +1342,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { return linuxerr.EPERM } - mode := linux.FileMode(d.mode.Load()) - if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())); err != nil { + mode := linux.FileMode(d.inode.mode.Load()) + if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(d.inode.uid.Load()), auth.KGID(d.inode.gid.Load())); err != nil { return err } if err := mnt.CheckBeginWrite(); err != nil { @@ -1256,7 +1365,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs } var now int64 - if d.cachedMetadataAuthoritative() { + if d.inode.cachedMetadataAuthoritative() { // Truncate updates mtime. if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE { stat.Mask |= linux.STATX_MTIME @@ -1266,7 +1375,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs } // Use client clocks for timestamps. 
- now = d.fs.clock.Now().Nanoseconds() + now = d.inode.fs.clock.Now().Nanoseconds() if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW { stat.Atime = linux.NsecToStatxTimestamp(now) } @@ -1275,19 +1384,19 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs } } - d.metadataMu.Lock() - defer d.metadataMu.Unlock() + d.inode.metadataMu.Lock() + defer d.inode.metadataMu.Unlock() isOwnerChanging := false if stat.Mask&linux.STATX_UID != 0 { - if stat.UID == d.uid.RacyLoad() { + if stat.UID == d.inode.uid.RacyLoad() { stat.Mask &^= linux.STATX_UID } else { isOwnerChanging = true } } if stat.Mask&linux.STATX_GID != 0 { - if stat.GID == d.gid.RacyLoad() { + if stat.GID == d.inode.gid.RacyLoad() { stat.Mask &^= linux.STATX_GID } else { isOwnerChanging = true @@ -1302,7 +1411,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs if stat.Mask&linux.STATX_MODE != 0 { stat.Mode = uint16(vfs.ClearSUIDAndSGID(uint32(stat.Mode))) } else { - oldMode := d.mode.Load() + oldMode := d.inode.mode.Load() if updatedMode := vfs.ClearSUIDAndSGID(oldMode); updatedMode != oldMode { stat.Mode = uint16(updatedMode) stat.Mask |= linux.STATX_MODE @@ -1317,40 +1426,40 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs // to not be updated in the dentry cache. 
var failureMask uint32 var failureErr error - if !d.isSynthetic() { + if !d.inode.isSynthetic() { if stat.Mask != 0 { if err := d.prepareSetStat(ctx, stat); err != nil { return err } - d.handleMu.RLock() + d.inode.handleMu.RLock() if stat.Mask&linux.STATX_SIZE != 0 { - // d.dataMu must be held around the update to both the remote - // file's size and d.size to serialize with writeback (which - // might otherwise write data back up to the old d.size after + // d.inode.dataMu must be held around the update to both the remote + // file's size and d.inode.size to serialize with writeback (which + // might otherwise write data back up to the old d.inode.size after // the remote file has been truncated). - d.dataMu.Lock() + d.inode.dataMu.Lock() } var err error failureMask, failureErr, err = d.setStatLocked(ctx, stat) - d.handleMu.RUnlock() + d.inode.handleMu.RUnlock() if err != nil { if stat.Mask&linux.STATX_SIZE != 0 { - d.dataMu.Unlock() // +checklocksforce: locked conditionally above + d.inode.dataMu.Unlock() // +checklocksforce: locked conditionally above } return err } if stat.Mask&linux.STATX_SIZE != 0 { if failureMask&linux.STATX_SIZE == 0 { - // d.size should be kept up to date, and privatized + // d.inode.size should be kept up to date, and privatized // copy-on-write mappings of truncated pages need to be // invalidated, even if InteropModeShared is in effect. - d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above + d.inode.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above } else { - d.dataMu.Unlock() // +checklocksforce: locked conditionally above + d.inode.dataMu.Unlock() // +checklocksforce: locked conditionally above } } } - if d.fs.opts.interop == InteropModeShared { + if d.inode.fs.opts.interop == InteropModeShared { // There's no point to updating d's metadata in this case since // it'll be overwritten by revalidation before the next time it's // used anyway. 
(InteropModeShared inhibits client caching of @@ -1359,28 +1468,28 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs } } if stat.Mask&linux.STATX_MODE != 0 && failureMask&linux.STATX_MODE == 0 { - d.mode.Store(d.fileType() | uint32(stat.Mode)) + d.inode.mode.Store(d.inode.fileType() | uint32(stat.Mode)) } if stat.Mask&linux.STATX_UID != 0 && failureMask&linux.STATX_UID == 0 { - d.uid.Store(stat.UID) + d.inode.uid.Store(stat.UID) } if stat.Mask&linux.STATX_GID != 0 && failureMask&linux.STATX_GID == 0 { - d.gid.Store(stat.GID) + d.inode.gid.Store(stat.GID) } // Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because - // if d.cachedMetadataAuthoritative() then we converted stat.Atime and + // if d.inode.cachedMetadataAuthoritative() then we converted stat.Atime and // stat.Mtime to client-local timestamps above, and if - // !d.cachedMetadataAuthoritative() then we returned after calling + // !d.inode.cachedMetadataAuthoritative() then we returned after calling // d.file.setAttr(). For the same reason, now must have been initialized. if stat.Mask&linux.STATX_ATIME != 0 && failureMask&linux.STATX_ATIME == 0 { - d.atime.Store(stat.Atime.ToNsec()) - d.atimeDirty.Store(0) + d.inode.atime.Store(stat.Atime.ToNsec()) + d.inode.atimeDirty.Store(0) } if stat.Mask&linux.STATX_MTIME != 0 && failureMask&linux.STATX_MTIME == 0 { - d.mtime.Store(stat.Mtime.ToNsec()) - d.mtimeDirty.Store(0) + d.inode.mtime.Store(stat.Mtime.ToNsec()) + d.inode.mtimeDirty.Store(0) } - d.ctime.Store(now) + d.inode.ctime.Store(now) if failureMask != 0 { // Setting some attribute failed on the remote filesystem. return failureErr @@ -1388,15 +1497,15 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs return nil } -// doAllocate performs an allocate operation on d. Note that d.metadataMu will +// doAllocate performs an allocate operation on d. Note that d.inode.metadataMu will // be held when allocate is called. 
func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error { - d.metadataMu.Lock() - defer d.metadataMu.Unlock() + d.inode.metadataMu.Lock() + defer d.inode.metadataMu.Unlock() // Allocating a smaller size is a noop. size := offset + length - if d.cachedMetadataAuthoritative() && size <= d.size.RacyLoad() { + if d.inode.cachedMetadataAuthoritative() && size <= d.inode.size.RacyLoad() { return nil } @@ -1404,58 +1513,58 @@ func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate if err != nil { return err } - d.updateSizeLocked(size) - if d.cachedMetadataAuthoritative() { + d.inode.updateSizeLocked(size) + if d.inode.cachedMetadataAuthoritative() { d.touchCMtimeLocked() } return nil } -// Preconditions: d.metadataMu must be locked. -func (d *dentry) updateSizeLocked(newSize uint64) { - d.dataMu.Lock() - d.updateSizeAndUnlockDataMuLocked(newSize) +// Preconditions: d.inode.metadataMu must be locked. +func (i *inode) updateSizeLocked(newSize uint64) { + i.dataMu.Lock() + i.updateSizeAndUnlockDataMuLocked(newSize) } -// Preconditions: d.metadataMu and d.dataMu must be locked. +// Preconditions: i.metadataMu and i.dataMu must be locked. // -// Postconditions: d.dataMu is unlocked. -// +checklocksrelease:d.dataMu -func (d *dentry) updateSizeAndUnlockDataMuLocked(newSize uint64) { - oldSize := d.size.RacyLoad() - d.size.Store(newSize) - // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings +// Postconditions: i.dataMu is unlocked. +// +checklocksrelease:i.dataMu +func (i *inode) updateSizeAndUnlockDataMuLocked(newSize uint64) { + oldSize := i.size.RacyLoad() + i.size.Store(newSize) + // i.dataMu must be unlocked to lock i.mapsMu and invalidate mappings // below. This allows concurrent calls to Read/Translate/etc. These // functions synchronize with truncation by refusing to use cache - // contents beyond the new d.size. (We are still holding d.metadataMu, + // contents beyond the new i.size. 
(We are still holding i.metadataMu, // so we can't race with Write or another truncate.) - d.dataMu.Unlock() + i.dataMu.Unlock() if newSize < oldSize { oldpgend, _ := hostarch.PageRoundUp(oldSize) newpgend, _ := hostarch.PageRoundUp(newSize) if oldpgend != newpgend { - d.mapsMu.Lock() - d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + i.mapsMu.Lock() + i.mappings.Invalidate(memmap.MappableRange{Start: newpgend, End: oldpgend}, memmap.InvalidateOpts{ // Compare Linux's mm/truncate.c:truncate_setsize() => // truncate_pagecache() => // mm/memory.c:unmap_mapping_range(evencows=1). InvalidatePrivate: true, }) - d.mapsMu.Unlock() + i.mapsMu.Unlock() } // We are now guaranteed that there are no translations of // truncated pages, and can remove them from the cache. Since // truncated pages have been removed from the remote file, they // should be dropped without being written back. - d.dataMu.Lock() - d.cache.Truncate(newSize, d.fs.mf) - d.dirty.KeepClean(memmap.MappableRange{newSize, oldpgend}) - d.dataMu.Unlock() + i.dataMu.Lock() + i.cache.Truncate(newSize, i.fs.mf) + i.dirty.KeepClean(memmap.MappableRange{Start: newSize, End: oldpgend}) + i.dataMu.Unlock() } } func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { - return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())) + return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(d.inode.mode.Load()), auth.KUID(d.inode.uid.Load()), auth.KGID(d.inode.gid.Load())) } func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { @@ -1474,9 +1583,9 @@ func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats if ats.MayWrite() && strings.HasPrefix(name, linux.XATTR_SECURITY_PREFIX) { return linuxerr.EOPNOTSUPP } - mode := linux.FileMode(d.mode.Load()) - kuid := auth.KUID(d.uid.Load()) - kgid := 
auth.KGID(d.gid.Load()) + mode := linux.FileMode(d.inode.mode.Load()) + kuid := auth.KUID(d.inode.uid.Load()) + kgid := auth.KGID(d.inode.gid.Load()) if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { return err } @@ -1486,10 +1595,10 @@ func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { return vfs.CheckDeleteSticky( creds, - linux.FileMode(d.mode.Load()), - auth.KUID(d.uid.Load()), - auth.KUID(child.uid.Load()), - auth.KGID(child.gid.Load()), + linux.FileMode(d.inode.mode.Load()), + auth.KUID(d.inode.uid.Load()), + auth.KUID(child.inode.uid.Load()), + auth.KGID(child.inode.gid.Load()), ) } @@ -1509,7 +1618,7 @@ func dentryGID(gid lisafs.GID) uint32 { // IncRef implements vfs.DentryImpl.IncRef. func (d *dentry) IncRef() { - // d.refs may be 0 if d.fs.renameMu is locked, which serializes against + // d.refs may be 0 if d.inode.fs.renameMu is locked, which serializes against // d.checkCachingLocked(). r := d.refs.Add(1) if d.LogRefs() { @@ -1578,18 +1687,18 @@ func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, e events |= linux.IN_ISDIR } - d.fs.ancestryMu.RLock() + d.inode.fs.ancestryMu.RLock() // The ordering below is important, Linux always notifies the parent first. if parent := d.parent.Load(); parent != nil { - parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted()) + parent.inode.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted()) } - d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted()) - d.fs.ancestryMu.RUnlock() + d.inode.watches.Notify(ctx, "", events, cookie, et, d.isDeleted()) + d.inode.fs.ancestryMu.RUnlock() } // Watches implements vfs.DentryImpl.Watches. func (d *dentry) Watches() *vfs.Watches { - return &d.watches + return &d.inode.watches } // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. 
@@ -1613,7 +1722,7 @@ func (d *dentry) OnZeroWatches(ctx context.Context) { // operation. One of the calls may destroy the dentry, so subsequent calls will // do nothing. // -// Preconditions: d.fs.renameMu must be locked for writing if +// Preconditions: d.inode.fs.renameMu must be locked for writing if // renameMuWriteLocked is true; it may be temporarily unlocked. func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked bool) { d.cachingMu.Lock() @@ -1640,9 +1749,9 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo d.removeFromCacheLocked() d.cachingMu.Unlock() if !renameMuWriteLocked { - // Need to lock d.fs.renameMu for writing as needed by d.destroyLocked(). - d.fs.renameMu.Lock() - defer d.fs.renameMu.Unlock() + // Need to lock d.inode.fs.renameMu for writing as needed by d.destroyLocked(). + d.inode.fs.renameMu.Lock() + defer d.inode.fs.renameMu.Unlock() // Now that renameMu is locked for writing, no more refs can be taken on // d because path resolution requires renameMu for reading at least. if d.refs.Load() != 0 { @@ -1652,7 +1761,7 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo } } if d.isDeleted() { - d.watches.HandleDeletion(ctx) + d.inode.watches.HandleDeletion(ctx) } d.destroyLocked(ctx) // +checklocksforce: renameMu must be acquired at this point. return @@ -1670,22 +1779,22 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo // If d still has inotify watches and it is not deleted or invalidated, it // can't be evicted. Otherwise, we will lose its watches, even if a new // dentry is created for the same file in the future. Note that the size of - // d.watches cannot concurrently transition from zero to non-zero, because + // d.inode.watches cannot concurrently transition from zero to non-zero, because // adding a watch requires holding a reference on d. 
- if d.watches.Size() > 0 { + if d.inode.watches.Size() > 0 { // As in the refs > 0 case, removing d is beneficial. d.removeFromCacheLocked() d.cachingMu.Unlock() return } - if d.fs.released.Load() != 0 { + if d.inode.fs.released.Load() != 0 { d.cachingMu.Unlock() if !renameMuWriteLocked { - // Need to lock d.fs.renameMu to access d.parent. Lock it for writing as + // Need to lock d.inode.fs.renameMu to access d.parent. Lock it for writing as // needed by d.destroyLocked() later. - d.fs.renameMu.Lock() - defer d.fs.renameMu.Unlock() + d.inode.fs.renameMu.Lock() + defer d.inode.fs.renameMu.Unlock() } if parent := d.parent.Load(); parent != nil { parent.childrenMu.Lock() @@ -1696,42 +1805,42 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo return } - d.fs.dentryCache.mu.Lock() + d.inode.fs.dentryCache.mu.Lock() // If d is already cached, just move it to the front of the LRU. if d.cached { - d.fs.dentryCache.dentries.Remove(&d.cacheEntry) - d.fs.dentryCache.dentries.PushFront(&d.cacheEntry) - d.fs.dentryCache.mu.Unlock() + d.inode.fs.dentryCache.dentries.Remove(&d.cacheEntry) + d.inode.fs.dentryCache.dentries.PushFront(&d.cacheEntry) + d.inode.fs.dentryCache.mu.Unlock() d.cachingMu.Unlock() return } // Cache the dentry, then evict the least recently used cached dentry if // the cache becomes over-full. 
- d.fs.dentryCache.dentries.PushFront(&d.cacheEntry) - d.fs.dentryCache.dentriesLen++ + d.inode.fs.dentryCache.dentries.PushFront(&d.cacheEntry) + d.inode.fs.dentryCache.dentriesLen++ d.cached = true - shouldEvict := d.fs.dentryCache.dentriesLen > d.fs.dentryCache.maxCachedDentries - d.fs.dentryCache.mu.Unlock() + shouldEvict := d.inode.fs.dentryCache.dentriesLen > d.inode.fs.dentryCache.maxCachedDentries + d.inode.fs.dentryCache.mu.Unlock() d.cachingMu.Unlock() if shouldEvict { if !renameMuWriteLocked { - // Need to lock d.fs.renameMu for writing as needed by + // Need to lock d.inode.fs.renameMu for writing as needed by // d.evictCachedDentryLocked(). - d.fs.renameMu.Lock() - defer d.fs.renameMu.Unlock() + d.inode.fs.renameMu.Lock() + defer d.inode.fs.renameMu.Unlock() } - d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above. + d.inode.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above. } } // Preconditions: d.cachingMu must be locked. func (d *dentry) removeFromCacheLocked() { if d.cached { - d.fs.dentryCache.mu.Lock() - d.fs.dentryCache.dentries.Remove(&d.cacheEntry) - d.fs.dentryCache.dentriesLen-- - d.fs.dentryCache.mu.Unlock() + d.inode.fs.dentryCache.mu.Lock() + d.inode.fs.dentryCache.dentries.Remove(&d.cacheEntry) + d.inode.fs.dentryCache.dentriesLen-- + d.inode.fs.dentryCache.mu.Unlock() d.cached = false } } @@ -1759,7 +1868,7 @@ func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { return } - if victim.d.fs == fs { + if victim.d.inode.fs == fs { victim.d.evictLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs return } @@ -1774,23 +1883,23 @@ func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { } // Preconditions: -// - d.fs.renameMu must not be locked for writing. +// - d.inode.fs.renameMu must not be locked for writing. 
func (d *dentry) evict(ctx context.Context) { - d.fs.renameMu.Lock() - defer d.fs.renameMu.Unlock() + d.inode.fs.renameMu.Lock() + defer d.inode.fs.renameMu.Unlock() d.evictLocked(ctx) } // Preconditions: -// - d.fs.renameMu must be locked for writing; it may be temporarily unlocked. +// - d.inode.fs.renameMu must be locked for writing; it may be temporarily unlocked. // -// +checklocks:d.fs.renameMu +// +checklocks:d.inode.fs.renameMu func (d *dentry) evictLocked(ctx context.Context) { d.cachingMu.Lock() d.removeFromCacheLocked() - // d.refs or d.watches.Size() may have become non-zero from an earlier path + // d.refs or d.inode.watches.Size() may have become non-zero from an earlier path // resolution since it was inserted into fs.dentryCache.dentries. - if d.refs.Load() != 0 || d.watches.Size() != 0 { + if d.refs.Load() != 0 || d.inode.watches.Size() != 0 { d.cachingMu.Unlock() return } @@ -1799,7 +1908,7 @@ func (d *dentry) evictLocked(ctx context.Context) { if !d.vfsd.IsDead() { // Note that d can't be a mount point (in any mount namespace), since VFS // holds references on mount points. - rcs := d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &d.vfsd) + rcs := d.inode.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &d.vfsd) for _, rc := range rcs { rc.DecRef(ctx) } @@ -1825,43 +1934,8 @@ func (d *dentry) evictLocked(ctx context.Context) { // destroyDisconnected destroys an uncached, unparented dentry. There are no // locking preconditions. func (d *dentry) destroyDisconnected(ctx context.Context) { - mf := d.fs.mf - - d.handleMu.Lock() - d.dataMu.Lock() - - if d.isWriteHandleOk() { - // Write dirty pages back to the remote filesystem. - h := d.writeHandle() - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil { - log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err) - } - } - // Discard cached data. 
- if !d.cache.IsEmpty() { - mf.MarkAllUnevictable(d) - d.cache.DropAll(mf) - d.dirty.RemoveAll() - } - d.dataMu.Unlock() - - // Close any resources held by the implementation. - d.destroyImpl(ctx) - - // Can use RacyLoad() because handleMu is locked. - if d.readFD.RacyLoad() >= 0 { - _ = unix.Close(int(d.readFD.RacyLoad())) - } - if d.writeFD.RacyLoad() >= 0 && d.readFD.RacyLoad() != d.writeFD.RacyLoad() { - _ = unix.Close(int(d.writeFD.RacyLoad())) - } - d.readFD = atomicbitops.FromInt32(-1) - d.writeFD = atomicbitops.FromInt32(-1) - d.mmapFD = atomicbitops.FromInt32(-1) - d.handleMu.Unlock() - - if !d.isSynthetic() { - // Note that it's possible that d.atimeDirty or d.mtimeDirty are true, + if !d.inode.isSynthetic() { + // Note that it's possible that d.inode.atimeDirty or d.inode.mtimeDirty are true, // i.e. client and server timestamps may differ (because e.g. a client // write was serviced by the page cache, and only written back to the // remote file later). Ideally, we'd write client timestamps back to @@ -1871,9 +1945,9 @@ func (d *dentry) destroyDisconnected(ctx context.Context) { // don't do this. // Remove d from the set of syncable dentries. - d.fs.syncMu.Lock() - d.fs.syncableDentries.Remove(&d.syncableListEntry) - d.fs.syncMu.Unlock() + d.inode.fs.syncMu.Lock() + d.inode.fs.syncableDentries.Remove(&d.syncableListEntry) + d.inode.fs.syncMu.Unlock() } // Drop references and stop tracking this child. @@ -1884,12 +1958,12 @@ func (d *dentry) destroyDisconnected(ctx context.Context) { // destroyLocked destroys the dentry. // // Preconditions: -// - d.fs.renameMu must be locked for writing; it may be temporarily unlocked. +// - d.inode.fs.renameMu must be locked for writing; it may be temporarily unlocked. // - d.refs == 0. // - d.parent.children[d.name] != d, i.e. d is not reachable by path traversal // from its former parent dentry. 
// -// +checklocks:d.fs.renameMu +// +checklocks:d.inode.fs.renameMu func (d *dentry) destroyLocked(ctx context.Context) { switch d.refs.Load() { case 0: @@ -1901,17 +1975,20 @@ func (d *dentry) destroyLocked(ctx context.Context) { panic("dentry.destroyLocked() called with references on the dentry") } + // Decrement inode reference count. + d.inode.decRef(ctx, d) + // Allow the following to proceed without renameMu locked to improve // scalability. - d.fs.renameMu.Unlock() + d.inode.fs.renameMu.Unlock() // No locks need to be held during destoryDisconnected. d.destroyDisconnected(ctx) - d.fs.renameMu.Lock() + d.inode.fs.renameMu.Lock() // Drop the reference held by d on its parent without recursively locking - // d.fs.renameMu. + // d.inode.fs.renameMu. if parent := d.parent.Load(); parent != nil && parent.decRefNoCaching() == 0 { parent.checkCachingLocked(ctx, true /* renameMuWriteLocked */) @@ -1927,15 +2004,14 @@ func (d *dentry) setDeleted() { } func (d *dentry) listXattr(ctx context.Context, size uint64) ([]string, error) { - if d.isSynthetic() { + if d.inode.isSynthetic() { return nil, nil } - return d.listXattrImpl(ctx, size) } func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { - if d.isSynthetic() { + if d.inode.isSynthetic() { return "", linuxerr.ENODATA } if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { @@ -1945,7 +2021,7 @@ func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vf } func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error { - if d.isSynthetic() { + if d.inode.isSynthetic() { return linuxerr.EPERM } if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { @@ -1955,36 +2031,36 @@ func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vf } func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error 
{ - if d.isSynthetic() { + if d.inode.isSynthetic() { return linuxerr.EPERM } if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { return err } - return d.removeXattrImpl(ctx, name) + return d.inode.removeXattrImpl(ctx, name) } // Preconditions: -// - !d.isSynthetic(). +// - !d.inode.isSynthetic(). // - d.isRegularFile() || d.isDir(). // - fs.renameMu is locked. func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { // O_TRUNC unconditionally requires us to obtain a new handle (opened with // O_TRUNC). if !trunc { - d.handleMu.RLock() - canReuseCurHandle := (!read || d.isReadHandleOk()) && (!write || d.isWriteHandleOk()) - d.handleMu.RUnlock() + d.inode.handleMu.RLock() + canReuseCurHandle := (!read || d.inode.isReadHandleOk()) && (!write || d.inode.isWriteHandleOk()) + d.inode.handleMu.RUnlock() if canReuseCurHandle { // Current handles are sufficient. return nil } } - d.handleMu.Lock() - needNewHandle := (read && !d.isReadHandleOk()) || (write && !d.isWriteHandleOk()) || trunc + d.inode.handleMu.Lock() + needNewHandle := (read && !d.inode.isReadHandleOk()) || (write && !d.inode.isWriteHandleOk()) || trunc if !needNewHandle { - d.handleMu.Unlock() + d.inode.handleMu.Unlock() return nil } @@ -1996,11 +2072,10 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // // - Writable memory mappings of a host FD require that the host FD is // opened for both reading and writing. - // // - NOTE(b/141991141): Some filesystems may not ensure coherence // between multiple handles for the same file. 
- openReadable := d.isReadHandleOk() || read - openWritable := d.isWriteHandleOk() || write + openReadable := d.inode.isReadHandleOk() || read + openWritable := d.inode.isWriteHandleOk() || write h, err := d.openHandle(ctx, openReadable, openWritable, trunc) if linuxerr.Equals(linuxerr.EACCES, err) && (openReadable != read || openWritable != write) { // It may not be possible to use a single handle for both @@ -2014,73 +2089,73 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool h, err = d.openHandle(ctx, openReadable, openWritable, trunc) } if err != nil { - d.handleMu.Unlock() + d.inode.handleMu.Unlock() return err } - // Update d.readFD and d.writeFD + // Update d.inode.readFD and d.inode.writeFD if h.fd >= 0 { - if openReadable && openWritable && (d.readFD.RacyLoad() < 0 || d.writeFD.RacyLoad() < 0 || d.readFD.RacyLoad() != d.writeFD.RacyLoad()) { + if openReadable && openWritable && (d.inode.readFD.RacyLoad() < 0 || d.inode.writeFD.RacyLoad() < 0 || d.inode.readFD.RacyLoad() != d.inode.writeFD.RacyLoad()) { // Replace existing FDs with this one. - if d.readFD.RacyLoad() >= 0 { + if d.inode.readFD.RacyLoad() >= 0 { // We already have a readable FD that may be in use by - // concurrent callers of d.pf.FD(). - if d.fs.opts.overlayfsStaleRead { + // concurrent callers of d.inode.pf.FD(). + if d.inode.fs.opts.overlayfsStaleRead { // If overlayfsStaleRead is in effect, then the new FD // may not be coherent with the existing one, so we // have no choice but to switch to mappings of the new // FD in both the application and sentry. 
- if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { - d.handleMu.Unlock() + if err := d.inode.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { + d.inode.handleMu.Unlock() ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err) h.close(ctx) return err } - fdsToClose = append(fdsToClose, d.readFD.RacyLoad()) + fdsToClose = append(fdsToClose, d.inode.readFD.RacyLoad()) invalidateTranslations = true - d.readFD.Store(h.fd) + d.inode.readFD.Store(h.fd) } else { // Otherwise, we want to avoid invalidating existing // memmap.Translations (which is expensive); instead, use // dup3 to make the old file descriptor refer to the new // file description, then close the new file descriptor - // (which is no longer needed). Racing callers of d.pf.FD() + // (which is no longer needed). Racing callers of d.inode.pf.FD() // may use the old or new file description, but this // doesn't matter since they refer to the same file, and // any racing mappings must be read-only. 
- if err := unix.Dup3(int(h.fd), int(d.readFD.RacyLoad()), unix.O_CLOEXEC); err != nil { - oldFD := d.readFD.RacyLoad() - d.handleMu.Unlock() + if err := unix.Dup3(int(h.fd), int(d.inode.readFD.RacyLoad()), unix.O_CLOEXEC); err != nil { + oldFD := d.inode.readFD.RacyLoad() + d.inode.handleMu.Unlock() ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldFD, err) h.close(ctx) return err } fdsToClose = append(fdsToClose, h.fd) - h.fd = d.readFD.RacyLoad() + h.fd = d.inode.readFD.RacyLoad() } } else { - d.readFD.Store(h.fd) + d.inode.readFD.Store(h.fd) } - if d.writeFD.RacyLoad() != h.fd && d.writeFD.RacyLoad() >= 0 { - fdsToClose = append(fdsToClose, d.writeFD.RacyLoad()) + if d.inode.writeFD.RacyLoad() != h.fd && d.inode.writeFD.RacyLoad() >= 0 { + fdsToClose = append(fdsToClose, d.inode.writeFD.RacyLoad()) } - d.writeFD.Store(h.fd) - d.mmapFD.Store(h.fd) - } else if openReadable && d.readFD.RacyLoad() < 0 { - readHandleWasOk := d.isReadHandleOk() - d.readFD.Store(h.fd) + d.inode.writeFD.Store(h.fd) + d.inode.mmapFD.Store(h.fd) + } else if openReadable && d.inode.readFD.RacyLoad() < 0 { + readHandleWasOk := d.inode.isReadHandleOk() + d.inode.readFD.Store(h.fd) // If the file has not been opened for writing, the new FD may // be used for read-only memory mappings. If the file was // previously opened for reading (without an FD), then existing // translations of the file may use the internal page cache; // invalidate those mappings. 
- if !d.isWriteHandleOk() { + if !d.inode.isWriteHandleOk() { invalidateTranslations = readHandleWasOk - d.mmapFD.Store(h.fd) + d.inode.mmapFD.Store(h.fd) } - } else if openWritable && d.writeFD.RacyLoad() < 0 { - d.writeFD.Store(h.fd) - if d.readFD.RacyLoad() >= 0 { + } else if openWritable && d.inode.writeFD.RacyLoad() < 0 { + d.inode.writeFD.Store(h.fd) + if d.inode.readFD.RacyLoad() >= 0 { // We have an existing read-only FD, but the file has just // been opened for writing, so we need to start supporting // writable memory mappings. However, the new FD is not @@ -2088,32 +2163,32 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // writable memory mappings. Switch to using the internal // page cache. invalidateTranslations = true - d.mmapFD.Store(-1) + d.inode.mmapFD.Store(-1) } } else { // The new FD is not useful. fdsToClose = append(fdsToClose, h.fd) } - } else if openWritable && d.writeFD.RacyLoad() < 0 && d.mmapFD.RacyLoad() >= 0 { + } else if openWritable && d.inode.writeFD.RacyLoad() < 0 && d.inode.mmapFD.RacyLoad() >= 0 { // We have an existing read-only FD, but the file has just been // opened for writing, so we need to start supporting writable // memory mappings. However, we have no writable host FD. Switch to // using the internal page cache. invalidateTranslations = true - d.mmapFD.Store(-1) + d.inode.mmapFD.Store(-1) } - d.updateHandles(ctx, h, openReadable, openWritable) - d.handleMu.Unlock() + d.inode.updateHandles(ctx, h, openReadable, openWritable) + d.inode.handleMu.Unlock() if invalidateTranslations { // Invalidate application mappings that may be using an old FD; they // will be replaced with mappings using the new FD after future calls - // to d.Translate(). This requires holding d.mapsMu, which precedes - // d.handleMu in the lock order. - d.mapsMu.Lock() - d.mappings.InvalidateAll(memmap.InvalidateOpts{}) - d.mapsMu.Unlock() + // to d.Translate(). 
This requires holding d.inode.mapsMu, which precedes + // d.inode.handleMu in the lock order. + d.inode.mapsMu.Lock() + d.inode.mappings.InvalidateAll(memmap.InvalidateOpts{}) + d.inode.mapsMu.Unlock() } for _, fd := range fdsToClose { unix.Close(int(fd)) @@ -2123,32 +2198,32 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool } func (d *dentry) syncRemoteFile(ctx context.Context) error { - d.handleMu.RLock() - defer d.handleMu.RUnlock() + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() return d.syncRemoteFileLocked(ctx) } -// Preconditions: d.handleMu must be locked. +// Preconditions: d.inode.handleMu must be locked. func (d *dentry) syncRemoteFileLocked(ctx context.Context) error { // Prefer syncing write handles over read handles, since some remote // filesystem implementations may not sync changes made through write // handles otherwise. - wh := d.writeHandle() + wh := d.inode.writeHandle() wh.sync(ctx) - rh := d.readHandle() + rh := d.inode.readHandle() rh.sync(ctx) return nil } func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error { - d.handleMu.RLock() - defer d.handleMu.RUnlock() - if d.isWriteHandleOk() { + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() + if d.inode.isWriteHandleOk() { // Write back dirty pages to the remote file. - d.dataMu.Lock() - h := d.writeHandle() - err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), d.fs.mf, h.writeFromBlocksAt) - d.dataMu.Unlock() + d.inode.dataMu.Lock() + h := d.inode.writeHandle() + err := fsutil.SyncDirtyAll(ctx, &d.inode.cache, &d.inode.dirty, d.inode.size.Load(), d.inode.fs.mf, h.writeFromBlocksAt) + d.inode.dataMu.Unlock() if err != nil { return err } @@ -2159,7 +2234,7 @@ func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) err } // Only return err if we can reasonably have expected sync to succeed // (d is a regular file and was opened for writing). 
- if d.isRegularFile() && d.isWriteHandleOk() { + if d.inode.isRegularFile() && d.inode.isWriteHandleOk() { return err } ctx.Debugf("gofer.dentry.syncCachedFile: syncing non-writable or non-regular-file dentry failed: %v", err) @@ -2169,20 +2244,20 @@ func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) err // incLinks increments link count. func (d *dentry) incLinks() { - if d.nlink.Load() == 0 { + if d.inode.nlink.Load() == 0 { // The remote filesystem doesn't support link count. return } - d.nlink.Add(1) + d.inode.nlink.Add(1) } // decLinks decrements link count. func (d *dentry) decLinks() { - if d.nlink.Load() == 0 { + if d.inode.nlink.Load() == 0 { // The remote filesystem doesn't support link count. return } - d.nlink.Add(^uint32(0)) + d.inode.nlink.Add(^uint32(0)) } // fileDescription is embedded by gofer implementations of @@ -2209,14 +2284,14 @@ func (fd *fileDescription) dentry() *dentry { func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { d := fd.dentry() const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME) - if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { + if !d.inode.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { // Use specialFileFD.handle.fileLisa for the Stat if available, for the // same reason that we try to use open FD in updateMetadataLocked(). 
var err error if sffd, ok := fd.vfsfd.Impl().(*specialFileFD); ok { err = sffd.updateMetadata(ctx) } else { - err = d.updateMetadata(ctx) + err = d.inode.updateMetadata(ctx) } if err != nil { return linux.Statx{}, err diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go index d3c072814a..9e30fb5783 100644 --- a/pkg/sentry/fsimpl/gofer/gofer_test.go +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -27,27 +27,32 @@ import ( func TestDestroyIdempotent(t *testing.T) { ctx := contexttest.Context(t) fs := filesystem{ - mf: pgalloc.MemoryFileFromContext(ctx), - inoByKey: make(map[inoKey]uint64), - clock: ktime.RealtimeClockFromContext(ctx), + mf: pgalloc.MemoryFileFromContext(ctx), + inoByKey: make(map[inoKey]uint64), + inodeByIno: make(map[inoKey]*inode), + clock: ktime.RealtimeClockFromContext(ctx), // Test relies on no dentry being held in the cache. dentryCache: &dentryCache{maxCachedDentries: 0}, client: &lisafs.Client{}, } - parentInode := lisafs.Inode{ + parentRemoteInode := lisafs.Inode{ ControlFD: 1, Stat: linux.Statx{ Mask: linux.STATX_TYPE | linux.STATX_MODE, Mode: linux.S_IFDIR | 0666, }, } - parent, err := fs.newLisafsDentry(ctx, &parentInode) + parentInode, err := fs.newLisafsInode(ctx, &parentRemoteInode) if err != nil { - t.Fatalf("fs.newLisafsDentry(): %v", err) + t.Fatalf("fs.newLisafsInode(): %v", err) + } + parent, err := fs.newDentry(parentInode) + if err != nil { + t.Fatalf("fs.newDentry(): %v", err) } - childInode := lisafs.Inode{ + childRemoteInode := lisafs.Inode{ ControlFD: 2, Stat: linux.Statx{ Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_SIZE, @@ -55,9 +60,14 @@ func TestDestroyIdempotent(t *testing.T) { Size: 0, }, } - child, err := fs.newLisafsDentry(ctx, &childInode) + childInode, err := fs.newLisafsInode(ctx, &childRemoteInode) + if err != nil { + t.Fatalf("fs.newLisafsInode(): %v", err) + } + + child, err := fs.newDentry(childInode) if err != nil { - t.Fatalf("fs.newLisafsDentry(): %v", err) + 
t.Fatalf("fs.newDentry(): %v", err) } parent.opMu.Lock() parent.childrenMu.Lock() diff --git a/pkg/sentry/fsimpl/gofer/dentry_impl.go b/pkg/sentry/fsimpl/gofer/inode_impl.go similarity index 54% rename from pkg/sentry/fsimpl/gofer/dentry_impl.go rename to pkg/sentry/fsimpl/gofer/inode_impl.go index 2fe86e3380..e22a3ec0a5 100644 --- a/pkg/sentry/fsimpl/gofer/dentry_impl.go +++ b/pkg/sentry/fsimpl/gofer/inode_impl.go @@ -44,47 +44,47 @@ import ( // analysis to proceed as usual and avoids heap allocations. // // Also note that the default case in these type switch statements panics. We -// do not do panic(fmt.Sprintf("... %T", d.impl)) because somehow it adds a lot +// do not do panic(fmt.Sprintf("... %T", i.impl)) because somehow it adds a lot // of overhead to the type switch. So instead we panic with a constant string. -// Precondition: d.handleMu must be locked. -func (d *dentry) isReadHandleOk() bool { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.readFDLisa.Ok() - case *directfsDentry: - return d.readFD.RacyLoad() >= 0 - case nil: // synthetic dentry +// Precondition: i.handleMu must be locked. +func (i *inode) isReadHandleOk() bool { + switch it := i.impl.(type) { + case *lisafsInode: + return it.readFDLisa.Ok() + case *directfsInode: + return it.readFD.RacyLoad() >= 0 + case nil: // synthetic inode return false default: - panic("unknown dentry implementation") + panic("unknown inode implementation") } } -// Precondition: d.handleMu must be locked. -func (d *dentry) isWriteHandleOk() bool { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.writeFDLisa.Ok() - case *directfsDentry: - return d.writeFD.RacyLoad() >= 0 - case nil: // synthetic dentry +// Precondition: i.handleMu must be locked. 
+func (i *inode) isWriteHandleOk() bool { + switch it := i.impl.(type) { + case *lisafsInode: + return it.writeFDLisa.Ok() + case *directfsInode: + return i.writeFD.RacyLoad() >= 0 + case nil: // synthetic inode return false default: - panic("unknown dentry implementation") + panic("unknown inode implementation") } } -// Precondition: d.handleMu must be locked. -func (d *dentry) readHandle() handle { - switch dt := d.impl.(type) { - case *lisafsDentry: +// Precondition: i.handleMu must be locked. +func (i *inode) readHandle() handle { + switch it := i.impl.(type) { + case *lisafsInode: return handle{ - fdLisa: dt.readFDLisa, - fd: d.readFD.RacyLoad(), + fdLisa: it.readFDLisa, + fd: i.readFD.RacyLoad(), } - case *directfsDentry: - return handle{fd: d.readFD.RacyLoad()} + case *directfsInode: + return handle{fd: i.readFD.RacyLoad()} case nil: // synthetic dentry return noHandle default: @@ -92,20 +92,20 @@ func (d *dentry) readHandle() handle { } } -// Precondition: d.handleMu must be locked. -func (d *dentry) writeHandle() handle { - switch dt := d.impl.(type) { - case *lisafsDentry: +// Precondition: i.handleMu must be locked. 
+func (i *inode) writeHandle() handle { + switch it := i.impl.(type) { + case *lisafsInode: return handle{ - fdLisa: dt.writeFDLisa, - fd: d.writeFD.RacyLoad(), + fdLisa: it.writeFDLisa, + fd: i.writeFD.RacyLoad(), } - case *directfsDentry: - return handle{fd: d.writeFD.RacyLoad()} - case nil: // synthetic dentry + case *directfsInode: + return handle{fd: i.writeFD.RacyLoad()} + case nil: // synthetic inode return noHandle default: - panic("unknown dentry implementation") + panic("unknown inode implementation") } } @@ -127,24 +127,24 @@ func (d *dentry) openHandle(ctx context.Context, read, write, trunc bool) (handl if trunc { flags |= unix.O_TRUNC } - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.openHandle(ctx, flags) - case *directfsDentry: - return dt.openHandle(ctx, flags) + switch it := d.inode.impl.(type) { + case *lisafsInode: + return it.openHandle(ctx, flags) + case *directfsInode: + return it.openHandle(ctx, flags, d) default: - panic("unknown dentry implementation") + panic("unknown inode implementation") } } // Preconditions: -// - d.handleMu must be locked. +// - i.handleMu must be locked. // - !d.isSynthetic(). -func (d *dentry) updateHandles(ctx context.Context, h handle, readable, writable bool) { - switch dt := d.impl.(type) { - case *lisafsDentry: - dt.updateHandles(ctx, h, readable, writable) - case *directfsDentry: +func (i *inode) updateHandles(ctx context.Context, h handle, readable, writable bool) { + switch it := i.impl.(type) { + case *lisafsInode: + it.updateHandles(ctx, h, readable, writable) + case *directfsInode: // No update needed. default: panic("unknown dentry implementation") @@ -152,25 +152,25 @@ func (d *dentry) updateHandles(ctx context.Context, h handle, readable, writable } // Preconditions: -// - d.handleMu must be locked. +// - i.handleMu must be locked. // - !d.isSynthetic(). -func (d *dentry) closeHostFDs() { - // We can use RacyLoad() because d.handleMu is locked. 
- if d.readFD.RacyLoad() >= 0 { - _ = unix.Close(int(d.readFD.RacyLoad())) - } - if d.writeFD.RacyLoad() >= 0 && d.readFD.RacyLoad() != d.writeFD.RacyLoad() { - _ = unix.Close(int(d.writeFD.RacyLoad())) - } - d.readFD = atomicbitops.FromInt32(-1) - d.writeFD = atomicbitops.FromInt32(-1) - d.mmapFD = atomicbitops.FromInt32(-1) - - switch dt := d.impl.(type) { - case *directfsDentry: - if dt.controlFD >= 0 { - _ = unix.Close(dt.controlFD) - dt.controlFD = -1 +func (i *inode) closeHostFDs() { + // We can use RacyLoad() because i.handleMu is locked. + if i.readFD.RacyLoad() >= 0 { + _ = unix.Close(int(i.readFD.RacyLoad())) + } + if i.writeFD.RacyLoad() >= 0 && i.readFD.RacyLoad() != i.writeFD.RacyLoad() { + _ = unix.Close(int(i.writeFD.RacyLoad())) + } + i.readFD = atomicbitops.FromInt32(-1) + i.writeFD = atomicbitops.FromInt32(-1) + i.mmapFD = atomicbitops.FromInt32(-1) + + switch it := i.impl.(type) { + case *directfsInode: + if it.controlFD >= 0 { + _ = unix.Close(it.controlFD) + it.controlFD = -1 } } } @@ -181,18 +181,18 @@ func (d *dentry) closeHostFDs() { // // Preconditions: // - !d.isSynthetic(). -// - d.metadataMu is locked. +// - i.metadataMu is locked. // -// +checklocks:d.metadataMu -func (d *dentry) updateMetadataLocked(ctx context.Context, h handle) error { +// +checklocks:i.metadataMu +func (i *inode) updateMetadataLocked(ctx context.Context, h handle) error { // Need checklocksforce below because checklocks has no way of knowing that - // d.impl.(*dentryImpl).dentry == d. It can't know that the right metadataMu + // i.impl.(*dentryImpl).dentry == d. It can't know that the right metadataMu // is already locked. - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.updateMetadataLocked(ctx, h) // +checklocksforce: acquired by precondition. - case *directfsDentry: - return dt.updateMetadataLocked(h) // +checklocksforce: acquired by precondition. 
+ switch it := i.impl.(type) { + case *lisafsInode: + return it.updateMetadataLocked(ctx, h) // +checklocksforce: acquired by precondition. + case *directfsInode: + return it.updateMetadataLocked(h) // +checklocksforce: acquired by precondition. default: panic("unknown dentry implementation") } @@ -202,12 +202,12 @@ func (d *dentry) updateMetadataLocked(ctx context.Context, h handle) error { // - !d.isSynthetic(). // - fs.renameMu is locked. func (d *dentry) prepareSetStat(ctx context.Context, stat *linux.Statx) error { - switch dt := d.impl.(type) { - case *lisafsDentry: + switch it := d.inode.impl.(type) { + case *lisafsInode: // Nothing to be done. return nil - case *directfsDentry: - return dt.prepareSetStat(ctx, stat) + case *directfsInode: + return it.prepareSetStat(ctx, stat, d) default: panic("unknown dentry implementation") } @@ -215,11 +215,11 @@ func (d *dentry) prepareSetStat(ctx context.Context, stat *linux.Statx) error { // Precondition: fs.renameMu is locked if d is a socket. func (d *dentry) chmod(ctx context.Context, mode uint16) error { - switch dt := d.impl.(type) { - case *lisafsDentry: - return chmod(ctx, dt.controlFD, mode) - case *directfsDentry: - return dt.chmod(ctx, mode) + switch it := d.inode.impl.(type) { + case *lisafsInode: + return chmod(ctx, it.controlFD, mode) + case *directfsInode: + return it.chmod(ctx, mode, d) default: panic("unknown dentry implementation") } @@ -227,27 +227,27 @@ func (d *dentry) chmod(ctx context.Context, mode uint16) error { // Preconditions: // - !d.isSynthetic(). -// - d.handleMu is locked. +// - i.handleMu is locked. // - fs.renameMu is locked. 
func (d *dentry) setStatLocked(ctx context.Context, stat *linux.Statx) (uint32, error, error) { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.controlFD.SetStat(ctx, stat) - case *directfsDentry: - failureMask, failureErr := dt.setStatLocked(ctx, stat) + switch it := d.inode.impl.(type) { + case *lisafsInode: + return it.controlFD.SetStat(ctx, stat) + case *directfsInode: + failureMask, failureErr := it.setStatLocked(ctx, stat, d) return failureMask, failureErr, nil default: panic("unknown dentry implementation") } } -// Precondition: d.handleMu must be locked. -func (d *dentry) destroyImpl(ctx context.Context) { - switch dt := d.impl.(type) { - case *lisafsDentry: - dt.destroy(ctx) - case *directfsDentry: - dt.destroy(ctx) +// Precondition: i.handleMu must be locked. +func (i *inode) destroyImpl(ctx context.Context, d *dentry) { + switch i := i.impl.(type) { + case *lisafsInode: + i.destroy(ctx, d) + case *directfsInode: + i.destroy(ctx) case nil: // synthetic dentry default: panic("unknown dentry implementation") @@ -258,11 +258,11 @@ func (d *dentry) destroyImpl(ctx context.Context) { // // +checklocksread:d.opMu func (d *dentry) getRemoteChild(ctx context.Context, name string) (*dentry, error) { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.getRemoteChild(ctx, name) - case *directfsDentry: - return dt.getHostChild(name) + switch it := d.inode.impl.(type) { + case *lisafsInode: + return it.getRemoteChild(ctx, name) + case *directfsInode: + return it.getHostChild(name) default: panic("unknown dentry implementation") } @@ -277,14 +277,14 @@ func (d *dentry) getRemoteChild(ctx context.Context, name string) (*dentry, erro // Postcondition: The returned dentry is already cached appropriately. 
// // +checklocksread:d.opMu -func (d *dentry) getRemoteChildAndWalkPathLocked(ctx context.Context, rp resolvingPath, ds **[]*dentry) (*dentry, error) { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.getRemoteChildAndWalkPathLocked(ctx, rp, ds) - case *directfsDentry: +func (i *inode) getRemoteChildAndWalkPathLocked(ctx context.Context, rp resolvingPath, ds **[]*dentry, d *dentry) (*dentry, error) { + switch it := i.impl.(type) { + case *lisafsInode: + return it.getRemoteChildAndWalkPathLocked(ctx, rp, ds, d) + case *directfsInode: // We need to check for races because opMu is read locked which allows // concurrent walks to occur. - return d.fs.getRemoteChildLocked(ctx, d, rp.Component(), true /* checkForRace */, ds) + return i.fs.getRemoteChildLocked(ctx, d, rp.Component(), true /* checkForRace */, ds) default: panic("unknown dentry implementation") } @@ -292,10 +292,10 @@ func (d *dentry) getRemoteChildAndWalkPathLocked(ctx context.Context, rp resolvi // Precondition: !d.isSynthetic(). func (d *dentry) listXattrImpl(ctx context.Context, size uint64) ([]string, error) { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.controlFD.ListXattr(ctx, size) - case *directfsDentry: + switch it := d.inode.impl.(type) { + case *lisafsInode: + return it.controlFD.ListXattr(ctx, size) + case *directfsInode: // Consistent with runsc/fsgofer. return nil, linuxerr.EOPNOTSUPP default: @@ -305,11 +305,11 @@ func (d *dentry) listXattrImpl(ctx context.Context, size uint64) ([]string, erro // Precondition: !d.isSynthetic(). 
func (d *dentry) getXattrImpl(ctx context.Context, opts *vfs.GetXattrOptions) (string, error) { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.controlFD.GetXattr(ctx, opts.Name, opts.Size) - case *directfsDentry: - return dt.getXattr(ctx, opts.Name, opts.Size) + switch it := d.inode.impl.(type) { + case *lisafsInode: + return it.controlFD.GetXattr(ctx, opts.Name, opts.Size) + case *directfsInode: + return it.getXattr(ctx, opts.Name, opts.Size, d) default: panic("unknown dentry implementation") } @@ -317,10 +317,10 @@ func (d *dentry) getXattrImpl(ctx context.Context, opts *vfs.GetXattrOptions) (s // Precondition: !d.isSynthetic(). func (d *dentry) setXattrImpl(ctx context.Context, opts *vfs.SetXattrOptions) error { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.controlFD.SetXattr(ctx, opts.Name, opts.Value, opts.Flags) - case *directfsDentry: + switch it := d.inode.impl.(type) { + case *lisafsInode: + return it.controlFD.SetXattr(ctx, opts.Name, opts.Value, opts.Flags) + case *directfsInode: // Consistent with runsc/fsgofer. return linuxerr.EOPNOTSUPP default: @@ -329,11 +329,11 @@ func (d *dentry) setXattrImpl(ctx context.Context, opts *vfs.SetXattrOptions) er } // Precondition: !d.isSynthetic(). -func (d *dentry) removeXattrImpl(ctx context.Context, name string) error { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.controlFD.RemoveXattr(ctx, name) - case *directfsDentry: +func (i *inode) removeXattrImpl(ctx context.Context, name string) error { + switch it := i.impl.(type) { + case *lisafsInode: + return it.controlFD.RemoveXattr(ctx, name) + case *directfsInode: // Consistent with runsc/fsgofer. return linuxerr.EOPNOTSUPP default: @@ -343,11 +343,11 @@ func (d *dentry) removeXattrImpl(ctx context.Context, name string) error { // Precondition: !d.isSynthetic(). 
func (d *dentry) mknod(ctx context.Context, name string, creds *auth.Credentials, opts *vfs.MknodOptions) (*dentry, error) { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.mknod(ctx, name, creds, opts) - case *directfsDentry: - return dt.mknod(ctx, name, creds, opts) + switch it := d.inode.impl.(type) { + case *lisafsInode: + return it.mknod(ctx, name, creds, opts) + case *directfsInode: + return it.mknod(ctx, name, creds, opts, d) default: panic("unknown dentry implementation") } @@ -356,13 +356,13 @@ func (d *dentry) mknod(ctx context.Context, name string, creds *auth.Credentials // Preconditions: // - !d.isSynthetic(). // - !target.isSynthetic(). -// - d.fs.renameMu must be locked. +// - i.fs.renameMu must be locked. func (d *dentry) link(ctx context.Context, target *dentry, name string) (*dentry, error) { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.link(ctx, target.impl.(*lisafsDentry), name) - case *directfsDentry: - return dt.link(target.impl.(*directfsDentry), name) + switch it := d.inode.impl.(type) { + case *lisafsInode: + return it.link(ctx, target.inode.impl.(*lisafsInode), name) + case *directfsInode: + return it.link(target, name, d) default: panic("unknown dentry implementation") } @@ -370,11 +370,11 @@ func (d *dentry) link(ctx context.Context, target *dentry, name string) (*dentry // Precondition: !d.isSynthetic(). 
func (d *dentry) mkdir(ctx context.Context, name string, mode linux.FileMode, uid auth.KUID, gid auth.KGID, createDentry bool) (*dentry, error) { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.mkdir(ctx, name, mode, uid, gid, createDentry) - case *directfsDentry: - return dt.mkdir(name, mode, uid, gid, createDentry) + switch it := d.inode.impl.(type) { + case *lisafsInode: + return it.mkdir(ctx, name, mode, uid, gid, createDentry) + case *directfsInode: + return it.mkdir(name, mode, uid, gid, createDentry, d) default: panic("unknown dentry implementation") } @@ -382,11 +382,11 @@ func (d *dentry) mkdir(ctx context.Context, name string, mode linux.FileMode, ui // Precondition: !d.isSynthetic(). func (d *dentry) symlink(ctx context.Context, name, target string, creds *auth.Credentials) (*dentry, error) { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.symlink(ctx, name, target, creds) - case *directfsDentry: - return dt.symlink(name, target, creds) + switch it := d.inode.impl.(type) { + case *lisafsInode: + return it.symlink(ctx, name, target, creds) + case *directfsInode: + return it.symlink(name, target, creds, d) default: panic("unknown dentry implementation") } @@ -394,11 +394,11 @@ func (d *dentry) symlink(ctx context.Context, name, target string, creds *auth.C // Precondition: !d.isSynthetic(). 
func (d *dentry) openCreate(ctx context.Context, name string, accessFlags uint32, mode linux.FileMode, uid auth.KUID, gid auth.KGID, createDentry bool) (*dentry, handle, error) { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.openCreate(ctx, name, accessFlags, mode, uid, gid, createDentry) - case *directfsDentry: - return dt.openCreate(name, accessFlags, mode, uid, gid, createDentry) + switch it := d.inode.impl.(type) { + case *lisafsInode: + return it.openCreate(ctx, name, accessFlags, mode, uid, gid, createDentry) + case *directfsInode: + return it.openCreate(name, accessFlags, mode, uid, gid, createDentry, d) default: panic("unknown dentry implementation") } @@ -406,27 +406,27 @@ func (d *dentry) openCreate(ctx context.Context, name string, accessFlags uint32 // Preconditions: // - d.isDir(). -// - d.handleMu must be locked. +// - i.handleMu must be locked. // - !d.isSynthetic(). func (d *dentry) getDirentsLocked(ctx context.Context, recordDirent func(name string, key inoKey, dType uint8)) error { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.getDirentsLocked(ctx, recordDirent) - case *directfsDentry: - return dt.getDirentsLocked(recordDirent) + switch it := d.inode.impl.(type) { + case *lisafsInode: + return it.getDirentsLocked(ctx, recordDirent) + case *directfsInode: + return it.getDirentsLocked(recordDirent, d) default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). -func (d *dentry) flush(ctx context.Context) error { - d.handleMu.RLock() - defer d.handleMu.RUnlock() - switch dt := d.impl.(type) { - case *lisafsDentry: - return flush(ctx, dt.writeFDLisa) - case *directfsDentry: +func (i *inode) flush(ctx context.Context) error { + i.handleMu.RLock() + defer i.handleMu.RUnlock() + switch it := i.impl.(type) { + case *lisafsInode: + return flush(ctx, it.writeFDLisa) + case *directfsInode: // Nothing to do here. 
return nil default: @@ -435,14 +435,14 @@ func (d *dentry) flush(ctx context.Context) error { } // Precondition: !d.isSynthetic(). -func (d *dentry) allocate(ctx context.Context, mode, offset, length uint64) error { - d.handleMu.RLock() - defer d.handleMu.RUnlock() - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.writeFDLisa.Allocate(ctx, mode, offset, length) - case *directfsDentry: - return unix.Fallocate(int(d.writeFD.RacyLoad()), uint32(mode), int64(offset), int64(length)) +func (i *inode) allocate(ctx context.Context, mode, offset, length uint64) error { + i.handleMu.RLock() + defer i.handleMu.RUnlock() + switch it := i.impl.(type) { + case *lisafsInode: + return it.writeFDLisa.Allocate(ctx, mode, offset, length) + case *directfsInode: + return unix.Fallocate(int(i.writeFD.RacyLoad()), uint32(mode), int64(offset), int64(length)) default: panic("unknown dentry implementation") } @@ -459,59 +459,59 @@ func (d *dentry) connect(ctx context.Context, sockType linux.SockType) (int, err euid = lisafs.UID(creds.EffectiveKUID) egid = lisafs.GID(creds.EffectiveKGID) } - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.controlFD.Connect(ctx, sockType, euid, egid) - case *directfsDentry: - return dt.connect(ctx, sockType, euid, egid) + switch it := d.inode.impl.(type) { + case *lisafsInode: + return it.controlFD.Connect(ctx, sockType, euid, egid) + case *directfsInode: + return it.connect(ctx, sockType, euid, egid, d) default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). 
-func (d *dentry) readlinkImpl(ctx context.Context) (string, error) { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.controlFD.ReadLinkAt(ctx) - case *directfsDentry: - return dt.readlink() +func (i *inode) readlinkImpl(ctx context.Context) (string, error) { + switch it := i.impl.(type) { + case *lisafsInode: + return it.controlFD.ReadLinkAt(ctx) + case *directfsInode: + return it.readlink() default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). -func (d *dentry) unlink(ctx context.Context, name string, flags uint32) error { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.controlFD.UnlinkAt(ctx, name, flags) - case *directfsDentry: - return unix.Unlinkat(dt.controlFD, name, int(flags)) +func (i *inode) unlink(ctx context.Context, name string, flags uint32) error { + switch it := i.impl.(type) { + case *lisafsInode: + return it.controlFD.UnlinkAt(ctx, name, flags) + case *directfsInode: + return unix.Unlinkat(it.controlFD, name, int(flags)) default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). -func (d *dentry) rename(ctx context.Context, oldName string, newParent *dentry, newName string) error { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.controlFD.RenameAt(ctx, oldName, newParent.impl.(*lisafsDentry).controlFD.ID(), newName) - case *directfsDentry: - return fsutil.RenameAt(dt.controlFD, oldName, newParent.impl.(*directfsDentry).controlFD, newName) +func (i *inode) rename(ctx context.Context, oldName string, newParent *dentry, newName string) error { + switch it := i.impl.(type) { + case *lisafsInode: + return it.controlFD.RenameAt(ctx, oldName, newParent.inode.impl.(*lisafsInode).controlFD.ID(), newName) + case *directfsInode: + return fsutil.RenameAt(it.controlFD, oldName, newParent.inode.impl.(*directfsInode).controlFD, newName) default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). 
-func (d *dentry) statfs(ctx context.Context) (linux.Statfs, error) { - switch dt := d.impl.(type) { - case *lisafsDentry: - return dt.statfs(ctx) - case *directfsDentry: - return dt.statfs() +func (i *inode) statfs(ctx context.Context) (linux.Statfs, error) { + switch it := i.impl.(type) { + case *lisafsInode: + return it.statfs(ctx) + case *directfsInode: + return it.statfs() default: panic("unknown dentry implementation") } @@ -524,12 +524,12 @@ func (fs *filesystem) restoreRoot(ctx context.Context, opts *vfs.CompleteRestore } // The root is always non-synthetic. - switch dt := fs.root.impl.(type) { - case *lisafsDentry: - return dt.restoreFile(ctx, &rootInode, opts) - case *directfsDentry: - dt.controlFDLisa = fs.client.NewFD(rootInode.ControlFD) - return dt.restoreFile(ctx, rootHostFD, opts) + switch it := fs.root.inode.impl.(type) { + case *lisafsInode: + return it.restoreFile(ctx, &rootInode, opts, fs.root) + case *directfsInode: + it.controlFDLisa = fs.client.NewFD(rootInode.ControlFD) + return it.restoreFile(ctx, rootHostFD, opts, fs.root) default: panic("unknown dentry implementation") } @@ -539,39 +539,39 @@ func (fs *filesystem) restoreRoot(ctx context.Context, opts *vfs.CompleteRestore // - !d.isSynthetic(). // - d.parent != nil and has been restored. 
func (d *dentry) restoreFile(ctx context.Context, opts *vfs.CompleteRestoreOptions) error { - switch dt := d.impl.(type) { - case *lisafsDentry: - controlFD := d.parent.Load().impl.(*lisafsDentry).controlFD + switch it := d.inode.impl.(type) { + case *lisafsInode: + controlFD := d.parent.Load().inode.impl.(*lisafsInode).controlFD inode, err := controlFD.Walk(ctx, d.name) if err != nil { - if !dt.isDir() || !dt.forMountpoint { - return fmt.Errorf("failed to walk %q of type %x: %w", genericDebugPathname(d.fs, d), dt.fileType(), err) + if !d.isDir() || !d.forMountpoint { + return fmt.Errorf("failed to walk %q of type %x: %w", genericDebugPathname(it.fs, d), it.inode.fileType(), err) } // Recreate directories that were created during volume mounting, since // during restore we don't attempt to remount them. - inode, err = controlFD.MkdirAt(ctx, d.name, linux.FileMode(d.mode.Load()), lisafs.UID(d.uid.Load()), lisafs.GID(d.gid.Load())) + inode, err = controlFD.MkdirAt(ctx, d.name, linux.FileMode(it.mode.Load()), lisafs.UID(it.uid.Load()), lisafs.GID(it.gid.Load())) if err != nil { - return fmt.Errorf("failed to create mountpoint directory at %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to create mountpoint directory at %q: %w", genericDebugPathname(it.fs, d), err) } } - return dt.restoreFile(ctx, &inode, opts) + return it.restoreFile(ctx, &inode, opts, d) - case *directfsDentry: - controlFD := d.parent.Load().impl.(*directfsDentry).controlFD + case *directfsInode: + controlFD := d.parent.Load().inode.impl.(*directfsInode).controlFD childFD, err := tryOpen(func(flags int) (int, error) { n, err := unix.Openat(controlFD, d.name, flags, 0) return n, err }) if err != nil { - if !dt.isDir() || !dt.forMountpoint { - return fmt.Errorf("failed to walk %q of type %x: %w", genericDebugPathname(d.fs, d), dt.fileType(), err) + if !d.isDir() || !d.forMountpoint { + return fmt.Errorf("failed to walk %q of type %x: %w", genericDebugPathname(it.fs, d), 
it.inode.fileType(), err) } // Recreate directories that were created during volume mounting, since // during restore we don't attempt to remount them. - if err := unix.Mkdirat(controlFD, d.name, d.mode.Load()); err != nil { - return fmt.Errorf("failed to create mountpoint directory at %q: %w", genericDebugPathname(d.fs, d), err) + if err := unix.Mkdirat(controlFD, d.name, it.mode.Load()); err != nil { + return fmt.Errorf("failed to create mountpoint directory at %q: %w", genericDebugPathname(it.fs, d), err) } // Try again... @@ -579,10 +579,10 @@ func (d *dentry) restoreFile(ctx context.Context, opts *vfs.CompleteRestoreOptio return unix.Openat(controlFD, d.name, flags, 0) }) if err != nil { - return fmt.Errorf("failed to open %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to open %q: %w", genericDebugPathname(it.fs, d), err) } } - return dt.restoreFile(ctx, childFD, opts) + return it.restoreFile(ctx, childFD, opts, d) default: panic("unknown dentry implementation") @@ -598,13 +598,13 @@ func (d *dentry) restoreFile(ctx context.Context, opts *vfs.CompleteRestoreOptio func (r *revalidateState) doRevalidation(ctx context.Context, vfsObj *vfs.VirtualFilesystem, ds **[]*dentry) error { // Skip synthetic dentries because there is no actual implementation that can // be used to walk the remote filesystem. A start dentry cannot be replaced. 
-	if r.start.isSynthetic() {
+	if r.start.inode.isSynthetic() {
 		return nil
 	}
-	switch r.start.impl.(type) {
-	case *lisafsDentry:
+	switch r.start.inode.impl.(type) {
+	case *lisafsInode:
 		return doRevalidationLisafs(ctx, vfsObj, r, ds)
-	case *directfsDentry:
+	case *directfsInode:
 		return doRevalidationDirectfs(ctx, vfsObj, r, ds)
 	default:
 		panic("unknown dentry implementation")
diff --git a/pkg/sentry/fsimpl/gofer/lisafs_dentry.go b/pkg/sentry/fsimpl/gofer/lisafs_inode.go
similarity index 59%
rename from pkg/sentry/fsimpl/gofer/lisafs_dentry.go
rename to pkg/sentry/fsimpl/gofer/lisafs_inode.go
index d7c5699453..38804fcd2e 100644
--- a/pkg/sentry/fsimpl/gofer/lisafs_dentry.go
+++ b/pkg/sentry/fsimpl/gofer/lisafs_inode.go
@@ -63,12 +63,12 @@ func (fs *filesystem) handleAnameLisafs(ctx context.Context, rootInode lisafs.In
 	}
 }
 
-// lisafsDentry is a gofer dentry implementation. It represents a dentry backed
+// lisafsInode is a gofer inode implementation. It represents an inode backed
 // by a lisafs connection.
 //
 // +stateify savable
-type lisafsDentry struct {
-	dentry
+type lisafsInode struct {
+	inode
 
 	// controlFD is used by lisafs to perform path based operations on this
 	// dentry. controlFD is immutable.
@@ -88,19 +88,20 @@ type lisafsDentry struct {
 	// readFDLisa and writeFDLisa may or may not represent the same LISAFS FD.
 	// Once either transitions from closed (Ok() == false) to open
 	// (Ok() == true), it may be mutated with dentry.handleMu locked, but cannot
-	// be closed until the dentry is destroyed. writeFDLisa is protected by
+	// be closed until the dentry is destroyed. writeFDLisa is protected by
 	// dentry.handleMu.
 	writeFDLisa lisafs.ClientFD `state:"nosave"`
 }
 
-// newLisafsDentry creates a new dentry representing the given file. The dentry
+// newLisafsInode creates a new dentry representing the given file.
The dentry // initially has no references, but is not cached; it is the caller's // responsibility to set the dentry's reference count and/or call // dentry.checkCachingLocked() as appropriate. -// newLisafsDentry takes ownership of ino. -func (fs *filesystem) newLisafsDentry(ctx context.Context, ino *lisafs.Inode) (*dentry, error) { +// newLisafsInode takes ownership of ino. +func (fs *filesystem) newLisafsInode(ctx context.Context, ino *lisafs.Inode) (*inode, error) { + var ret *inode if ino.Stat.Mask&linux.STATX_TYPE == 0 { - ctx.Warningf("can't create gofer.dentry without file type") + ctx.Warningf("can't create gofer.inode without file type") fs.client.CloseFD(ctx, ino.ControlFD, false /* flush */) return nil, linuxerr.EIO } @@ -111,106 +112,108 @@ func (fs *filesystem) newLisafsDentry(ctx context.Context, ino *lisafs.Inode) (* } inoKey := inoKeyFromStatx(&ino.Stat) - d := &lisafsDentry{ - dentry: dentry{ - fs: fs, - inoKey: inoKey, - ino: fs.inoFromKey(inoKey), - mode: atomicbitops.FromUint32(uint32(ino.Stat.Mode)), - uid: atomicbitops.FromUint32(uint32(fs.opts.dfltuid)), - gid: atomicbitops.FromUint32(uint32(fs.opts.dfltgid)), - blockSize: atomicbitops.FromUint32(hostarch.PageSize), - readFD: atomicbitops.FromInt32(-1), - writeFD: atomicbitops.FromInt32(-1), - mmapFD: atomicbitops.FromInt32(-1), - }, - controlFD: fs.client.NewFD(ino.ControlFD), - } - if ino.Stat.Mask&linux.STATX_UID != 0 { - d.uid = atomicbitops.FromUint32(dentryUID(lisafs.UID(ino.Stat.UID))) - } - if ino.Stat.Mask&linux.STATX_GID != 0 { - d.gid = atomicbitops.FromUint32(dentryGID(lisafs.GID(ino.Stat.GID))) - } - if ino.Stat.Mask&linux.STATX_SIZE != 0 { - d.size = atomicbitops.FromUint64(ino.Stat.Size) - } - if ino.Stat.Blksize != 0 { - d.blockSize = atomicbitops.FromUint32(ino.Stat.Blksize) - } - if ino.Stat.Mask&linux.STATX_ATIME != 0 { - d.atime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Atime)) - } else { - d.atime = atomicbitops.FromInt64(fs.clock.Now().Nanoseconds()) - } - if 
ino.Stat.Mask&linux.STATX_MTIME != 0 { - d.mtime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Mtime)) - } else { - d.mtime = atomicbitops.FromInt64(fs.clock.Now().Nanoseconds()) - } - if ino.Stat.Mask&linux.STATX_CTIME != 0 { - d.ctime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Ctime)) - } else { - // Approximate ctime with mtime if ctime isn't available. - d.ctime = atomicbitops.FromInt64(d.mtime.Load()) - } - if ino.Stat.Mask&linux.STATX_BTIME != 0 { - d.btime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Btime)) - } - if ino.Stat.Mask&linux.STATX_NLINK != 0 { - d.nlink = atomicbitops.FromUint32(ino.Stat.Nlink) - } else { - if ino.Stat.Mode&linux.FileTypeMask == linux.ModeDirectory { - d.nlink = atomicbitops.FromUint32(2) + if fs.opts.enableInodeSharing { + cachedInode := fs.findInode(inoKey) + if cachedInode != nil { + ret = cachedInode + } + } + + if ret == nil { + i := &lisafsInode{ + inode: inode{}, + controlFD: fs.client.NewFD(ino.ControlFD), + } + i.init(fs, fs.inoFromKey(inoKey), &inoKey, i) + ret = &i.inode + ret.mode.Store(uint32(ino.Stat.Mode)) + ret.uid.Store(uint32(fs.opts.dfltuid)) + ret.gid.Store(uint32(fs.opts.dfltgid)) + ret.blockSize.Store(uint32(hostarch.PageSize)) + + if ino.Stat.Mask&linux.STATX_UID != 0 { + ret.uid = atomicbitops.FromUint32(dentryUID(lisafs.UID(ino.Stat.UID))) + } + if ino.Stat.Mask&linux.STATX_GID != 0 { + ret.gid = atomicbitops.FromUint32(dentryGID(lisafs.GID(ino.Stat.GID))) + } + if ino.Stat.Mask&linux.STATX_SIZE != 0 { + ret.size = atomicbitops.FromUint64(ino.Stat.Size) + } + if ino.Stat.Blksize != 0 { + ret.blockSize = atomicbitops.FromUint32(ino.Stat.Blksize) + } + if ino.Stat.Mask&linux.STATX_ATIME != 0 { + ret.atime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Atime)) } else { - d.nlink = atomicbitops.FromUint32(1) + ret.atime = atomicbitops.FromInt64(fs.clock.Now().Nanoseconds()) + } + if ino.Stat.Mask&linux.STATX_MTIME != 0 { + ret.mtime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Mtime)) 
+ } else { + ret.mtime = atomicbitops.FromInt64(fs.clock.Now().Nanoseconds()) + } + if ino.Stat.Mask&linux.STATX_CTIME != 0 { + ret.ctime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Ctime)) + } else { + // Approximate ctime with mtime if ctime isn't available. + ret.ctime = atomicbitops.FromInt64(ret.mtime.Load()) + } + if ino.Stat.Mask&linux.STATX_BTIME != 0 { + ret.btime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Btime)) + } + + if ino.Stat.Mask&linux.STATX_NLINK != 0 { + ret.nlink = atomicbitops.FromUint32(ino.Stat.Nlink) + } else { + if ino.Stat.Mode&linux.FileTypeMask == linux.ModeDirectory { + ret.nlink = atomicbitops.FromUint32(2) + } else { + ret.nlink = atomicbitops.FromUint32(1) + } } } - d.dentry.init(d) - fs.syncMu.Lock() - fs.syncableDentries.PushBack(&d.syncableListEntry) - fs.syncMu.Unlock() - return &d.dentry, nil + return ret, nil } -func (d *lisafsDentry) openHandle(ctx context.Context, flags uint32) (handle, error) { - openFD, hostFD, err := d.controlFD.OpenAt(ctx, flags) +func (i *lisafsInode) openHandle(ctx context.Context, flags uint32) (handle, error) { + openFD, hostFD, err := i.controlFD.OpenAt(ctx, flags) if err != nil { return noHandle, err } return handle{ - fdLisa: d.controlFD.Client().NewFD(openFD), + fdLisa: i.controlFD.Client().NewFD(openFD), fd: int32(hostFD), }, nil } -func (d *lisafsDentry) updateHandles(ctx context.Context, h handle, readable, writable bool) { +func (i *lisafsInode) updateHandles(ctx context.Context, h handle, readable, writable bool) { // Switch to new LISAFS FDs. Note that the read, write and mmap host FDs are // updated separately. 
 	oldReadFD := lisafs.InvalidFDID
 	if readable {
-		oldReadFD = d.readFDLisa.ID()
-		d.readFDLisa = h.fdLisa
+		oldReadFD = i.readFDLisa.ID()
+		i.readFDLisa = h.fdLisa
 	}
 	oldWriteFD := lisafs.InvalidFDID
 	if writable {
-		oldWriteFD = d.writeFDLisa.ID()
-		d.writeFDLisa = h.fdLisa
+		oldWriteFD = i.writeFDLisa.ID()
+		i.writeFDLisa = h.fdLisa
 	}
 	// NOTE(b/141991141): Close old FDs before making new fids visible (by
-	// unlocking d.handleMu).
+	// unlocking i.inode.handleMu).
 	if oldReadFD.Ok() {
-		d.fs.client.CloseFD(ctx, oldReadFD, false /* flush */)
+		i.inode.fs.client.CloseFD(ctx, oldReadFD, false /* flush */)
 	}
 	if oldWriteFD.Ok() && oldReadFD != oldWriteFD {
-		d.fs.client.CloseFD(ctx, oldWriteFD, false /* flush */)
+		i.inode.fs.client.CloseFD(ctx, oldWriteFD, false /* flush */)
 	}
 }
 
-// Precondition: d.metadataMu must be locked.
+// Precondition: i.inode.metadataMu must be locked.
 //
-// +checklocks:d.metadataMu
-func (d *lisafsDentry) updateMetadataLocked(ctx context.Context, h handle) error {
+// +checklocks:i.inode.metadataMu
+func (i *lisafsInode) updateMetadataLocked(ctx context.Context, h handle) error {
 	handleMuRLocked := false
 	if !h.fdLisa.Ok() {
 		// Use open FDs in preferenece to the control FD. This may be significantly
@@ -218,17 +221,17 @@ func (d *lisafsDentry) updateMetadataLocked(ctx context.Context, h handle) error
 		// readable one since some filesystem implementations may update a writable
 		// FD's metadata after writes, without making metadata updates immediately
 		// visible to read-only FDs representing the same file. 
- d.handleMu.RLock() + i.inode.handleMu.RLock() switch { - case d.writeFDLisa.Ok(): - h.fdLisa = d.writeFDLisa + case i.writeFDLisa.Ok(): + h.fdLisa = i.writeFDLisa handleMuRLocked = true - case d.readFDLisa.Ok(): - h.fdLisa = d.readFDLisa + case i.readFDLisa.Ok(): + h.fdLisa = i.readFDLisa handleMuRLocked = true default: - h.fdLisa = d.controlFD - d.handleMu.RUnlock() + h.fdLisa = i.controlFD + i.inode.handleMu.RUnlock() } } @@ -236,12 +239,12 @@ func (d *lisafsDentry) updateMetadataLocked(ctx context.Context, h handle) error err := h.fdLisa.StatTo(ctx, &stat) if handleMuRLocked { // handleMu must be released before updateMetadataFromStatLocked(). - d.handleMu.RUnlock() // +checklocksforce: complex case. + i.inode.handleMu.RUnlock() // +checklocksforce: complex case. } if err != nil { return err } - d.updateMetadataFromStatxLocked(&stat) + i.updateMetadataFromStatxLocked(&stat) return nil } @@ -257,39 +260,43 @@ func chmod(ctx context.Context, controlFD lisafs.ClientFD, mode uint16) error { return failureErr } -func (d *lisafsDentry) destroy(ctx context.Context) { - if d.readFDLisa.Ok() && d.readFDLisa.ID() != d.writeFDLisa.ID() { - d.readFDLisa.Close(ctx, false /* flush */) +func (i *lisafsInode) destroy(ctx context.Context, d *dentry) { + if i.readFDLisa.Ok() && i.readFDLisa.ID() != i.writeFDLisa.ID() { + i.readFDLisa.Close(ctx, false /* flush */) } - if d.writeFDLisa.Ok() { - d.writeFDLisa.Close(ctx, false /* flush */) + if i.writeFDLisa.Ok() { + i.writeFDLisa.Close(ctx, false /* flush */) } - if d.controlFD.Ok() { + if i.controlFD.Ok() { // Close the control FD. Propagate the Close RPCs immediately to the server // if the dentry being destroyed is a deleted regular file. This is to // release the disk space on remote immediately. This will flush the above // read/write lisa FDs as well. 
- flushClose := d.isDeleted() && d.isRegularFile() - d.controlFD.Close(ctx, flushClose) + flushClose := d.isDeleted() && i.isRegularFile() + i.controlFD.Close(ctx, flushClose) } } -func (d *lisafsDentry) getRemoteChild(ctx context.Context, name string) (*dentry, error) { - childInode, err := d.controlFD.Walk(ctx, name) +func (i *lisafsInode) getRemoteChild(ctx context.Context, name string) (*dentry, error) { + childInode, err := i.controlFD.Walk(ctx, name) if err != nil { return nil, err } - return d.fs.newLisafsDentry(ctx, &childInode) + inode, err := i.inode.fs.newLisafsInode(ctx, &childInode) + if err != nil { + return nil, err + } + return i.inode.fs.newDentry(inode) } // Preconditions: // - fs.renameMu must be locked. -// - d.opMu must be locked. -// - d.isDir(). +// - i.opMu must be locked. +// - i.isDir(). // - !rp.done() && rp.Component() is not "." or "..". // // Postcondition: The returned dentry is already cached appropriately. -func (d *lisafsDentry) getRemoteChildAndWalkPathLocked(ctx context.Context, rp resolvingPath, ds **[]*dentry) (*dentry, error) { +func (i *lisafsInode) getRemoteChildAndWalkPathLocked(ctx context.Context, rp resolvingPath, ds **[]*dentry, d *dentry) (*dentry, error) { // Collect as many path components as possible to walk. var namesArr [16]string // arbitrarily sized array to help avoid slice allocation. names := namesArr[:0] @@ -304,12 +311,12 @@ func (d *lisafsDentry) getRemoteChildAndWalkPathLocked(ctx context.Context, rp r return true }) // Walk as much of the path as possible in 1 RPC. - _, inodes, err := d.controlFD.WalkMultiple(ctx, names) + _, inodes, err := i.controlFD.WalkMultiple(ctx, names) if err != nil { return nil, err } if len(inodes) == 0 { - // d.opMu is locked. So a new child could not have appeared concurrently. + // i.opMu is locked. So a new child could not have appeared concurrently. // It should be safe to mark this as a negative entry. 
d.childrenMu.Lock() defer d.childrenMu.Unlock() @@ -318,7 +325,7 @@ func (d *lisafsDentry) getRemoteChildAndWalkPathLocked(ctx context.Context, rp r } // Add the walked inodes into the dentry tree. - startParent := &d.dentry + startParent := d curParent := startParent curParentLock := func() { if curParent != startParent { @@ -336,7 +343,7 @@ func (d *lisafsDentry) getRemoteChildAndWalkPathLocked(ctx context.Context, rp r var dentryCreationErr error for i := range inodes { if dentryCreationErr != nil { - d.fs.client.CloseFD(ctx, inodes[i].ControlFD, false /* flush */) + d.inode.fs.client.CloseFD(ctx, inodes[i].ControlFD, false /* flush */) continue } @@ -346,11 +353,17 @@ func (d *lisafsDentry) getRemoteChildAndWalkPathLocked(ctx context.Context, rp r if ok && child != nil { // We raced. Clean up the new inode and proceed with // the cached child. - d.fs.client.CloseFD(ctx, inodes[i].ControlFD, false /* flush */) + d.inode.fs.client.CloseFD(ctx, inodes[i].ControlFD, false /* flush */) } else { // Create and cache the new dentry. 
 			var err error
-			child, err = d.fs.newLisafsDentry(ctx, &inodes[i])
+			inode, err := d.inode.fs.newLisafsInode(ctx, &inodes[i])
+			if err != nil {
+				dentryCreationErr = err
+				curParentUnlock()
+				continue
+			}
+			child, err = d.inode.fs.newDentry(inode)
 			if err != nil {
 				dentryCreationErr = err
 				curParentUnlock()
@@ -377,94 +390,106 @@
 	return ret, dentryCreationErr
 }
 
-func (d *lisafsDentry) newChildDentry(ctx context.Context, childIno *lisafs.Inode, childName string) (*dentry, error) {
-	child, err := d.fs.newLisafsDentry(ctx, childIno)
+func (i *lisafsInode) newChildDentry(ctx context.Context, childIno *lisafs.Inode, childName string) (*dentry, error) {
+	inode, err := i.fs.newLisafsInode(ctx, childIno)
 	if err != nil {
-		if err := d.controlFD.UnlinkAt(ctx, childName, 0 /* flags */); err != nil {
-			log.Warningf("failed to clean up created child %s after newLisafsDentry() failed: %v", childName, err)
+		if err := i.controlFD.UnlinkAt(ctx, childName, 0 /* flags */); err != nil {
+			log.Warningf("failed to clean up created child %q after newLisafsInode() failed: %v", childName, err)
+		}
+		return nil, err
+	}
+	child, err := i.fs.newDentry(inode)
+	if err != nil {
+		if err := i.controlFD.UnlinkAt(ctx, childName, 0 /* flags */); err != nil {
+			log.Warningf("failed to clean up created child %q after newDentry() failed: %v", childName, err)
 		}
 	}
 	return child, err
 }
 
-func (d *lisafsDentry) mknod(ctx context.Context, name string, creds *auth.Credentials, opts *vfs.MknodOptions) (*dentry, error) {
+func (i *lisafsInode) mknod(ctx context.Context, name string, creds *auth.Credentials, opts *vfs.MknodOptions) (*dentry, error) {
 	if _, ok := opts.Endpoint.(transport.HostBoundEndpoint); !ok {
-		childInode, err := d.controlFD.MknodAt(ctx, name, opts.Mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID), opts.DevMinor, opts.DevMajor)
+		childInode, err := i.controlFD.MknodAt(ctx, name, opts.Mode, 
lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID), opts.DevMinor, opts.DevMajor) if err != nil { return nil, err } - return d.newChildDentry(ctx, &childInode, name) + return i.newChildDentry(ctx, &childInode, name) } // This mknod(2) is coming from unix bind(2), as opts.Endpoint is set. sockType := opts.Endpoint.(transport.Endpoint).Type() - childInode, boundSocketFD, err := d.controlFD.BindAt(ctx, sockType, name, opts.Mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID)) + childInode, boundSocketFD, err := i.controlFD.BindAt(ctx, sockType, name, opts.Mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID)) if err != nil { return nil, err } hbep := opts.Endpoint.(transport.HostBoundEndpoint) if err := hbep.SetBoundSocketFD(ctx, boundSocketFD); err != nil { - if err := d.controlFD.UnlinkAt(ctx, name, 0 /* flags */); err != nil { + if err := i.controlFD.UnlinkAt(ctx, name, 0 /* flags */); err != nil { log.Warningf("failed to clean up socket which was created by BindAt RPC: %v", err) } - d.fs.client.CloseFD(ctx, childInode.ControlFD, false /* flush */) + i.inode.fs.client.CloseFD(ctx, childInode.ControlFD, false /* flush */) return nil, err } - child, err := d.newChildDentry(ctx, &childInode, name) + child, err := i.newChildDentry(ctx, &childInode, name) if err != nil { hbep.ResetBoundSocketFD(ctx) return nil, err } // Set the endpoint on the newly created child dentry, and take the // corresponding extra dentry reference. 
- child.endpoint = opts.Endpoint + child.inode.endpoint = opts.Endpoint child.IncRef() return child, nil } -func (d *lisafsDentry) link(ctx context.Context, target *lisafsDentry, name string) (*dentry, error) { - linkInode, err := d.controlFD.LinkAt(ctx, target.controlFD.ID(), name) +func (i *lisafsInode) link(ctx context.Context, target *lisafsInode, name string) (*dentry, error) { + linkInode, err := i.controlFD.LinkAt(ctx, target.controlFD.ID(), name) if err != nil { return nil, err } // TODO(gvisor.dev/issue/6739): Hard linked dentries should share the same // inode fields. - return d.newChildDentry(ctx, &linkInode, name) + return i.newChildDentry(ctx, &linkInode, name) } -func (d *lisafsDentry) mkdir(ctx context.Context, name string, mode linux.FileMode, uid auth.KUID, gid auth.KGID, createDentry bool) (*dentry, error) { - childDirInode, err := d.controlFD.MkdirAt(ctx, name, mode, lisafs.UID(uid), lisafs.GID(gid)) +func (i *lisafsInode) mkdir(ctx context.Context, name string, mode linux.FileMode, uid auth.KUID, gid auth.KGID, createDentry bool) (*dentry, error) { + childDirInode, err := i.controlFD.MkdirAt(ctx, name, mode, lisafs.UID(uid), lisafs.GID(gid)) if err != nil { return nil, err } if !createDentry { return nil, nil } - return d.newChildDentry(ctx, &childDirInode, name) + return i.newChildDentry(ctx, &childDirInode, name) } -func (d *lisafsDentry) symlink(ctx context.Context, name, target string, creds *auth.Credentials) (*dentry, error) { - symlinkInode, err := d.controlFD.SymlinkAt(ctx, name, target, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID)) +func (i *lisafsInode) symlink(ctx context.Context, name, target string, creds *auth.Credentials) (*dentry, error) { + symlinkInode, err := i.controlFD.SymlinkAt(ctx, name, target, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID)) if err != nil { return nil, err } - return d.newChildDentry(ctx, &symlinkInode, name) + return i.newChildDentry(ctx, &symlinkInode, name) } -func 
(d *lisafsDentry) openCreate(ctx context.Context, name string, flags uint32, mode linux.FileMode, uid auth.KUID, gid auth.KGID, createDentry bool) (*dentry, handle, error) { - ino, openFD, hostFD, err := d.controlFD.OpenCreateAt(ctx, name, flags, mode, lisafs.UID(uid), lisafs.GID(gid)) +func (i *lisafsInode) openCreate(ctx context.Context, name string, flags uint32, mode linux.FileMode, uid auth.KUID, gid auth.KGID, createDentry bool) (*dentry, handle, error) { + ino, openFD, hostFD, err := i.controlFD.OpenCreateAt(ctx, name, flags, mode, lisafs.UID(uid), lisafs.GID(gid)) if err != nil { return nil, noHandle, err } h := handle{ - fdLisa: d.fs.client.NewFD(openFD), + fdLisa: i.inode.fs.client.NewFD(openFD), fd: int32(hostFD), } if !createDentry { return nil, h, nil } - child, err := d.fs.newLisafsDentry(ctx, &ino) + inode, err := i.inode.fs.newLisafsInode(ctx, &ino) + if err != nil { + h.close(ctx) + return nil, noHandle, err + } + child, err := i.inode.fs.newDentry(inode) if err != nil { h.close(ctx) return nil, noHandle, err @@ -478,7 +503,7 @@ const lisafsGetdentsCount = int32(64 * 1024) // Preconditions: // - getDirents may not be called concurrently with another getDirents call. -func (d *lisafsDentry) getDirentsLocked(ctx context.Context, recordDirent func(name string, key inoKey, dType uint8)) error { +func (i *lisafsInode) getDirentsLocked(ctx context.Context, recordDirent func(name string, key inoKey, dType uint8)) error { // shouldSeek0 indicates whether the server should SEEK to 0 before reading // directory entries. 
shouldSeek0 := true @@ -489,7 +514,7 @@ func (d *lisafsDentry) getDirentsLocked(ctx context.Context, recordDirent func(n count = -count shouldSeek0 = false } - dirents, err := d.readFDLisa.Getdents64(ctx, count) + dirents, err := i.readFDLisa.Getdents64(ctx, count) if err != nil { return err } @@ -517,9 +542,9 @@ func flush(ctx context.Context, fd lisafs.ClientFD) error { return nil } -func (d *lisafsDentry) statfs(ctx context.Context) (linux.Statfs, error) { +func (i *lisafsInode) statfs(ctx context.Context) (linux.Statfs, error) { var statFS lisafs.StatFS - if err := d.controlFD.StatFSTo(ctx, &statFS); err != nil { + if err := i.controlFD.StatFSTo(ctx, &statFS); err != nil { return linux.Statfs{}, err } return linux.Statfs{ @@ -534,8 +559,8 @@ func (d *lisafsDentry) statfs(ctx context.Context) (linux.Statfs, error) { }, nil } -func (d *lisafsDentry) restoreFile(ctx context.Context, inode *lisafs.Inode, opts *vfs.CompleteRestoreOptions) error { - d.controlFD = d.fs.client.NewFD(inode.ControlFD) +func (i *lisafsInode) restoreFile(ctx context.Context, inode *lisafs.Inode, opts *vfs.CompleteRestoreOptions, d *dentry) error { + i.controlFD = i.inode.fs.client.NewFD(inode.ControlFD) // Gofers do not preserve inoKey across checkpoint/restore, so: // @@ -543,40 +568,43 @@ func (d *lisafsDentry) restoreFile(ctx context.Context, inode *lisafs.Inode, opt // would invalidate dentries, since we can't revalidate dentries by // checking inoKey. // - // - We need to associate the new inoKey with the existing d.ino. - d.inoKey = inoKeyFromStatx(&inode.Stat) - d.fs.inoMu.Lock() - d.fs.inoByKey[d.inoKey] = d.ino - d.fs.inoMu.Unlock() + // - We need to associate the new inoKey with the existing i.ino. 
+ i.inode.inoKey = inoKeyFromStatx(&inode.Stat) + i.inode.fs.inoMu.Lock() + i.inode.fs.inoByKey[i.inode.inoKey] = i.inode.ino + i.inode.fs.inoMu.Unlock() + i.inode.fs.inodeByInoMu.Lock() + i.inode.fs.inodeByIno[i.inoKey] = &i.inode + i.inode.fs.inodeByInoMu.Unlock() // Check metadata stability before updating metadata. - d.metadataMu.Lock() - defer d.metadataMu.Unlock() - if d.isRegularFile() { + i.inode.metadataMu.Lock() + defer i.inode.metadataMu.Unlock() + if i.isRegularFile() { if opts.ValidateFileSizes { if inode.Stat.Mask&linux.STATX_SIZE == 0 { - return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: file size not available", genericDebugPathname(d.fs, &d.dentry))} + return vfs.ErrCorruption{Err: fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: file size not available", genericDebugPathname(i.inode.fs, d))} } - if d.size.RacyLoad() != inode.Stat.Size { - return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(d.fs, &d.dentry), d.size.Load(), inode.Stat.Size)} + if i.inode.size.RacyLoad() != inode.Stat.Size { + return vfs.ErrCorruption{Err: fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(i.inode.fs, d), i.inode.size.Load(), inode.Stat.Size)} } } if opts.ValidateFileModificationTimestamps { if inode.Stat.Mask&linux.STATX_MTIME == 0 { - return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime not available", genericDebugPathname(d.fs, &d.dentry))} + return vfs.ErrCorruption{Err: fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime not available", genericDebugPathname(i.inode.fs, d))} } - if want := dentryTimestamp(inode.Stat.Mtime); d.mtime.RacyLoad() != want { - return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to 
%+v", genericDebugPathname(d.fs, &d.dentry), linux.NsecToStatxTimestamp(d.mtime.RacyLoad()), linux.NsecToStatxTimestamp(want))} + if want := dentryTimestamp(inode.Stat.Mtime); i.inode.mtime.RacyLoad() != want { + return vfs.ErrCorruption{Err: fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(i.inode.fs, d), linux.NsecToStatxTimestamp(i.inode.mtime.RacyLoad()), linux.NsecToStatxTimestamp(want))} } } } - if !d.cachedMetadataAuthoritative() { - d.updateMetadataFromStatxLocked(&inode.Stat) + if !i.inode.cachedMetadataAuthoritative() { + i.updateMetadataFromStatxLocked(&inode.Stat) } - if rw, ok := d.fs.savedDentryRW[&d.dentry]; ok { + if rw, ok := i.inode.fs.savedDentryRW[d]; ok { if err := d.ensureSharedHandle(ctx, rw.read, rw.write, false /* trunc */); err != nil { - return fmt.Errorf("failed to restore file handles (read=%t, write=%t) for %q: %w", rw.read, rw.write, genericDebugPathname(d.fs, &d.dentry), err) + return fmt.Errorf("failed to restore file handles (read=%t, write=%t) for %q: %w", rw.read, rw.write, genericDebugPathname(i.inode.fs, d), err) } } @@ -590,7 +618,7 @@ func (d *lisafsDentry) restoreFile(ctx context.Context, inode *lisafs.Inode, opt // - fs.renameMu must be locked. // - InteropModeShared is in effect. func doRevalidationLisafs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, state *revalidateState, ds **[]*dentry) error { - start := state.start.impl.(*lisafsDentry) + start := state.start.inode.impl.(*lisafsInode) // Populate state.names. state.names = state.names[:0] // For sanity. @@ -603,11 +631,11 @@ func doRevalidationLisafs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, st // Lock metadata on all dentries *before* getting attributes for them. 
 	if state.refreshStart {
-		start.metadataMu.Lock()
-		defer start.metadataMu.Unlock()
+		start.inode.metadataMu.Lock()
+		defer start.inode.metadataMu.Unlock()
 	}
 	for _, d := range state.dentries {
-		d.metadataMu.Lock()
+		d.inode.metadataMu.Lock()
 	}
 	// lastUnlockedDentry keeps track of the dentries in state.dentries that have
 	// already had their metadataMu unlocked. Avoid defer unlock in the loop
@@ -617,7 +645,7 @@
 		// Advance to the first unevaluated dentry and unlock the remaining
 		// dentries.
 		for lastUnlockedDentry++; lastUnlockedDentry < len(state.dentries); lastUnlockedDentry++ {
-			state.dentries[lastUnlockedDentry].metadataMu.Unlock()
+			state.dentries[lastUnlockedDentry].inode.metadataMu.Unlock()
 		}
 	}()
 
@@ -630,7 +658,7 @@
 	if state.refreshStart {
 		if len(stats) > 0 {
 			// First dentry is where the search is starting, just update attributes
-			// since it cannot be replaced.
+			// since it cannot be replaced.
 			start.updateMetadataFromStatxLocked(&stats[0]) // +checklocksforce: see above.
 			stats = stats[1:]
 		}
@@ -640,13 +668,13 @@
 		d := state.dentries[i]
 		found := i < len(stats)
 		// Advance lastUnlockedDentry. It is the responsibility of this for loop
-		// block to unlock d.metadataMu.
+		// block to unlock d.inode.metadataMu.
 		lastUnlockedDentry = i
 
 		// Note that synthetic dentries will always fail this comparison check.
-		if !found || d.inoKey != inoKeyFromStatx(&stats[i]) {
-			d.metadataMu.Unlock()
-			if !found && d.isSynthetic() {
+		if !found || d.inode.inoKey != inoKeyFromStatx(&stats[i]) {
+			d.inode.metadataMu.Unlock()
+			if !found && d.inode.isSynthetic() {
 				// We have a synthetic file, and no remote file has arisen to replace
 				// it. 
return nil @@ -658,8 +686,8 @@ func doRevalidationLisafs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, st } // The file at this path hasn't changed. Just update cached metadata. - d.impl.(*lisafsDentry).updateMetadataFromStatxLocked(&stats[i]) // +checklocksforce: see above. - d.metadataMu.Unlock() + d.inode.impl.(*lisafsInode).updateMetadataFromStatxLocked(&stats[i]) // +checklocksforce: see above. + d.inode.metadataMu.Unlock() } return nil } diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 56d5136817..0da2bc794e 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -36,8 +36,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -func (d *dentry) isRegularFile() bool { - return d.fileType() == linux.S_IFREG +func (i *inode) isRegularFile() bool { + return i.fileType() == linux.S_IFREG } // +stateify savable @@ -51,16 +51,16 @@ type regularFileFD struct { func newRegularFileFD(mnt *vfs.Mount, d *dentry, flags uint32) (*regularFileFD, error) { fd := ®ularFileFD{} - fd.LockFD.Init(&d.locks) + fd.LockFD.Init(&d.inode.locks) if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ AllowDirectIO: true, }); err != nil { return nil, err } - if fd.vfsfd.IsWritable() && (d.mode.Load()&0111 != 0) { + if fd.vfsfd.IsWritable() && (d.inode.mode.Load()&0111 != 0) { metric.SuspiciousOperationsMetric.Increment(&metric.SuspiciousOperationsTypeOpenedWriteExecuteFile) } - if d.mmapFD.Load() >= 0 { + if d.inode.mmapFD.Load() >= 0 { fsmetric.GoferOpensHost.Increment() } else { fsmetric.GoferOpens9P.Increment() @@ -77,8 +77,8 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { if !fd.vfsfd.IsWritable() { return nil } - d := fd.dentry() - if d.fs.opts.interop == InteropModeExclusive { + i := fd.dentry().inode + if i.fs.opts.interop == InteropModeExclusive { // d may have dirty pages that we won't write back now (and wouldn't // have in VFS1), making a 
flushf RPC ineffective. If this is the case, // skip the flushf. @@ -87,21 +87,21 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { // modes if forcePageCache is in effect; we conservatively assume that // applications have some way of tolerating this and still want the // flushf. - d.dataMu.RLock() - haveDirtyPages := !d.dirty.IsEmpty() - d.dataMu.RUnlock() + i.dataMu.RLock() + haveDirtyPages := !i.dirty.IsEmpty() + i.dataMu.RUnlock() if haveDirtyPages { return nil } } - return d.flush(ctx) + return i.flush(ctx) } // Allocate implements vfs.FileDescriptionImpl.Allocate. func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { d := fd.dentry() return d.doAllocate(ctx, offset, length, func() error { - return d.allocate(ctx, mode, offset, length) + return d.inode.allocate(ctx, mode, offset, length) }) } @@ -110,7 +110,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs start := fsmetric.StartReadWait() d := fd.dentry() defer func() { - if d.readFD.Load() >= 0 { + if d.inode.readFD.Load() >= 0 { fsmetric.GoferReadsHost.Increment() fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start) } else { @@ -131,8 +131,8 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs } // Check for reading at EOF before calling into MM (but not under - // InteropModeShared, which makes d.size unreliable). - if d.cachedMetadataAuthoritative() && uint64(offset) >= d.size.Load() { + // InteropModeShared, which makes d.inode.size unreliable). 
+ if d.inode.cachedMetadataAuthoritative() && uint64(offset) >= d.inode.size.Load() { return 0, io.EOF } @@ -151,7 +151,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs rw.direct = true n, readErr = dst.CopyOutFrom(ctx, rw) putDentryReadWriter(rw) - if d.fs.opts.interop != InteropModeShared { + if d.inode.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). d.touchAtimeLocked(fd.vfsfd.Mount()) } @@ -159,7 +159,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs rw := getDentryReadWriter(ctx, d, offset) n, readErr = dst.CopyOutFrom(ctx, rw) putDentryReadWriter(rw) - if d.fs.opts.interop != InteropModeShared { + if d.inode.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). d.touchAtime(fd.vfsfd.Mount()) } @@ -198,13 +198,13 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off d := fd.dentry() - d.metadataMu.Lock() - defer d.metadataMu.Unlock() + d.inode.metadataMu.Lock() + defer d.inode.metadataMu.Unlock() // If the fd was opened with O_APPEND, make sure the file size is updated. // There is a possible race here if size is modified externally after // metadata cache is updated. - if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() { + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.inode.cachedMetadataAuthoritative() { if err := d.refreshSizeLocked(ctx); err != nil { return 0, offset, err } @@ -212,8 +212,8 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off // Set offset to file size if the fd was opened with O_APPEND. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { - // Holding d.metadataMu is sufficient for reading d.size. - offset = int64(d.size.RacyLoad()) + // Holding d.inode.metadataMu is sufficient for reading d.inode.size. 
+ offset = int64(d.inode.size.RacyLoad()) } limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { @@ -221,10 +221,10 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off } src = src.TakeFirst64(limit) - if d.fs.opts.interop != InteropModeShared { + if d.inode.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:__generic_file_write_iter() => // file_update_time(). This is d.touchCMtime(), but without locking - // d.metadataMu (recursively). + // d.inode.metadataMu (recursively). d.touchCMtimeLocked() } @@ -264,14 +264,14 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off // As with Linux, writing clears the setuid and setgid bits. if n > 0 { - oldMode := d.mode.Load() - // If setuid or setgid were set, update d.mode and propagate + oldMode := d.inode.mode.Load() + // If setuid or setgid were set, update d.inode.mode and propagate // changes to the host. if newMode := vfs.ClearSUIDAndSGID(oldMode); newMode != oldMode { if err := d.chmod(ctx, uint16(newMode)); err != nil { return 0, offset, err } - d.mode.Store(newMode) + d.inode.mode.Store(newMode) } } @@ -294,19 +294,19 @@ func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64 mr := memmap.MappableRange{pgstart, pgend} var freed []memmap.FileRange - d.dataMu.Lock() - d.cache.RemoveRangeWith(mr, func(cseg fsutil.FileRangeIterator) { + d.inode.dataMu.Lock() + d.inode.cache.RemoveRangeWith(mr, func(cseg fsutil.FileRangeIterator) { freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) }) - d.dataMu.Unlock() + d.inode.dataMu.Unlock() // Invalidate mappings of removed pages. - d.mapsMu.Lock() - d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) - d.mapsMu.Unlock() + d.inode.mapsMu.Lock() + d.inode.mappings.Invalidate(mr, memmap.InvalidateOpts{}) + d.inode.mapsMu.Unlock() // Finally free pages removed from the cache. 
- mf := d.fs.mf + mf := d.inode.fs.mf for _, freedFR := range freed { mf.DecRef(freedFR) } @@ -361,10 +361,10 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) // (which prevents us from caching file contents and makes dentry.size // unreliable), or if the file was opened O_DIRECT, read directly from // readHandle() without locking dentry.dataMu. - rw.d.handleMu.RLock() - defer rw.d.handleMu.RUnlock() - h := rw.d.readHandle() - if (rw.d.mmapFD.RacyLoad() >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { + rw.d.inode.handleMu.RLock() + defer rw.d.inode.handleMu.RUnlock() + h := rw.d.inode.readHandle() + if (rw.d.inode.mmapFD.RacyLoad() >= 0 && !rw.d.inode.fs.opts.forcePageCache) || rw.d.inode.fs.opts.interop == InteropModeShared || rw.direct { n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off) rw.off += n return n, err @@ -372,20 +372,20 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) // Otherwise read from/through the cache. memCgID := pgalloc.MemoryCgroupIDFromContext(rw.ctx) - mf := rw.d.fs.mf + mf := rw.d.inode.fs.mf fillCache := mf.ShouldCacheEvictable() var dataMuUnlock func() if fillCache { - rw.d.dataMu.Lock() - dataMuUnlock = rw.d.dataMu.Unlock + rw.d.inode.dataMu.Lock() + dataMuUnlock = rw.d.inode.dataMu.Unlock } else { - rw.d.dataMu.RLock() - dataMuUnlock = rw.d.dataMu.RUnlock + rw.d.inode.dataMu.RLock() + dataMuUnlock = rw.d.inode.dataMu.RUnlock } defer dataMuUnlock() // Compute the range to read (limited by file size and overflow-checked). 
- end := rw.d.size.Load() + end := rw.d.inode.size.Load() if rw.off >= end { return 0, io.EOF } @@ -394,7 +394,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) } var done uint64 - seg, gap := rw.d.cache.Find(rw.off) + seg, gap := rw.d.inode.cache.Find(rw.off) for rw.off < end { mr := memmap.MappableRange{rw.off, end} switch { @@ -428,13 +428,13 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) End: gapEnd, } optMR := gap.Range() - _, err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.size.Load(), mf, pgalloc.AllocOpts{ + _, err := rw.d.inode.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.inode.size.Load(), mf, pgalloc.AllocOpts{ Kind: usage.PageCache, MemCgID: memCgID, Mode: pgalloc.AllocateAndWritePopulate, }, h.readToBlocksAt) - mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End}) - seg, gap = rw.d.cache.Find(rw.off) + mf.MarkEvictable(rw.d.inode, pgalloc.EvictableRange{Start: optMR.Start, End: optMR.End}) + seg, gap = rw.d.inode.cache.Find(rw.off) if !seg.Ok() { return done, err } @@ -464,7 +464,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. // -// Preconditions: rw.d.metadataMu must be locked. +// Preconditions: rw.d.inode.metadataMu must be locked. func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { if srcs.IsEmpty() { return 0, nil @@ -475,16 +475,16 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro // (which prevents us from caching file contents), or if the file was // opened with O_DIRECT, write directly to dentry.writeHandle() // without locking dentry.dataMu. 
- rw.d.handleMu.RLock() - defer rw.d.handleMu.RUnlock() - h := rw.d.writeHandle() - if (rw.d.mmapFD.RacyLoad() >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { + rw.d.inode.handleMu.RLock() + defer rw.d.inode.handleMu.RUnlock() + h := rw.d.inode.writeHandle() + if (rw.d.inode.mmapFD.RacyLoad() >= 0 && !rw.d.inode.fs.opts.forcePageCache) || rw.d.inode.fs.opts.interop == InteropModeShared || rw.direct { n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off) rw.off += n - rw.d.dataMu.Lock() - defer rw.d.dataMu.Unlock() - if rw.off > rw.d.size.Load() { - rw.d.size.Store(rw.off) + rw.d.inode.dataMu.Lock() + defer rw.d.inode.dataMu.Unlock() + if rw.off > rw.d.inode.size.Load() { + rw.d.inode.size.Store(rw.off) // The remote file's size will implicitly be extended to the correct // value when we write back to it. } @@ -492,9 +492,9 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro } // Otherwise write to/through the cache. - mf := rw.d.fs.mf - rw.d.dataMu.Lock() - defer rw.d.dataMu.Unlock() + mf := rw.d.inode.fs.mf + rw.d.inode.dataMu.Lock() + defer rw.d.inode.dataMu.Unlock() // Compute the range to write (overflow-checked). 
start := rw.off @@ -507,7 +507,7 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro done uint64 retErr error ) - seg, gap := rw.d.cache.Find(rw.off) + seg, gap := rw.d.inode.cache.Find(rw.off) for rw.off < end { mr := memmap.MappableRange{rw.off, end} switch { @@ -525,7 +525,7 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro done += n rw.off += n srcs = srcs.DropFirst64(n) - rw.d.dirty.MarkDirty(segMR) + rw.d.inode.dirty.MarkDirty(segMR) if err != nil { retErr = err goto exitLoop @@ -556,18 +556,18 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro } } exitLoop: - if rw.off > rw.d.size.Load() { - rw.d.size.Store(rw.off) + if rw.off > rw.d.inode.size.Load() { + rw.d.inode.size.Store(rw.off) // The remote file's size will implicitly be extended to the correct // value when we write back to it. } // If InteropModeWritethrough is in effect, flush written data back to the // remote filesystem. - if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 { + if rw.d.inode.fs.opts.interop == InteropModeWritethrough && done != 0 { if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{ Start: start, End: rw.off, - }, &rw.d.cache, &rw.d.dirty, rw.d.size.Load(), mf, h.writeFromBlocksAt); err != nil { + }, &rw.d.inode.cache, &rw.d.inode.dirty, rw.d.inode.size.Load(), mf, h.writeFromBlocksAt); err != nil { // We have no idea how many bytes were actually flushed. rw.off = start done = 0 @@ -581,13 +581,13 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error { if size == 0 { return nil } - d.handleMu.RLock() - defer d.handleMu.RUnlock() - h := d.writeHandle() - d.dataMu.Lock() - defer d.dataMu.Unlock() + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() + h := d.inode.writeHandle() + d.inode.dataMu.Lock() + defer d.inode.dataMu.Unlock() // Compute the range of valid bytes (overflow-checked). 
- dentrySize := d.size.Load() + dentrySize := d.inode.size.Load() if uint64(offset) >= dentrySize { return nil } @@ -598,7 +598,7 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error { return fsutil.SyncDirty(ctx, memmap.MappableRange{ Start: uint64(offset), End: uint64(end), - }, &d.cache, &d.dirty, dentrySize, d.fs.mf, h.writeFromBlocksAt) + }, &d.inode.cache, &d.inode.dirty, dentrySize, d.inode.fs.mf, h.writeFromBlocksAt) } // Seek implements vfs.FileDescriptionImpl.Seek. @@ -622,12 +622,12 @@ func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int6 offset += fdOffset case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE: // Ensure file size is up to date. - if !d.cachedMetadataAuthoritative() { - if err := d.updateMetadata(ctx); err != nil { + if !d.inode.cachedMetadataAuthoritative() { + if err := d.inode.updateMetadata(ctx); err != nil { return 0, err } } - size := int64(d.size.Load()) + size := int64(d.inode.size.Load()) // For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous // block of data. switch whence { @@ -662,8 +662,8 @@ func (fd *regularFileFD) Sync(ctx context.Context) error { func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { d := fd.dentry() // Force sentry page caching at your own risk. - if !d.fs.opts.forcePageCache { - switch d.fs.opts.interop { + if !d.inode.fs.opts.forcePageCache { + switch d.inode.fs.opts.interop { case InteropModeExclusive: // Any mapping is fine. case InteropModeWritethrough: @@ -677,16 +677,16 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt case InteropModeShared: // All mappings require a host FD to be coherent with other // filesystem users. 
- if d.mmapFD.Load() < 0 { + if d.inode.mmapFD.Load() < 0 { return linuxerr.ENODEV } default: - panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop)) + panic(fmt.Sprintf("unknown InteropMode %v", d.inode.fs.opts.interop)) } } // After this point, d may be used as a memmap.Mappable. - d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init) - opts.SentryOwnedContent = d.fs.opts.forcePageCache + d.inode.pf.hostFileMapperInitOnce.Do(d.inode.pf.hostFileMapper.Init) + opts.SentryOwnedContent = d.inode.fs.opts.forcePageCache return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts) } @@ -696,47 +696,47 @@ func (fs *filesystem) mayCachePagesInMemoryFile() bool { // AddMapping implements memmap.Mappable.AddMapping. func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { - d.mapsMu.Lock() - mapped := d.mappings.AddMapping(ms, ar, offset, writable) + d.inode.mapsMu.Lock() + mapped := d.inode.mappings.AddMapping(ms, ar, offset, writable) // Do this unconditionally since whether we have a host FD can change // across save/restore. for _, r := range mapped { - d.pf.hostFileMapper.IncRefOn(r) + d.inode.pf.hostFileMapper.IncRefOn(r) } - if d.fs.mayCachePagesInMemoryFile() { + if d.inode.fs.mayCachePagesInMemoryFile() { // d.Evict() will refuse to evict memory-mapped pages, so tell the // MemoryFile to not bother trying. - mf := d.fs.mf + mf := d.inode.fs.mf for _, r := range mapped { - mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End}) + mf.MarkUnevictable(d.inode, pgalloc.EvictableRange{Start: r.Start, End: r.End}) } } - d.mapsMu.Unlock() + d.inode.mapsMu.Unlock() return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. 
func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { - d.mapsMu.Lock() - unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable) + d.inode.mapsMu.Lock() + unmapped := d.inode.mappings.RemoveMapping(ms, ar, offset, writable) for _, r := range unmapped { - d.pf.hostFileMapper.DecRefOn(r) + d.inode.pf.hostFileMapper.DecRefOn(r) } - if d.fs.mayCachePagesInMemoryFile() { + if d.inode.fs.mayCachePagesInMemoryFile() { // Pages that are no longer referenced by any application memory // mappings are now considered unused; allow MemoryFile to evict them // when necessary. - mf := d.fs.mf - d.dataMu.Lock() + mf := d.inode.fs.mf + d.inode.dataMu.Lock() for _, r := range unmapped { // Since these pages are no longer mapped, they are no longer // concurrently dirtyable by a writable memory mapping. - d.dirty.AllowClean(r) - mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End}) + d.inode.dirty.AllowClean(r) + mf.MarkEvictable(d.inode, pgalloc.EvictableRange{Start: r.Start, End: r.End}) } - d.dataMu.Unlock() + d.inode.dataMu.Unlock() } - d.mapsMu.Unlock() + d.inode.mapsMu.Unlock() } // CopyMapping implements memmap.Mappable.CopyMapping. @@ -746,17 +746,17 @@ func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, // Translate implements memmap.Mappable.Translate. 
func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { - d.handleMu.RLock() - defer d.handleMu.RUnlock() - if d.mmapFD.RacyLoad() >= 0 && !d.fs.opts.forcePageCache { + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() + if d.inode.mmapFD.RacyLoad() >= 0 && !d.inode.fs.opts.forcePageCache { mr := optional - if d.fs.opts.limitHostFDTranslation { + if d.inode.fs.opts.limitHostFDTranslation { mr = maxFillRange(required, optional) } return []memmap.Translation{ { Source: mr, - File: &d.pf, + File: &d.inode.pf, Offset: mr.Start, Perms: hostarch.AnyAccess, }, @@ -764,12 +764,12 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab } memCgID := pgalloc.MemoryCgroupIDFromContext(ctx) - d.dataMu.Lock() - defer d.dataMu.Unlock() + d.inode.dataMu.Lock() + defer d.inode.dataMu.Unlock() - // Constrain translations to d.size (rounded up) to prevent translation to + // Constrain translations to d.inode.size (rounded up) to prevent translation to // pages that may be concurrently truncated. 
- pgend, _ := hostarch.PageRoundUp(d.size.Load()) + pgend, _ := hostarch.PageRoundUp(d.inode.size.Load()) var beyondEOF bool if required.End > pgend { if required.Start >= pgend { @@ -782,9 +782,9 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab optional.End = pgend } - mf := d.fs.mf - h := d.readHandle() - _, cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), d.size.Load(), mf, pgalloc.AllocOpts{ + mf := d.inode.fs.mf + h := d.inode.readHandle() + _, cerr := d.inode.cache.Fill(ctx, required, maxFillRange(required, optional), d.inode.size.Load(), mf, pgalloc.AllocOpts{ Kind: usage.PageCache, MemCgID: memCgID, Mode: pgalloc.AllocateAndWritePopulate, @@ -792,7 +792,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab var ts []memmap.Translation var translatedEnd uint64 - for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { + for seg := d.inode.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { segMR := seg.Range().Intersect(optional) // TODO(jamieliu): Make Translations writable even if writability is // not required if already kept-dirty by another writable translation. @@ -800,7 +800,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab if at.Write { // From this point forward, this memory can be dirtied through the // mapping at any time. - d.dirty.KeepDirty(segMR) + d.inode.dirty.KeepDirty(segMR) perms.Write = true } ts = append(ts, memmap.Translation{ @@ -844,70 +844,70 @@ func (d *dentry) InvalidateUnsavable(ctx context.Context) error { // Whether we have a host fd (and consequently what memmap.File is // mapped) can change across save/restore, so invalidate all translations // unconditionally. 
- d.mapsMu.Lock() - defer d.mapsMu.Unlock() - d.mappings.InvalidateAll(memmap.InvalidateOpts{}) + d.inode.mapsMu.Lock() + defer d.inode.mapsMu.Unlock() + d.inode.mappings.InvalidateAll(memmap.InvalidateOpts{}) // Write the cache's contents back to the remote file so that if we have a // host fd after restore, the remote file's contents are coherent. - mf := d.fs.mf - d.handleMu.RLock() - defer d.handleMu.RUnlock() - h := d.writeHandle() - d.dataMu.Lock() - defer d.dataMu.Unlock() - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil { + mf := d.inode.fs.mf + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() + h := d.inode.writeHandle() + d.inode.dataMu.Lock() + defer d.inode.dataMu.Unlock() + if err := fsutil.SyncDirtyAll(ctx, &d.inode.cache, &d.inode.dirty, d.inode.size.Load(), mf, h.writeFromBlocksAt); err != nil { return err } // Discard the cache so that it's not stored in saved state. This is safe // because per InvalidateUnsavable invariants, no new translations can have // been returned after we invalidated all existing translations above. - d.cache.DropAll(mf) - d.dirty.RemoveAll() + d.inode.cache.DropAll(mf) + d.inode.dirty.RemoveAll() return nil } // Evict implements pgalloc.EvictableMemoryUser.Evict. -func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { +func (i *inode) Evict(ctx context.Context, er pgalloc.EvictableRange) { mr := memmap.MappableRange{er.Start, er.End} - mf := d.fs.mf - d.mapsMu.Lock() - defer d.mapsMu.Unlock() - d.handleMu.RLock() - defer d.handleMu.RUnlock() - h := d.writeHandle() - d.dataMu.Lock() - defer d.dataMu.Unlock() + mf := i.fs.mf + i.mapsMu.Lock() + defer i.mapsMu.Unlock() + i.handleMu.RLock() + defer i.handleMu.RUnlock() + h := i.writeHandle() + i.dataMu.Lock() + defer i.dataMu.Unlock() // Only allow pages that are no longer memory-mapped to be evicted. 
- for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { + for mgap := i.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { mgapMR := mgap.Range().Intersect(mr) if mgapMR.Length() == 0 { continue } - if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil { + if err := fsutil.SyncDirty(ctx, mgapMR, &i.cache, &i.dirty, i.size.Load(), mf, h.writeFromBlocksAt); err != nil { log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) } - d.cache.Drop(mgapMR, mf) - d.dirty.KeepClean(mgapMR) + i.cache.Drop(mgapMR, mf) + i.dirty.KeepClean(mgapMR) } } -// dentryPlatformFile implements memmap.File. It exists solely because dentry +// inodePlatformFile implements memmap.File. It exists solely because dentry // cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef. // -// dentryPlatformFile is only used when a host FD representing the remote file +// inodePlatformFile is only used when a host FD representing the remote file // is available (i.e. dentry.mmapFD >= 0), and that FD is used for application // memory mappings (i.e. !filesystem.opts.forcePageCache). // // +stateify savable -type dentryPlatformFile struct { +type inodePlatformFile struct { memmap.DefaultMemoryType memmap.NoBufferedIOFallback - *dentry + *inode // fdRefs counts references on memmap.File offsets. fdRefs is protected // by dentry.dataMu. @@ -922,34 +922,34 @@ type dentryPlatformFile struct { } // IncRef implements memmap.File.IncRef. -func (d *dentryPlatformFile) IncRef(fr memmap.FileRange, memCgID uint32) { - d.dataMu.Lock() - d.fdRefs.IncRefAndAccount(fr, memCgID) - d.dataMu.Unlock() +func (i *inodePlatformFile) IncRef(fr memmap.FileRange, memCgID uint32) { + i.dataMu.Lock() + i.fdRefs.IncRefAndAccount(fr, memCgID) + i.dataMu.Unlock() } // DecRef implements memmap.File.DecRef. 
-func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) { - d.dataMu.Lock() - d.fdRefs.DecRefAndAccount(fr) - d.dataMu.Unlock() +func (i *inodePlatformFile) DecRef(fr memmap.FileRange) { + i.dataMu.Lock() + i.fdRefs.DecRefAndAccount(fr) + i.dataMu.Unlock() } // MapInternal implements memmap.File.MapInternal. -func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { - d.handleMu.RLock() - defer d.handleMu.RUnlock() - return d.hostFileMapper.MapInternal(fr, int(d.mmapFD.RacyLoad()), at.Write) +func (i *inodePlatformFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { + i.handleMu.RLock() + defer i.handleMu.RUnlock() + return i.hostFileMapper.MapInternal(fr, int(i.mmapFD.RacyLoad()), at.Write) } // DataFD implements memmap.File.DataFD. -func (d *dentryPlatformFile) DataFD(fr memmap.FileRange) (int, error) { - return d.FD(), nil +func (i *inodePlatformFile) DataFD(fr memmap.FileRange) (int, error) { + return i.FD(), nil } // FD implements memmap.File.FD. -func (d *dentryPlatformFile) FD() int { - d.handleMu.RLock() - defer d.handleMu.RUnlock() - return int(d.mmapFD.RacyLoad()) +func (i *inodePlatformFile) FD() int { + i.handleMu.RLock() + defer i.handleMu.RUnlock() + return int(i.mmapFD.RacyLoad()) } diff --git a/pkg/sentry/fsimpl/gofer/revalidate.go b/pkg/sentry/fsimpl/gofer/revalidate.go index 9231d9c0fd..202ebe6aba 100644 --- a/pkg/sentry/fsimpl/gofer/revalidate.go +++ b/pkg/sentry/fsimpl/gofer/revalidate.go @@ -67,7 +67,7 @@ func (fs *filesystem) revalidateOne(ctx context.Context, vfsObj *vfs.VirtualFile // Skip revalidation for interop mode different than InteropModeShared or // if the parent is synthetic (child must be synthetic too, but it cannot be // replaced without first replacing the parent). 
- if parent.cachedMetadataAuthoritative() { + if parent.inode.cachedMetadataAuthoritative() { return nil } @@ -184,7 +184,7 @@ func (fs *filesystem) revalidateStep(ctx context.Context, rp resolvingPath, d *d state.add(child) // Symlink must be resolved before continuing with revalidation. - if child.isSymlink() { + if child.inode.isSymlink() { return nil, errRevalidationStepDone{} } @@ -211,7 +211,7 @@ func (d *dentry) invalidate(ctx context.Context, vfsObj *vfs.VirtualFilesystem, if child := parent.children[d.name]; child == d { // Invalidate dentry so it gets reloaded next time it's accessed. delete(parent.children, d.name) - if d.isSynthetic() { + if d.inode.isSynthetic() { // Normally we don't mark invalidated dentries as deleted since // they may still exist (but at a different path), and also for // consistency with Linux. However, synthetic files are @@ -247,7 +247,7 @@ func (d *dentry) invalidate(ctx context.Context, vfsObj *vfs.VirtualFilesystem, // now. (The same would apply to racy replacement by // filesystem.RenameAt(), but we can't race with rename since renameMu // has been locked since entering filesystem.revalidatePath().) - if removed && (d.isSynthetic() || d.endpoint != nil) { + if removed && (d.inode.isSynthetic() || d.inode.endpoint != nil) { d.decRefNoCaching() } @@ -282,7 +282,7 @@ func (d *dentry) disownAllChildrenForInvalidation(ctx context.Context, vfsObj *v for name, child := range d.children { children = append(children, child) delete(d.children, name) - if child.isSynthetic() { + if child.inode.isSynthetic() { child.deleteSynthetic(d, ds) } } diff --git a/pkg/sentry/fsimpl/gofer/save_restore.go b/pkg/sentry/fsimpl/gofer/save_restore.go index a21345a043..d8f444ac5d 100644 --- a/pkg/sentry/fsimpl/gofer/save_restore.go +++ b/pkg/sentry/fsimpl/gofer/save_restore.go @@ -59,7 +59,7 @@ func (fs *filesystem) PrepareSave(ctx context.Context) error { // is a legacy VFS1 feature.) 
fs.syncMu.Lock() for sffd := fs.specialFileFDs.Front(); sffd != nil; sffd = sffd.Next() { - if sffd.dentry().fileType() == linux.S_IFIFO && sffd.vfsfd.IsReadable() { + if sffd.dentry().inode.fileType() == linux.S_IFIFO && sffd.vfsfd.IsReadable() { if err := sffd.savePipeData(ctx); err != nil { fs.syncMu.Unlock() return err @@ -112,17 +112,17 @@ func (fd *specialFileFD) savePipeData(ctx context.Context) error { } func (d *dentry) prepareSaveDead(ctx context.Context) error { - if !d.isRegularFile() && !d.isDir() { - return fmt.Errorf("gofer.dentry(%q).prepareSaveDead: only deleted dentries for regular files and directories can be saved, got %s", genericDebugPathname(d.fs, d), linux.FileMode(d.mode.Load())) + if !d.inode.isRegularFile() && !d.isDir() { + return fmt.Errorf("gofer.dentry(%q).prepareSaveDead: only deleted dentries for regular files and directories can be saved, got %s", genericDebugPathname(d.inode.fs, d), linux.FileMode(d.inode.mode.Load())) } if !d.isDeleted() { - return fmt.Errorf("gofer.dentry(%q).prepareSaveDead: invalidated dentries can't be saved", genericDebugPathname(d.fs, d)) + return fmt.Errorf("gofer.dentry(%q).prepareSaveDead: invalidated dentries can't be saved", genericDebugPathname(d.inode.fs, d)) } - if d.isRegularFile() { - if !d.cachedMetadataAuthoritative() { + if d.inode.isRegularFile() { + if !d.inode.cachedMetadataAuthoritative() { // Get updated metadata for d in case we need to perform metadata // validation during restore. 
- if err := d.updateMetadata(ctx); err != nil { + if err := d.inode.updateMetadata(ctx); err != nil { return err } } @@ -130,70 +130,70 @@ func (d *dentry) prepareSaveDead(ctx context.Context) error { return err } } - if d.isReadHandleOk() || d.isWriteHandleOk() { - d.fs.savedDentryRW[d] = savedDentryRW{ - read: d.isReadHandleOk(), - write: d.isWriteHandleOk(), + if d.inode.isReadHandleOk() || d.inode.isWriteHandleOk() { + d.inode.fs.savedDentryRW[d] = savedDentryRW{ + read: d.inode.isReadHandleOk(), + write: d.inode.isWriteHandleOk(), } } - if d.fs.savedDeletedOpenDentries == nil { - d.fs.savedDeletedOpenDentries = make(map[*dentry]struct{}) + if d.inode.fs.savedDeletedOpenDentries == nil { + d.inode.fs.savedDeletedOpenDentries = make(map[*dentry]struct{}) } - d.fs.savedDeletedOpenDentries[d] = struct{}{} + d.inode.fs.savedDeletedOpenDentries[d] = struct{}{} return nil } // Preconditions: -// - d.isRegularFile() +// - d.inode.isRegularFile() // - d.isDeleted() func (d *dentry) prepareSaveDeletedRegularFile(ctx context.Context) error { // Fetch an appropriate handle to read the deleted file. - d.handleMu.RLock() - defer d.handleMu.RUnlock() + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() var h handle - if d.isReadHandleOk() { - h = d.readHandle() + if d.inode.isReadHandleOk() { + h = d.inode.readHandle() } else { var err error h, err = d.openHandle(ctx, true /* read */, false /* write */, false /* trunc */) if err != nil { - return fmt.Errorf("failed to open read handle for deleted file %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to open read handle for deleted file %q: %w", genericDebugPathname(d.inode.fs, d), err) } defer h.close(ctx) } - // Read the file data and store it in d.savedDeletedData. - d.dataMu.RLock() - defer d.dataMu.RUnlock() - d.savedDeletedData = make([]byte, d.size.Load()) + // Read the file data and store it in d.inode.savedDeletedData. 
+ d.inode.dataMu.RLock() + defer d.inode.dataMu.RUnlock() + d.inode.savedDeletedData = make([]byte, d.inode.size.Load()) done := uint64(0) - for done < uint64(len(d.savedDeletedData)) { - n, err := h.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(d.savedDeletedData[done:])), done) + for done < uint64(len(d.inode.savedDeletedData)) { + n, err := h.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(d.inode.savedDeletedData[done:])), done) done += n if err != nil { if err == io.EOF { break } - return fmt.Errorf("failed to read deleted file %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to read deleted file %q: %w", genericDebugPathname(d.inode.fs, d), err) } } - if done < uint64(len(d.savedDeletedData)) { - return fmt.Errorf("failed to read all of deleted file %q: read %d bytes, expected %d", genericDebugPathname(d.fs, d), done, len(d.savedDeletedData)) + if done < uint64(len(d.inode.savedDeletedData)) { + return fmt.Errorf("failed to read all of deleted file %q: read %d bytes, expected %d", genericDebugPathname(d.inode.fs, d), done, len(d.inode.savedDeletedData)) } return nil } func (d *dentry) prepareSaveRecursive(ctx context.Context) error { - if d.isRegularFile() && !d.cachedMetadataAuthoritative() { + if d.inode.isRegularFile() && !d.inode.cachedMetadataAuthoritative() { // Get updated metadata for d in case we need to perform metadata // validation during restore. 
- if err := d.updateMetadata(ctx); err != nil { + if err := d.inode.updateMetadata(ctx); err != nil { return err } } - if d.isReadHandleOk() || d.isWriteHandleOk() { - d.fs.savedDentryRW[d] = savedDentryRW{ - read: d.isReadHandleOk(), - write: d.isWriteHandleOk(), + if d.inode.isReadHandleOk() || d.inode.isWriteHandleOk() { + d.inode.fs.savedDentryRW[d] = savedDentryRW{ + read: d.inode.isReadHandleOk(), + write: d.inode.isWriteHandleOk(), } } d.childrenMu.Lock() @@ -216,8 +216,8 @@ func (d *dentry) prepareSaveRecursive(ctx context.Context) error { // beforeSave is invoked by stateify. func (d *dentry) beforeSave() { if d.vfsd.IsDead() { - if _, ok := d.fs.savedDeletedOpenDentries[d]; !ok { - panic(fmt.Sprintf("gofer.dentry(%q).beforeSave: dead dentry is not saved in fs.savedDeletedOpenDentries (deleted=%t, synthetic=%t)", genericDebugPathname(d.fs, d), d.isDeleted(), d.isSynthetic())) + if _, ok := d.inode.fs.savedDeletedOpenDentries[d]; !ok { + panic(fmt.Sprintf("gofer.dentry(%q).beforeSave: dead dentry is not saved in fs.savedDeletedOpenDentries (deleted=%t, synthetic=%t)", genericDebugPathname(d.inode.fs, d), d.isDeleted(), d.inode.isSynthetic())) } } } @@ -225,7 +225,7 @@ func (d *dentry) beforeSave() { // BeforeResume implements vfs.FilesystemImplSaveRestoreExtension.BeforeResume. func (fs *filesystem) BeforeResume(ctx context.Context) { for d := range fs.savedDeletedOpenDentries { - d.savedDeletedData = nil + d.inode.savedDeletedData = nil } fs.savedDeletedOpenDentries = nil fs.savedDentryRW = nil @@ -236,26 +236,29 @@ func (fs *filesystem) afterLoad(ctx goContext.Context) { fs.mf = pgalloc.MemoryFileFromContext(ctx) } +func (i *inode) afterLoad(goContext.Context) { + i.readFD = atomicbitops.FromInt32(-1) + i.writeFD = atomicbitops.FromInt32(-1) + i.mmapFD = atomicbitops.FromInt32(-1) +} + // afterLoad is invoked by stateify. 
func (d *dentry) afterLoad(goContext.Context) { - d.readFD = atomicbitops.FromInt32(-1) - d.writeFD = atomicbitops.FromInt32(-1) - d.mmapFD = atomicbitops.FromInt32(-1) if d.refs.Load() != -1 { refs.Register(d) } } // afterLoad is invoked by stateify. -func (d *directfsDentry) afterLoad(goContext.Context) { - d.controlFD = -1 +func (i *directfsInode) afterLoad(goContext.Context) { + i.controlFD = -1 } // afterLoad is invoked by stateify. -func (d *dentryPlatformFile) afterLoad(goContext.Context) { - if d.hostFileMapper.IsInited() { +func (i *inodePlatformFile) afterLoad(goContext.Context) { + if i.hostFileMapper.IsInited() { // Ensure that we don't call d.hostFileMapper.Init() again. - d.hostFileMapperInitOnce.Do(func() {}) + i.hostFileMapperInitOnce.Do(func() {}) } } @@ -291,6 +294,7 @@ func (fs *filesystem) CompleteRestore(ctx context.Context, opts vfs.CompleteRest } fs.opts.fd = fd fs.inoByKey = make(map[inoKey]uint64) + fs.inodeByIno = make(map[inoKey]*inode) if err := fs.restoreRoot(ctx, &opts); err != nil { return vfs.PrependErrMsg("failed to restore root", err) @@ -308,7 +312,7 @@ func (fs *filesystem) CompleteRestore(ctx context.Context, opts vfs.CompleteRest // This is consistent with VFS1. 
haveWriteOnlyPipes := false for fd := fs.specialFileFDs.Front(); fd != nil; fd = fd.Next() { - if fd.dentry().fileType() == linux.S_IFIFO && !fd.vfsfd.IsReadable() { + if fd.dentry().inode.fileType() == linux.S_IFIFO && !fd.vfsfd.IsReadable() { haveWriteOnlyPipes = true continue } @@ -318,7 +322,7 @@ func (fs *filesystem) CompleteRestore(ctx context.Context, opts vfs.CompleteRest } if haveWriteOnlyPipes { for fd := fs.specialFileFDs.Front(); fd != nil; fd = fd.Next() { - if fd.dentry().fileType() == linux.S_IFIFO && !fd.vfsfd.IsReadable() { + if fd.dentry().inode.fileType() == linux.S_IFIFO && !fd.vfsfd.IsReadable() { if err := fd.completeRestore(ctx); err != nil { return err } @@ -344,7 +348,7 @@ func (fs *filesystem) CompleteRestore(ctx context.Context, opts vfs.CompleteRest delete(leafDirectories, d.parent.Load()) } for leafD := range leafDirectories { - if err := leafD.parent.Load().unlink(ctx, leafD.name, linux.AT_REMOVEDIR); err != nil { + if err := leafD.parent.Load().inode.unlink(ctx, leafD.name, linux.AT_REMOVEDIR); err != nil { return fmt.Errorf("failed to clean up recreated deleted directory %q: %v", genericDebugPathname(fs, leafD), err) } delete(dirsToDelete, leafD) @@ -366,7 +370,7 @@ func (d *dentry) restoreDescendantsRecursive(ctx context.Context, opts *vfs.Comp if child == nil { continue } - if child.isSynthetic() { + if child.inode.isSynthetic() { continue } if err := child.restoreFile(ctx, opts); err != nil { @@ -382,44 +386,44 @@ func (d *dentry) restoreDescendantsRecursive(ctx context.Context, opts *vfs.Comp // restoreDeleted restores a deleted dentry for a directory or regular file. 
// // Preconditions: -// - d.isRegularFile() || d.isDir() -// - d.savedDeletedData != nil iff d.isRegularFile() +// - d.inode.isRegularFile() || d.isDir() +// - d.inode.savedDeletedData != nil iff d.inode.isRegularFile() func (d *dentry) restoreDeleted(ctx context.Context, opts *vfs.CompleteRestoreOptions, dirsToDelete map[*dentry]struct{}) error { parent := d.parent.Load() - if _, ok := d.fs.savedDeletedOpenDentries[parent]; ok { + if _, ok := d.inode.fs.savedDeletedOpenDentries[parent]; ok { // Recursively restore the parent first if the parent is also deleted. if err := parent.restoreDeleted(ctx, opts, dirsToDelete); err != nil { return err } } switch { - case d.isRegularFile(): + case d.inode.isRegularFile(): return d.restoreDeletedRegularFile(ctx, opts) case d.isDir(): return d.restoreDeletedDirectory(ctx, opts, dirsToDelete) default: - return fmt.Errorf("gofer.dentry(%q).restoreDeleted: invalid file type %s", genericDebugPathname(d.fs, d), linux.FileMode(d.mode.Load())) + return fmt.Errorf("gofer.dentry(%q).restoreDeleted: invalid file type %s", genericDebugPathname(d.inode.fs, d), linux.FileMode(d.inode.mode.Load())) } } func (d *dentry) restoreDeletedDirectory(ctx context.Context, opts *vfs.CompleteRestoreOptions, dirsToDelete map[*dentry]struct{}) error { // Recreate the directory on the host filesystem. This will be deleted later. parent := d.parent.Load() - _, err := parent.mkdir(ctx, d.name, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load()), false /* createDentry */) + _, err := parent.mkdir(ctx, d.name, linux.FileMode(d.inode.mode.Load()), auth.KUID(d.inode.uid.Load()), auth.KGID(d.inode.gid.Load()), false /* createDentry */) if err != nil { - return fmt.Errorf("failed to re-create deleted directory %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to re-create deleted directory %q: %w", genericDebugPathname(d.inode.fs, d), err) } // Restore the directory. 
if err := d.restoreFile(ctx, opts); err != nil { - if err := parent.unlink(ctx, d.name, linux.AT_REMOVEDIR); err != nil { - log.Warningf("failed to clean up recreated deleted directory %q: %v", genericDebugPathname(d.fs, d), err) + if err := parent.inode.unlink(ctx, d.name, linux.AT_REMOVEDIR); err != nil { + log.Warningf("failed to clean up recreated deleted directory %q: %v", genericDebugPathname(d.inode.fs, d), err) } return fmt.Errorf("failed to restore deleted directory: %w", err) } // We will delete the directory later. We need to keep it around in case any // of its children need to be restored after this. dirsToDelete[d] = struct{}{} - delete(d.fs.savedDeletedOpenDentries, d) + delete(d.inode.fs.savedDeletedOpenDentries, d) return nil } @@ -427,32 +431,32 @@ func (d *dentry) restoreDeletedRegularFile(ctx context.Context, opts *vfs.Comple // Recreate the file on the host filesystem (this is temporary). parent := d.parent.Load() name := d.name - _, h, err := parent.openCreate(ctx, name, linux.O_WRONLY, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load()), false /* createDentry */) + _, h, err := parent.openCreate(ctx, name, linux.O_WRONLY, linux.FileMode(d.inode.mode.Load()), auth.KUID(d.inode.uid.Load()), auth.KGID(d.inode.gid.Load()), false /* createDentry */) if linuxerr.Equals(linuxerr.EEXIST, err) { name = fmt.Sprintf("%s.tmp-gvisor-restore", name) - log.Warningf("Deleted file %q was replaced with a new file at the same path, using new name %q", genericDebugPathname(d.fs, d), name) - _, h, err = parent.openCreate(ctx, name, linux.O_WRONLY, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load()), false /* createDentry */) + log.Warningf("Deleted file %q was replaced with a new file at the same path, using new name %q", genericDebugPathname(d.inode.fs, d), name) + _, h, err = parent.openCreate(ctx, name, linux.O_WRONLY, linux.FileMode(d.inode.mode.Load()), auth.KUID(d.inode.uid.Load()), 
auth.KGID(d.inode.gid.Load()), false /* createDentry */) } if err != nil { - return fmt.Errorf("failed to re-create deleted file %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to re-create deleted file %q: %w", genericDebugPathname(d.inode.fs, d), err) } defer h.close(ctx) // In case of errors, clean up the recreated file. unlinkCU := cleanup.Make(func() { - if err := parent.unlink(ctx, name, 0 /* flags */); err != nil { - log.Warningf("failed to clean up recreated deleted file %q: %v", genericDebugPathname(d.fs, d), err) + if err := parent.inode.unlink(ctx, name, 0 /* flags */); err != nil { + log.Warningf("failed to clean up recreated deleted file %q: %v", genericDebugPathname(d.inode.fs, d), err) } }) defer unlinkCU.Clean() // Write the file data to the recreated file. - n, err := h.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(d.savedDeletedData)), 0) + n, err := h.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(d.inode.savedDeletedData)), 0) if err != nil { - return fmt.Errorf("failed to write deleted file %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to write deleted file %q: %w", genericDebugPathname(d.inode.fs, d), err) } - if n != uint64(len(d.savedDeletedData)) { - return fmt.Errorf("failed to write all of deleted file %q: wrote %d bytes, expected %d", genericDebugPathname(d.fs, d), n, len(d.savedDeletedData)) + if n != uint64(len(d.inode.savedDeletedData)) { + return fmt.Errorf("failed to write all of deleted file %q: wrote %d bytes, expected %d", genericDebugPathname(d.inode.fs, d), n, len(d.inode.savedDeletedData)) } - d.savedDeletedData = nil + d.inode.savedDeletedData = nil // Restore the file. Note that timestamps may not match since we re-created // the file on the host. recreateOpts := *opts @@ -462,10 +466,10 @@ func (d *dentry) restoreDeletedRegularFile(ctx context.Context, opts *vfs.Comple } // Finally, unlink the recreated file. 
unlinkCU.Release() - if err := parent.unlink(ctx, name, 0 /* flags */); err != nil { - return fmt.Errorf("failed to clean up recreated deleted file %q: %v", genericDebugPathname(d.fs, d), err) + if err := parent.inode.unlink(ctx, name, 0 /* flags */); err != nil { + return fmt.Errorf("failed to clean up recreated deleted file %q: %v", genericDebugPathname(d.inode.fs, d), err) } - delete(d.fs.savedDeletedOpenDentries, d) + delete(d.inode.fs.savedDeletedOpenDentries, d) return nil } @@ -473,15 +477,15 @@ func (fd *specialFileFD) completeRestore(ctx context.Context) error { d := fd.dentry() h, err := d.openHandle(ctx, fd.vfsfd.IsReadable(), fd.vfsfd.IsWritable(), false /* trunc */) if err != nil { - return fmt.Errorf("failed to open handle for specialFileFD for %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to open handle for specialFileFD for %q: %w", genericDebugPathname(d.inode.fs, d), err) } fd.handle = h - ftype := d.fileType() + ftype := d.inode.fileType() fd.haveQueue = (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && fd.handle.fd >= 0 if fd.haveQueue { if err := fdnotifier.AddFD(fd.handle.fd, &fd.queue); err != nil { - return fmt.Errorf("failed to add FD to fdnotified for %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to add FD to fdnotified for %q: %w", genericDebugPathname(d.inode.fs, d), err) } } diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go index 64e9a995a6..ab40245be5 100644 --- a/pkg/sentry/fsimpl/gofer/socket.go +++ b/pkg/sentry/fsimpl/gofer/socket.go @@ -24,8 +24,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -func (d *dentry) isSocket() bool { - return d.fileType() == linux.S_IFSOCK +func (i *inode) isSocket() bool { + return i.fileType() == linux.S_IFSOCK } func isSocketTypeSupported(sockType linux.SockType) bool { @@ -103,9 +103,9 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context, opts transport.Uni } func (e *endpoint) 
newConnectedEndpoint(ctx context.Context, sockType linux.SockType, queue *waiter.Queue, opts transport.UnixSocketOpts) (*transport.SCMConnectedEndpoint, *syserr.Error) { - e.dentry.fs.renameMu.RLock() + e.dentry.inode.fs.renameMu.RLock() hostSockFD, err := e.dentry.connect(ctx, sockType) - e.dentry.fs.renameMu.RUnlock() + e.dentry.inode.fs.renameMu.RUnlock() if err != nil { return nil, syserr.ErrConnectionRefused } diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index d974e05f9b..75eb640ee9 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -92,7 +92,7 @@ type specialFileFD struct { } func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) { - ftype := d.fileType() + ftype := d.inode.fileType() seekable := ftype == linux.S_IFREG || ftype == linux.S_IFCHR || ftype == linux.S_IFBLK haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK || ftype == linux.S_IFCHR) && h.fd >= 0 fd := &specialFileFD{ @@ -101,7 +101,7 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*speci seekable: seekable, haveQueue: haveQueue, } - fd.LockFD.Init(&d.locks) + fd.LockFD.Init(&d.inode.locks) if haveQueue { if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil { return nil, err @@ -117,10 +117,10 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*speci } return nil, err } - d.fs.syncMu.Lock() - d.fs.specialFileFDs.PushBack(fd) - d.fs.syncMu.Unlock() - if fd.vfsfd.IsWritable() && (d.mode.Load()&0111 != 0) { + d.inode.fs.syncMu.Lock() + d.inode.fs.specialFileFDs.PushBack(fd) + d.inode.fs.syncMu.Unlock() + if fd.vfsfd.IsWritable() && (d.inode.mode.Load()&0111 != 0) { metric.SuspiciousOperationsMetric.Increment(&metric.SuspiciousOperationsTypeOpenedWriteExecuteFile) } if h.fd >= 0 { @@ -229,7 +229,7 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs return 0, 
linuxerr.EOPNOTSUPP } - if d := fd.dentry(); d.cachedMetadataAuthoritative() { + if d := fd.dentry(); d.inode.cachedMetadataAuthoritative() { d.touchAtime(fd.vfsfd.Mount()) } @@ -304,20 +304,20 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off // If the regular file fd was opened with O_APPEND, make sure the file // size is updated. There is a possible race here if size is modified // externally after metadata cache is updated. - if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() { - if err := d.updateMetadata(ctx); err != nil { + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.inode.cachedMetadataAuthoritative() { + if err := d.inode.updateMetadata(ctx); err != nil { return 0, offset, err } } // We need to hold the metadataMu *while* writing to a regular file. - d.metadataMu.Lock() - defer d.metadataMu.Unlock() + d.inode.metadataMu.Lock() + defer d.inode.metadataMu.Unlock() // Set offset to file size if the regular file was opened with O_APPEND. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { - // Holding d.metadataMu is sufficient for reading d.size. - offset = int64(d.size.RacyLoad()) + // Holding d.inode.metadataMu is sufficient for reading d.inode.size. + offset = int64(d.inode.size.RacyLoad()) } limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { @@ -326,7 +326,7 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off src = src.TakeFirst64(limit) } - if d.cachedMetadataAuthoritative() { + if d.inode.cachedMetadataAuthoritative() { if fd.isRegularFile { d.touchCMtimeLocked() } else { @@ -335,7 +335,7 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off } // handleReadWriter always writes to the remote file. So O_DIRECT is - // effectively always set. Invalidate pages in d.mappings that have been + // effectively always set. Invalidate pages in d.inode.mappings that have been // written to. 
pgstart := hostarch.PageRoundDown(uint64(offset)) pgend, ok := hostarch.PageRoundUp(uint64(offset + src.NumBytes())) @@ -343,9 +343,9 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off return 0, offset, linuxerr.EINVAL } mr := memmap.MappableRange{pgstart, pgend} - d.mapsMu.Lock() - d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) - d.mapsMu.Unlock() + d.inode.mapsMu.Lock() + d.inode.mappings.Invalidate(mr, memmap.InvalidateOpts{}) + d.inode.mapsMu.Unlock() rw := getHandleReadWriter(ctx, &fd.handle, offset) n, err := src.CopyInTo(ctx, rw) @@ -369,11 +369,11 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off } // Update file size for regular files. if fd.isRegularFile { - // d.metadataMu is already locked at this point. - if uint64(offset) > d.size.RacyLoad() { - d.dataMu.Lock() - defer d.dataMu.Unlock() - d.size.Store(uint64(offset)) + // d.inode.metadataMu is already locked at this point. + if uint64(offset) > d.inode.size.RacyLoad() { + d.inode.dataMu.Lock() + defer d.inode.dataMu.Unlock() + d.inode.size.Store(uint64(offset)) } } return int64(n), offset, err @@ -444,9 +444,9 @@ func (fd *specialFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt // AddMapping implements memmap.Mappable.AddMapping. func (fd *specialFileFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { d := fd.dentry() - d.mapsMu.Lock() - defer d.mapsMu.Unlock() - d.mappings.AddMapping(ms, ar, offset, writable) + d.inode.mapsMu.Lock() + defer d.inode.mapsMu.Unlock() + d.inode.mappings.AddMapping(ms, ar, offset, writable) fd.hostFileMapper.IncRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())}) return nil } @@ -454,9 +454,9 @@ func (fd *specialFileFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, // RemoveMapping implements memmap.Mappable.RemoveMapping. 
func (fd *specialFileFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { d := fd.dentry() - d.mapsMu.Lock() - defer d.mapsMu.Unlock() - d.mappings.RemoveMapping(ms, ar, offset, writable) + d.inode.mapsMu.Lock() + defer d.inode.mapsMu.Unlock() + d.inode.mappings.RemoveMapping(ms, ar, offset, writable) fd.hostFileMapper.DecRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())}) } @@ -484,9 +484,9 @@ func (fd *specialFileFD) Translate(ctx context.Context, required, optional memma // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (fd *specialFileFD) InvalidateUnsavable(ctx context.Context) error { d := fd.dentry() - d.mapsMu.Lock() - defer d.mapsMu.Unlock() - d.mappings.InvalidateAll(memmap.InvalidateOpts{}) + d.inode.mapsMu.Lock() + defer d.inode.mapsMu.Unlock() + d.inode.mappings.InvalidateAll(memmap.InvalidateOpts{}) return nil } @@ -532,7 +532,7 @@ func (fd *specialFileFD) requireHostFD() { func (fd *specialFileFD) updateMetadata(ctx context.Context) error { d := fd.dentry() - d.metadataMu.Lock() - defer d.metadataMu.Unlock() - return d.updateMetadataLocked(ctx, fd.handle) + d.inode.metadataMu.Lock() + defer d.inode.metadataMu.Unlock() + return d.inode.updateMetadataLocked(ctx, fd.handle) } diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go index c446f1629c..c1b25b0b07 100644 --- a/pkg/sentry/fsimpl/gofer/symlink.go +++ b/pkg/sentry/fsimpl/gofer/symlink.go @@ -20,28 +20,28 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" ) -func (d *dentry) isSymlink() bool { - return d.fileType() == linux.S_IFLNK +func (i *inode) isSymlink() bool { + return i.fileType() == linux.S_IFLNK } // Precondition: d.isSymlink(). 
func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { - if d.fs.opts.interop != InteropModeShared { + if d.inode.fs.opts.interop != InteropModeShared { d.touchAtime(mnt) - d.dataMu.Lock() - if d.haveTarget { - target := d.target - d.dataMu.Unlock() + d.inode.dataMu.Lock() + if d.inode.haveTarget { + target := d.inode.target + d.inode.dataMu.Unlock() return target, nil } } - target, err := d.readlinkImpl(ctx) - if d.fs.opts.interop != InteropModeShared { + target, err := d.inode.readlinkImpl(ctx) + if d.inode.fs.opts.interop != InteropModeShared { if err == nil { - d.haveTarget = true - d.target = target + d.inode.haveTarget = true + d.inode.target = target } - d.dataMu.Unlock() // +checklocksforce: guaranteed locked from above. + d.inode.dataMu.Unlock() // +checklocksforce: guaranteed locked from above. } return target, err } diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go index 0c58172b17..fff9baed9f 100644 --- a/pkg/sentry/fsimpl/gofer/time.go +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -36,15 +36,15 @@ func (d *dentry) touchAtime(mnt *vfs.Mount) { if err := mnt.CheckBeginWrite(); err != nil { return } - now := d.fs.clock.Now().Nanoseconds() - d.metadataMu.Lock() - d.atime.Store(now) - d.atimeDirty.Store(1) - d.metadataMu.Unlock() + now := d.inode.fs.clock.Now().Nanoseconds() + d.inode.metadataMu.Lock() + d.inode.atime.Store(now) + d.inode.atimeDirty.Store(1) + d.inode.metadataMu.Unlock() mnt.EndWrite() } -// Preconditions: d.metadataMu is locked. d.cachedMetadataAuthoritative() == true. +// Preconditions: d.inode.metadataMu is locked. d.cachedMetadataAuthoritative() == true. 
func (d *dentry) touchAtimeLocked(mnt *vfs.Mount) { if opts := mnt.Options(); opts.Flags.NoATime || opts.ReadOnly { return @@ -52,9 +52,9 @@ func (d *dentry) touchAtimeLocked(mnt *vfs.Mount) { if err := mnt.CheckBeginWrite(); err != nil { return } - now := d.fs.clock.Now().Nanoseconds() - d.atime.Store(now) - d.atimeDirty.Store(1) + now := d.inode.fs.clock.Now().Nanoseconds() + d.inode.atime.Store(now) + d.inode.atimeDirty.Store(1) mnt.EndWrite() } @@ -62,30 +62,30 @@ func (d *dentry) touchAtimeLocked(mnt *vfs.Mount) { // - d.cachedMetadataAuthoritative() == true. // - The caller has successfully called vfs.Mount.CheckBeginWrite(). func (d *dentry) touchCtime() { - now := d.fs.clock.Now().Nanoseconds() - d.metadataMu.Lock() - d.ctime.Store(now) - d.metadataMu.Unlock() + now := d.inode.fs.clock.Now().Nanoseconds() + d.inode.metadataMu.Lock() + d.inode.ctime.Store(now) + d.inode.metadataMu.Unlock() } // Preconditions: // - d.cachedMetadataAuthoritative() == true. // - The caller has successfully called vfs.Mount.CheckBeginWrite(). func (d *dentry) touchCMtime() { - now := d.fs.clock.Now().Nanoseconds() - d.metadataMu.Lock() - d.mtime.Store(now) - d.ctime.Store(now) - d.mtimeDirty.Store(1) - d.metadataMu.Unlock() + now := d.inode.fs.clock.Now().Nanoseconds() + d.inode.metadataMu.Lock() + d.inode.mtime.Store(now) + d.inode.ctime.Store(now) + d.inode.mtimeDirty.Store(1) + d.inode.metadataMu.Unlock() } // Preconditions: // - d.cachedMetadataAuthoritative() == true. -// - The caller has locked d.metadataMu. +// - The caller has locked d.inode.metadataMu. 
func (d *dentry) touchCMtimeLocked() { - now := d.fs.clock.Now().Nanoseconds() - d.mtime.Store(now) - d.ctime.Store(now) - d.mtimeDirty.Store(1) + now := d.inode.fs.clock.Now().Nanoseconds() + d.inode.mtime.Store(now) + d.inode.ctime.Store(now) + d.inode.mtimeDirty.Store(1) } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 48934c4d96..1eed0953ef 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -303,6 +303,8 @@ func goferMountData(fd int, fa config.FileAccessType, conf *config.Config) []str if !conf.HostFifo.AllowOpen() { opts = append(opts, "disable_fifo_open") } + // Enable inode sharing for runsc goferfs + opts = append(opts, "enable_inode_sharing") return opts } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index a7565d27a2..4d2fca8e67 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -344,9 +344,6 @@ syscall_test( add_fusefs = True, add_overlay = True, test = "//test/syscalls/linux:link_test", - # TODO(gvisor.dev/issue/6739): Remove use_tmpfs=True once gofer filesystem - # supports hard links correctly. 
- use_tmpfs = True, ) syscall_test( diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 48bffefba0..2ef33ec61f 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1231,6 +1231,7 @@ cc_binary( "//test/util:file_descriptor", "//test/util:fs_util", "//test/util:posix_error", + "//test/util:save_util", "//test/util:temp_path", "//test/util:test_main", "//test/util:test_util", diff --git a/test/syscalls/linux/link.cc b/test/syscalls/linux/link.cc index 43e51667f5..754fbf57ac 100644 --- a/test/syscalls/linux/link.cc +++ b/test/syscalls/linux/link.cc @@ -21,13 +21,15 @@ #include +#include "gmock/gmock.h" #include "gtest/gtest.h" #include "absl/flags/flag.h" #include "absl/strings/str_cat.h" -#include "test/util/capability_util.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" +#include "test/util/linux_capability_util.h" #include "test/util/posix_error.h" +#include "test/util/save_util.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -41,6 +43,11 @@ namespace { // IsSameFile returns true if both filenames have the same device and inode. bool IsSameFile(const std::string& f1, const std::string& f2) { + // Inode numbers for gofer-accessed files on which no reference is held may + // change across save/restore because the information that the gofer client + // uses to track file identity (path) is inconsistent between gofer + // processes, which are restarted across save/restore. + DisableSave ds; // Use lstat rather than stat, so that symlinks are not followed. 
struct stat stat1 = {}; EXPECT_THAT(lstat(f1.c_str(), &stat1), SyscallSucceeds()); @@ -51,7 +58,6 @@ bool IsSameFile(const std::string& f1, const std::string& f2) { } // TODO(b/178640646): Add test for linkat with AT_EMPTY_PATH - TEST(LinkTest, CanCreateLinkFile) { auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); const std::string newname = NewTempAbsPath(); @@ -76,6 +82,31 @@ TEST(LinkTest, CanCreateLinkFile) { IsPosixErrorOkAndHolds(initial_link_count)); } +TEST(LinkTest, HardlinkChangeMode) { + auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const std::string newname = NewTempAbsPath(); + struct stat stat1 = {}; + struct stat stat2 = {}; + FileDescriptor file_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(oldfile.path(), O_PATH)); + + ASSERT_THAT(linkat(file_fd.get(), oldfile.path().c_str(), file_fd.get(), + newname.c_str(), 0), + SyscallSucceeds()); + + EXPECT_THAT(chmod(oldfile.path().c_str(), S_IRUSR), SyscallSucceeds()); + EXPECT_THAT(lstat(newname.c_str(), &stat1), SyscallSucceeds()); + EXPECT_THAT(lstat(oldfile.path().c_str(), &stat2), SyscallSucceeds()); + + // files inode are preserved after save/restore only if the file is + // referenced. Do not use IsSameFile() here because it disables save before + // checking the st_ino and st_dev. 
+ EXPECT_EQ(stat1.st_dev, stat2.st_dev); + EXPECT_EQ(stat1.st_ino, stat2.st_ino); + EXPECT_EQ(S_IRUSR, (stat1.st_mode & S_IRWXU)); + EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds()); +} + TEST(LinkTest, PermissionDenied) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_FOWNER))); @@ -234,6 +265,8 @@ TEST(LinkTest, AbsPathsWithNonDirFDs) { EXPECT_THAT(linkat(file_fd.get(), oldfile.path().c_str(), file_fd.get(), newname.c_str(), 0), SyscallSucceeds()); + + EXPECT_TRUE(IsSameFile(oldfile.path(), newname)); } TEST(LinkTest, NewDirFDWithOpath) { @@ -279,10 +312,14 @@ TEST(LinkTest, AbsPathsNonDirFDsWithOpath) { TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); FileDescriptor file_fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_PATH)); + FileDescriptor oldfile_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(oldfile.path(), O_PATH)); + // Using file_fd as the dirfds is OK as long as paths are absolute. EXPECT_THAT(linkat(file_fd.get(), oldfile.path().c_str(), file_fd.get(), newname.c_str(), 0), SyscallSucceeds()); + EXPECT_TRUE(IsSameFile(oldfile.path(), newname)); } TEST(LinkTest, LinkDoesNotFollowSymlinks) { @@ -299,7 +336,7 @@ TEST(LinkTest, LinkDoesNotFollowSymlinks) { // The link should not have resolved the symlink, so newname and oldsymlink // are the same. 
EXPECT_TRUE(IsSameFile(oldsymlink, newname)); - EXPECT_FALSE(IsSameFile(oldfile.path(), newname)); + EXPECT_FALSE(IsSameFile(oldfile.path(), oldsymlink)); EXPECT_THAT(unlink(oldsymlink.c_str()), SyscallSucceeds()); EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds()); diff --git a/test/util/test_util.cc b/test/util/test_util.cc index a234289505..c526c88fe7 100644 --- a/test/util/test_util.cc +++ b/test/util/test_util.cc @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -182,12 +183,14 @@ PosixErrorOr> GetOpenFDs() { return ret_fds; } +PosixErrorOr Permissions(const std::string& path) { + ASSIGN_OR_RETURN_ERRNO(auto stat_result, Stat(path)); + return static_cast(stat_result.st_mode); +} + PosixErrorOr Links(const std::string& path) { - struct stat st; - if (stat(path.c_str(), &st)) { - return PosixError(errno, absl::StrCat("Failed to stat ", path)); - } - return static_cast(st.st_nlink); + ASSIGN_OR_RETURN_ERRNO(auto stat_result, Stat(path)); + return static_cast(stat_result.st_nlink); } void RandomizeBuffer(char* buffer, size_t len) {