-
Notifications
You must be signed in to change notification settings - Fork 42
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Handle file descriptor access control #4
Comments
Can't this be done by the following?
#define _GNU_SOURCE
#include <assert.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <limits.h>
#include <mntent.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/capability.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <linux/sched.h>
/* Fallback syscall number for execveat, used only when the system
 * headers do not define it.
 * NOTE(review): 322 looks like the x86-64 number; other architectures
 * use different values -- confirm before building elsewhere.
 */
#ifndef __NR_execveat
#define __NR_execveat 322
#endif
/* Path of the binary that main() opens and finally executes. */
#define SHELL "/bin/busybox"
/* Directory (relative to the current working directory) that main()
 * creates and mounts a tmpfs on to build the sandbox root.
 */
#define RUNTIME_NAME "sandbox"
/* Host name main() sets with sethostname(). */
#define HOSTNAME "sandbox"
/* Number of elements in the array A.  Only meaningful for real arrays,
 * not for pointers.  The argument is fully parenthesised so that any
 * expression designating an array can be passed safely.
 */
#define ARRAY_SIZE(A) (sizeof(A) / sizeof((A)[0U]))
/* Argument vector passed to execveat() in main(): argv[0] is the SHELL
 * path and argv[1] is "sh".
 * NOTE(review): presumably "sh" selects the busybox shell applet --
 * confirm against the busybox invocation rules.
 */
static char * const shell_arguments[] = { (char *)SHELL, "sh", NULL };
/* Environment passed to execveat() in main(): a single variable,
 * container=init.
 */
static char * const shell_environment[] = { "container=init", NULL };
/* Built-in mount table in mntent/fstab layout (file system, mount
 * point, type, options).  main() reads it back with getmntent() and
 * mounts each entry relative to the sandbox directory.  Besides the
 * usual mount options, main() recognises the extra options "mkdir" and
 * "touch", which create the mount point as a directory or as a file
 * before mounting.
 *
 * NOTE: the only code that reads this table sits inside an `if (0)`
 * branch of main(), so the table currently has no effect at run time.
 */
static char fstab_config[] =
"# <file system> <mount point> <type> <options>\n"
"tmpfs tmp tmpfs mkdir,nodev,noexec,nosuid\n"
"\n"
"# Allow connecting to X11\n"
"/tmp/.X11-unix tmp/.X11-unix none mkdir,ro,bind,noexec,nosuid\n"
"\n"
"tmpfs dev tmpfs mkdir,nosuid,noexec\n"
"\n"
"# 3D acceleration\n"
"/dev/dri dev/dri none mkdir,ro,bind,noexec,nosuid\n"
"\n"
"/dev/null dev/null none touch,bind\n"
"/dev/full dev/full none touch,bind\n"
"/dev/zero dev/zero none touch,bind\n"
"/dev/urandom dev/urandom none touch,bind\n"
"\n"
"/dev/tty dev/tty none touch,bind\n"
"\n"
"devpts dev/pts devpts mkdir,ptmxmode=0666,newinstance\n"
"\n"
"tmpfs run tmpfs mkdir,nosuid,noexec\n"
"tmpfs run/lock tmpfs mkdir,nosuid,nodev,noexec\n"
"tmpfs run/shm tmpfs mkdir,nosuid,nodev\n"
"tmpfs var tmpfs mkdir,nosuid,noexec\n"
"\n"
"proc proc proc mkdir,ro,nodev,noexec,nosuid\n"
"sysfs sys sysfs mkdir,ro,nodev,noexec,nosuid\n"
"\n"
"/lib lib none mkdir,ro,nodev,nosuid,bind\n"
"/lib32 lib32 none mkdir,ro,nodev,nosuid,bind\n"
"/lib64 lib64 none mkdir,ro,nodev,nosuid,bind\n"
"\n"
"/bin bin none mkdir,ro,nodev,nosuid,bind\n"
"/sbin sbin none mkdir,ro,nodev,nosuid,bind\n"
"/usr usr none mkdir,ro,nodev,nosuid,bind\n"
"\n"
"/etc etc none mkdir,ro,nodev,nosuid,bind\n";
/* Forward declaration; the definition follows main().  Closes every
 * open descriptor listed in /proc/self/fd except stdin, stdout and
 * stderr.
 */
static int close_leaked_fds(void);
/* Program entry point: start SHELL inside a sandbox.
 *
 * In order, the code below:
 *  - closes inherited descriptors and opens the SHELL binary;
 *  - unshares the user and PID namespaces and forks, the parent only
 *    waiting for the child and returning its si_status;
 *  - writes the setgroups / uid_map / gid_map files and sets the
 *    real, effective and saved IDs;
 *  - unshares the mount and network namespaces, mounts a tmpfs on
 *    RUNTIME_NAME, pivots into it and chroots into an emptied
 *    sub-directory;
 *  - unshares the IPC and UTS namespaces, raises the nice value by
 *    one and sets the host name;
 *  - forks again, clears the permitted and effective capability sets
 *    in the child and executes the shell through the descriptor opened
 *    at the start.
 *
 * Every failure prints a perror() message and returns EXIT_FAILURE.
 * Two sections are wrapped in `if (0)` and therefore never execute.
 */
int main(void)
{
int errnum;
/* Get rid of descriptors inherited from whoever started us. */
if (-1 == close_leaked_fds()) {
perror("close_leaked_fds");
return EXIT_FAILURE;
}
/* Open the shell binary now, while the original root file system is
 * still reachable; it is executed via this descriptor at the very end.
 */
int sh_fd = open(SHELL, O_CLOEXEC | O_NONBLOCK | O_NOCTTY);
if (-1 == sh_fd) {
perror("open");
return EXIT_FAILURE;
}
uid_t uid = getuid();
gid_t gid = getgid();
/* Identity mapping: the IDs used inside the namespace are the same
 * numbers as the caller's IDs outside.
 */
uid_t mapped_uid = uid;
gid_t mapped_gid = gid;
/* Needed to do the rest of the unsharing */
if (-1 == unshare(CLONE_NEWUSER)) {
perror("unshare");
return EXIT_FAILURE;
}
/* Prevent signals, ptracing of other processes */
if (-1 == unshare(CLONE_NEWPID)) {
perror("unshare");
return EXIT_FAILURE;
}
/* Fork to allow for multithreading and to make the shell less
 * buggy.
 */
{
pid_t child = fork();
if (-1 == child) {
perror("fork");
return EXIT_FAILURE;
}
if (child != 0) {
/* Parent: wait for the child, retrying when interrupted, and hand
 * its si_status back as our own exit status.  Any other waitid()
 * error is treated as impossible and trips an assertion.
 */
siginfo_t info;
do {
errnum = -1 == waitid(P_PID, child, &info, WEXITED) ? errno : 0;
} while (EINTR == errnum);
if (errnum != 0) {
assert(errnum != EINVAL);
assert(errnum != ECHILD);
assert(false);
}
return info.si_status;
}
}
/* Only the child of the fork above continues from here. */
{
/* Write "deny" to /proc/self/setgroups before gid_map is written
 * below.
 * NOTE(review): presumably required for the gid_map write to be
 * accepted -- see user_namespaces(7).
 */
int set_groups = open("/proc/self/setgroups", O_CLOEXEC | O_WRONLY);
if (-1 == set_groups) {
perror("open");
return EXIT_FAILURE;
}
if (-1 == dprintf(set_groups, "deny\n")) {
perror("dprintf");
return EXIT_FAILURE;
}
if (-1 == close(set_groups)) {
perror("close");
return EXIT_FAILURE;
}
}
{
/* Map exactly one user ID: mapped_uid inside, uid outside. */
int file = open("/proc/self/uid_map", O_CLOEXEC | O_WRONLY);
if (-1 == file) {
perror("open");
return EXIT_FAILURE;
}
if (-1 == dprintf(file, "%i %i 1\n", mapped_uid, uid)) {
perror("dprintf");
return EXIT_FAILURE;
}
if (-1 == close(file)) {
perror("close");
return EXIT_FAILURE;
}
}
{
/* Map exactly one group ID: mapped_gid inside, gid outside. */
int file = open("/proc/self/gid_map", O_CLOEXEC | O_WRONLY);
if (-1 == file) {
perror("open");
return EXIT_FAILURE;
}
if (-1 == dprintf(file, "%i %i 1\n", mapped_gid, gid)) {
perror("dprintf");
return EXIT_FAILURE;
}
if (-1 == close(file)) {
perror("close");
return EXIT_FAILURE;
}
}
/* Set real, effective and saved IDs to the mapped values; the group
 * is changed before the user.
 */
if (-1 == setresgid(mapped_gid, mapped_gid, mapped_gid)) {
perror("setresgid");
return EXIT_FAILURE;
}
if (-1 == setresuid(mapped_uid, mapped_uid, mapped_uid)) {
perror("setresuid");
return EXIT_FAILURE;
}
/* With chroot prevent messing with user files */
if (-1 == unshare(CLONE_NEWNS)) {
perror("unshare");
return EXIT_FAILURE;
}
/* We have unshare the network namespace so we can mount /proc
 * because of /proc/net
 */
if (-1 == unshare(CLONE_NEWNET)) {
perror("unshare");
return EXIT_FAILURE;
}
/* DISABLED: the condition is the constant 0, so this whole branch
 * (the fstab_config driven mount setup) never runs; the `else` branch
 * further down is what actually executes.
 */
if (0) {
/* Copy fstab_config into an anonymous temporary file so that it can
 * be read back with the mntent API, which wants a path.
 */
FILE * tmp = tmpfile();
if (NULL == tmp) {
perror("tmpfile");
return EXIT_FAILURE;
}
{
/* sizeof - 1 leaves out the terminating NUL of the string. */
size_t bytes_to_write = sizeof fstab_config - 1U;
if (fwrite(fstab_config, 1U, bytes_to_write, tmp) != sizeof fstab_config - 1U) {
perror("fwrite");
return EXIT_FAILURE;
}
}
/* Reopen the temporary file by its /proc/self/fd/<n> path; the X
 * characters only reserve room for the descriptor number.
 */
char tmppath[] = "/proc/self/fd/XXXXXXXXXXX";
sprintf(tmppath, "/proc/self/fd/%i", fileno(tmp));
FILE * fstab = setmntent(tmppath, "r");
if (NULL == fstab) {
perror("setmtent");
return EXIT_FAILURE;
}
if (EOF == fclose(tmp)) {
perror("fclose");
return EXIT_FAILURE;
}
/* Create the sandbox directory (an existing one is fine), mount a
 * tmpfs over it and enter it; the relative mount points of the table
 * are resolved from here.
 */
if (-1 == mkdir(RUNTIME_NAME, S_IRWXU)) {
errnum = errno;
if (errnum != EEXIST) {
perror("mkdir");
return EXIT_FAILURE;
}
}
if (-1 == mount("tmpfs", RUNTIME_NAME, "tmpfs", 0, NULL)) {
perror("mount");
return EXIT_FAILURE;
}
if (-1 == chdir(RUNTIME_NAME)) {
perror("chdir");
return EXIT_FAILURE;
}
/* Process the table one entry at a time. */
for (;;) {
/* errno is cleared first so that a NULL result can be told apart:
 * non-zero errno is reported as an error, zero ends the loop.
 */
errno = 0;
struct mntent * entry = getmntent(fstab);
if (NULL == entry) {
errnum = errno;
if (errnum != 0) {
perror("getmntent");
return EXIT_FAILURE;
}
break;
}
/* Indices of the option names understood below; they index the
 * token[] array handed to getsubopt().
 */
enum {
MKDIR,
TOUCH,
BIND,
RBIND,
REMOUNT,
RO,
RW,
SUID,
NOSUID,
DEV,
NODEV,
EXEC,
NOEXEC,
USER,
NOUSER,
KERNMOUNT,
ACTIVE
};
char * const token[] = {
[MKDIR] = "mkdir",
[TOUCH] = "touch",
[BIND] = "bind",
[RBIND] = "rbind",
[REMOUNT] = "remount",
[RO] = MNTOPT_RO,
[RW] = MNTOPT_RW,
[SUID] = MNTOPT_SUID,
[NOSUID] = MNTOPT_NOSUID,
[DEV] = "dev",
[NODEV] = "nodev",
[EXEC] = "exec",
[NOEXEC] = "noexec",
[USER] = "user",
[NOUSER] = "nouser",
[KERNMOUNT] = "kernmount",
[ACTIVE] = "active",
NULL
};
/* Per-entry option state.  suid, dev, exec and user start out true,
 * so the corresponding MS_NO* flag is only added when the "no"
 * variant of the option appears.
 */
bool mkdir_flag = false;
bool touch_flag = false;
bool bind = false;
bool rec = false;
bool remount = false;
bool readonly = false;
bool readwrite = false;
bool suid = true;
bool dev = true;
bool exec = true;
bool user = true;
bool kernmount = false;
bool active = false;
/* Part of the option string that is passed through to mount() as
 * file-system specific data; NULL when there is none.
 */
char *leftovers = NULL;
{
char *mnt_opts = entry->mnt_opts;
/* An option field of exactly "none" skips option parsing. */
if (0 == strcmp("none", mnt_opts)) {
goto mount;
}
/* Parse a private copy so that entry->mnt_opts itself is not
 * modified by getsubopt().
 */
char *subopts_str = strdup(mnt_opts);
if (NULL == subopts_str) {
perror("strdup");
return EXIT_FAILURE;
}
char * subopts = subopts_str;
char *value = NULL;
while (*subopts != '\0') {
switch (getsubopt(&subopts, token, &value)) {
case MKDIR:
mkdir_flag = true;
break;
case TOUCH:
touch_flag = true;
break;
case BIND:
bind = true;
break;
case RBIND:
bind = true;
rec = true;
break;
case REMOUNT:
remount = true;
break;
case RO:
readonly = true;
break;
case RW:
readwrite = true;
break;
case SUID:
suid = true;
break;
case NOSUID:
suid = false;
break;
case DEV:
dev = true;
break;
case NODEV:
dev = false;
break;
case EXEC:
exec = true;
break;
case NOEXEC:
exec = false;
break;
case USER:
user = true;
break;
case NOUSER:
user = false;
break;
case KERNMOUNT:
kernmount = true;
break;
case ACTIVE:
active = true;
break;
/* Unrecognised option: stop parsing and remember the original
 * option string from the first occurrence of that text onwards;
 * it is later given to mount() as its data argument.
 */
default:;
leftovers = strstr(mnt_opts, value);
goto free_subopts_str;
}
}
free_subopts_str:
free(subopts_str);
}
mount:
/* Reject option combinations this code does not support. */
if (bind && rec && readonly) {
fprintf(stderr,
"It's not possible to recursively bind readonly mounts\n");
return EXIT_FAILURE;
}
if (readwrite && readonly) {
fprintf(stderr, "Only one of '%s' and '%s' can be specified\n",
token[RO], token[RW]);
return EXIT_FAILURE;
}
if (mkdir_flag && touch_flag) {
fprintf(stderr, "Only one of '%s' and '%s' can be specified\n",
token[MKDIR], token[TOUCH]);
return EXIT_FAILURE;
}
/* Translate the parsed options into mount(2) flag bits. */
unsigned long mountflags = 0;
if (bind) {
mountflags |= MS_BIND;
}
if (rec) {
mountflags |= MS_REC;
}
if (remount) {
mountflags |= MS_REMOUNT;
}
if (readonly) {
mountflags |= MS_RDONLY;
}
if (!suid) {
mountflags |= MS_NOSUID;
}
if (!dev) {
mountflags |= MS_NODEV;
}
if (!exec) {
mountflags |= MS_NOEXEC;
}
if (!user) {
mountflags |= MS_NOUSER;
}
if (kernmount) {
mountflags |= MS_KERNMOUNT;
}
if (active) {
mountflags |= MS_ACTIVE;
}
/* Create the mount point first when asked to: a directory for
 * "mkdir", an empty regular file for "touch".
 */
if (mkdir_flag) {
if (-1 == mkdir(entry->mnt_dir, S_IRWXU)) {
perror("mkdir");
return EXIT_FAILURE;
}
} else if (touch_flag) {
int fd = open(entry->mnt_dir, O_EXCL | O_CREAT | O_CLOEXEC, S_IRWXU);
if (-1 == fd) {
perror("open");
return EXIT_FAILURE;
}
close(fd);
}
/* A source field of "none" is passed to mount() as NULL. */
if (-1 == mount(0 == strcmp("none", entry->mnt_fsname) ? NULL : entry->mnt_fsname,
entry->mnt_dir,
entry->mnt_type, mountflags,
leftovers)) {
perror("mount");
return EXIT_FAILURE;
}
/* A read-only bind mount is issued a second time with MS_REMOUNT
 * added to the same flags.
 * NOTE(review): presumably because the read-only flag is not applied
 * by the initial bind -- confirm against mount(2).
 */
if (bind && readonly) {
mountflags |= MS_REMOUNT;
if (-1 == mount(0 == strcmp("none", entry->mnt_fsname) ? NULL : entry->mnt_fsname,
entry->mnt_dir,
entry->mnt_type, mountflags,
leftovers)) {
perror("mount");
return EXIT_FAILURE;
}
}
}
if (endmntent(fstab) != 1) {
perror("endmntent");
return EXIT_FAILURE;
}
/* Keep a handle on the old root, make the current directory the new
 * root, then step back into the old root through the handle and
 * lazily detach it.
 */
int old_root = open("/", O_DIRECTORY);
if (-1 == old_root) {
perror("open");
return EXIT_FAILURE;
}
if (-1 == syscall(__NR_pivot_root, ".", ".")) {
perror("pivot_root");
return EXIT_FAILURE;
}
if (-1 == fchdir(old_root)) {
perror("fchdir");
return EXIT_FAILURE;
}
if (-1 == umount2(".", MNT_DETACH)) {
perror("umount");
return EXIT_FAILURE;
}
if (-1 == close(old_root)) {
perror("close");
return EXIT_FAILURE;
}
if (-1 == chdir("/")) {
perror("chdir");
return EXIT_FAILURE;
}
} else {
/* ACTIVE PATH: build a root that contains nothing at all. */
/* Create the sandbox directory (an existing one is fine), mount a
 * fresh tmpfs over it and enter it.
 */
if (-1 == mkdir(RUNTIME_NAME, S_IRWXU)) {
errnum = errno;
if (errnum != EEXIST) {
perror("mkdir");
return EXIT_FAILURE;
}
}
if (-1 == mount("tmpfs", RUNTIME_NAME, "tmpfs", 0, NULL)) {
perror("mount");
return EXIT_FAILURE;
}
if (-1 == chdir(RUNTIME_NAME)) {
perror("chdir");
return EXIT_FAILURE;
}
/* Sub-directory inside the tmpfs that becomes the final root below. */
if (-1 == mkdir("sandbox", S_IRWXU)) {
errnum = errno;
if (errnum != EEXIST) {
perror("mkdir");
return EXIT_FAILURE;
}
}
/* Keep a handle on the old root, make the current directory (the
 * tmpfs) the new root, then step back into the old root through the
 * handle and lazily detach it.
 */
int old_root = open("/", O_DIRECTORY);
if (-1 == old_root) {
perror("open");
return EXIT_FAILURE;
}
if (-1 == syscall(__NR_pivot_root, ".", ".")) {
perror("pivot_root");
return EXIT_FAILURE;
}
if (-1 == fchdir(old_root)) {
perror("fchdir");
return EXIT_FAILURE;
}
if (-1 == umount2(".", MNT_DETACH)) {
perror("umount");
return EXIT_FAILURE;
}
if (-1 == close(old_root)) {
perror("close");
return EXIT_FAILURE;
}
if (-1 == chdir("/")) {
perror("chdir");
return EXIT_FAILURE;
}
/* Enter the sub-directory through a descriptor, remove its name
 * from the parent while still standing inside it, and chroot into
 * it, so the process ends up rooted in a directory that no longer
 * has a name.
 */
int sandbox_fd = open("sandbox", O_CLOEXEC | O_DIRECTORY);
if (-1 == sandbox_fd) {
perror("open");
return EXIT_FAILURE;
}
if (-1 == fchdir(sandbox_fd)) {
perror("fchdir");
return EXIT_FAILURE;
}
if (-1 == rmdir("../sandbox")) {
perror("rmdir");
return EXIT_FAILURE;
}
if (-1 == chroot(".")) {
perror("chroot");
return EXIT_FAILURE;
}
if (-1 == chdir("/")) {
perror("chdir");
return EXIT_FAILURE;
}
close(sandbox_fd);
}
/* Sandbox the rest of the namespaces */
/* We can't unshare the IPC namespace because we need to share it
 * to use X11's shared memory extensions. Not sure how to disable
 * shared memory extensions.
 */
/* NOTE(review): the call below does pass CLONE_NEWIPC, so the IPC
 * namespace IS unshared here, which contradicts the comment above.
 * One of the two is out of date.
 */
if (-1 == unshare(CLONE_NEWIPC | CLONE_NEWUTS)) {
perror("unshare");
return EXIT_FAILURE;
}
/* Favor other processes over this process hierarchy. Only
 * superuser may lower priorities so this is not stoppable. This
 * also makes the process hierarchy nicer for the OOM killer.
 */
if (-1 == setpriority(PRIO_PROCESS, 0, getpriority(PRIO_PROCESS, 0) + 1)) {
perror("setpriority");
return EXIT_FAILURE;
}
/* The length excludes the terminating NUL of the literal. */
if (-1 == sethostname(HOSTNAME, sizeof HOSTNAME - 1U)) {
perror("sethostname");
return EXIT_FAILURE;
}
/* DISABLED: the condition is the constant 0, so none of the /dev
 * convenience symlinks below are ever created.
 */
if (0) {
if (-1 == symlink("/proc/self/fd", "/dev/fd")) {
perror("symlink");
return EXIT_FAILURE;
}
if (-1 == symlink("/proc/self/fd/0", "/dev/stdin")) {
perror("symlink");
return EXIT_FAILURE;
}
if (-1 == symlink("/proc/self/fd/1", "/dev/stdout")) {
perror("symlink");
return EXIT_FAILURE;
}
if (-1 == symlink("/proc/self/fd/2", "/dev/stderr")) {
perror("symlink");
return EXIT_FAILURE;
}
if (-1 == symlink("/run/shm", "/dev/shm")) {
perror("symlink");
return EXIT_FAILURE;
}
if (-1 == symlink("/dev/pts/ptmx", "/dev/ptmx")) {
perror("symlink");
return EXIT_FAILURE;
}
}
/* Keep init super privileged */
{
/* Second fork: the parent keeps its capabilities and only waits;
 * the child goes on to drop them and become the shell.
 */
pid_t child = fork();
if (-1 == child) {
perror("fork");
return EXIT_FAILURE;
}
if (child != 0) {
/* Same wait-and-propagate logic as after the first fork. */
siginfo_t info;
do {
errnum = -1 == waitid(P_PID, child, &info, WEXITED) ? errno : 0;
} while (EINTR == errnum);
if (errnum != 0) {
assert(errnum != EINVAL);
assert(errnum != ECHILD);
assert(false);
}
return info.si_status;
}
}
/* In the shell drop all privileges I might possibly have. */
/* The permitted and effective capability sets are cleared and the
 * result is applied to this process.
 * NOTE(review): the inheritable set is not cleared here -- confirm
 * that this is intended.
 */
cap_t caps = cap_get_proc();
if (NULL == caps) {
perror("cap_get_proc");
return EXIT_FAILURE;
}
if (-1 == cap_clear_flag(caps, CAP_PERMITTED)) {
perror("cap_clear_flag");
return EXIT_FAILURE;
}
if (-1 == cap_clear_flag(caps, CAP_EFFECTIVE)) {
perror("cap_clear_flag");
return EXIT_FAILURE;
}
if (-1 == cap_set_proc(caps)) {
perror("cap_set_proc");
return EXIT_FAILURE;
}
if (-1 == cap_free(caps)) {
perror("cap_free");
return EXIT_FAILURE;
}
/* Execute the shell through the descriptor opened at the start: an
 * empty path together with AT_EMPTY_PATH makes execveat operate on
 * sh_fd itself.  Control only reaches the lines after the call when
 * the exec failed.
 */
syscall(__NR_execveat, sh_fd, "",
(char *const *)shell_arguments, shell_environment,
AT_EMPTY_PATH);
perror("execveat");
return EXIT_FAILURE;
}
/* Parse a /proc/self/fd entry name as a descriptor number.
 *
 * Returns true and stores the number in *fd_out when name consists
 * only of decimal digits whose value fits in an int.  Returns false for
 * anything else, which includes the "." and ".." entries.  May modify
 * errno.
 */
static bool parse_fd_name(char const *name, int *fd_out)
{
    /* strtol would accept leading white space and a sign; a descriptor
     * name has neither, so insist on a leading digit.
     */
    if (*name < '0' || *name > '9') {
        return false;
    }

    char *end = NULL;
    errno = 0;
    long const value = strtol(name, &end, 10);
    if (errno != 0 || *end != '\0' || value > INT_MAX) {
        return false;
    }

    *fd_out = (int)value;
    return true;
}

/* Close every open file descriptor of this process except stdin,
 * stdout and stderr.
 *
 * The descriptors are discovered by listing /proc/self/fd.  They are
 * collected in a single pass and closed only after the directory
 * stream itself has been closed, so the listing is never modified
 * while it is being read.
 *
 * Returns 0 on success.  On failure returns -1 with errno set to the
 * cause, which is the convention the caller in main() tests for and
 * reports with perror().  No descriptor is closed when an error is
 * reported.
 */
static int close_leaked_fds(void)
{
    int errnum = 0;
    size_t count = 0U;
    size_t capacity = 0U;
    int *fds = NULL;

    DIR *const fds_dir = opendir("/proc/self/fd");
    if (NULL == fds_dir) {
        /* errno has been set by opendir. */
        return -1;
    }

    /* The stream's own descriptor shows up in the listing and must not
     * be closed behind the stream's back.
     */
    int const dir_fd = dirfd(fds_dir);

    for (;;) {
        /* readdir returns NULL both at the end of the directory and on
         * error; only errno tells the two apart.
         */
        errno = 0;
        struct dirent *const entry = readdir(fds_dir);
        if (NULL == entry) {
            errnum = errno;
            break;
        }

        int fd;
        if (!parse_fd_name(entry->d_name, &fd)) {
            continue;
        }
        if (fd == dir_fd) {
            continue;
        }
        if (STDIN_FILENO == fd || STDOUT_FILENO == fd || STDERR_FILENO == fd) {
            continue;
        }

        if (count == capacity) {
            size_t const max_capacity = (size_t)-1 / sizeof fds[0];
            if (capacity > max_capacity / 2U) {
                errnum = ENOMEM;
                break;
            }
            size_t const new_capacity = 0U == capacity ? 16U : capacity * 2U;
            int *const grown = realloc(fds, new_capacity * sizeof fds[0]);
            if (NULL == grown) {
                errnum = ENOMEM;
                break;
            }
            fds = grown;
            capacity = new_capacity;
        }
        fds[count] = fd;
        ++count;
    }

    if (-1 == closedir(fds_dir)) {
        int const close_errnum = errno;
        /* EBADF would mean the stream was corrupted by this function. */
        assert(close_errnum != EBADF);
        if (0 == errnum) {
            errnum = close_errnum;
        }
    }

    if (0 == errnum) {
        for (size_t ii = 0U; ii < count; ++ii) {
            /* Best effort: a failing close is deliberately ignored. */
            (void)close(fds[ii]);
        }
    }

    free(fds);

    if (errnum != 0) {
        errno = errnum;
        return -1;
    }
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
A sandbox would greatly benefit from being able to use only a set of file descriptors/handles instead of accessing explicit paths, with seccomp-bpf (e.g. write(2), fstat(2)…) and maybe later with Capsicum (e.g. openat(2)).
This could also allow efficient data sharing (i.e. memfd_create(2)/seal/mmap).
cc rust-lang/rust#21936
cc rust-lang/rfcs#941
cc #2
The text was updated successfully, but these errors were encountered: