Skip to content

Commit e3e6cb5

Browse files
committed
Make sure that changes we make to madvise(2) in the syscallbuf aren't visible to a seccomp filter.
1 parent c5f1ecc commit e3e6cb5

File tree

3 files changed

+82
-8
lines changed

3 files changed

+82
-8
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1331,6 +1331,7 @@ set(BASIC_TESTS
13311331
seccomp_clone_fail
13321332
seccomp_desched
13331333
seccomp_kill_exit
1334+
seccomp_madvise
13341335
seccomp_null
13351336
seccomp_sigsys_args
13361337
seccomp_sigsys_sigtrap

src/preload/syscallbuf.c

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2379,24 +2379,21 @@ static long sys_madvise(struct syscall_info* call) {
23792379
long ret;
23802380

23812381
switch (advice) {
2382-
// Whitelist advice values that we know are OK to pass through to the
2383-
// kernel directly.
2382+
// Advice values that we buffer and are ok to pass through to the kernel
2383+
// directly.
23842384
case MADV_NORMAL:
23852385
case MADV_RANDOM:
23862386
case MADV_SEQUENTIAL:
23872387
case MADV_WILLNEED:
2388-
case MADV_DONTNEED:
23892388
case MADV_MERGEABLE:
23902389
case MADV_UNMERGEABLE:
23912390
case MADV_HUGEPAGE:
23922391
case MADV_NOHUGEPAGE:
23932392
case MADV_DONTDUMP:
23942393
case MADV_DODUMP:
2395-
break;
2394+
// Advice values that we buffer but require special handling.
2395+
case MADV_DONTNEED:
23962396
case MADV_FREE:
2397-
// See record_syscall. We disallow MADV_FREE because it creates
2398-
// nondeterminism.
2399-
advice = -1;
24002397
break;
24012398
default:
24022399
return traced_raw_syscall(call);
@@ -2410,7 +2407,13 @@ static long sys_madvise(struct syscall_info* call) {
24102407
return traced_raw_syscall(call);
24112408
}
24122409

2413-
if (advice == MADV_DONTNEED) {
2410+
if (advice == MADV_FREE) {
2411+
// See record_syscall. We disallow MADV_FREE because it creates
2412+
// nondeterminism. NB: Since we veto this, we *don't* need to
2413+
// execute it during replay.
2414+
ret = privileged_untraced_syscall3(syscallno, addr, length, -1);
2415+
return commit_raw_syscall(syscallno, ptr, ret);
2416+
} else if (advice == MADV_DONTNEED) {
24142417
ret = privileged_untraced_syscall3(syscallno, addr, length, MADV_COLD);
24152418
commit_raw_syscall(syscallno, ptr, ret);
24162419
if (ret < 0) {

src/test/seccomp_madvise.c

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
2+
3+
#include "util.h"
4+
5+
/* Install a seccomp BPF filter for the calling process and assert that
   installation succeeded. The filter allows every system call other than
   madvise; madvise is allowed only when its advice argument equals
   MADV_DONTNEED or MADV_FREE, and any other advice value returns
   SECCOMP_RET_TRAP. */
static void install_filter(void) {
6+
struct sock_filter filter[] = {
7+
/* Load system call number from 'seccomp_data' buffer into
8+
accumulator */
9+
BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),
10+
/* Jump forward 6 instructions (to the final SECCOMP_RET_ALLOW) if
11+
system call number is not SYS_madvise */
12+
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_madvise, 0, 6),
13+
/* Load advice argument from `seccomp_data` buffer into
14+
accumulator */
15+
BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[2])),
16+
/* Jump forward 1 instruction if advice is not MADV_DONTNEED */
17+
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, MADV_DONTNEED, 0, 1),
18+
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
19+
/* Jump forward 1 instruction if advice is not MADV_FREE */
20+
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, MADV_FREE, 0, 1),
21+
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
22+
/* madvise with any other advice value: trigger SIGSYS */
23+
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
24+
/* Destination of system call number mismatch: allow other
25+
system calls */
26+
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
27+
};
28+
struct sock_fprog prog = {
29+
.len = (unsigned short)(sizeof(filter) / sizeof(filter[0])),
30+
.filter = filter,
31+
};
32+
int ret;
33+
34+
/* Try the seccomp syscall first; fall back to prctl only when the
   syscall reports ENOSYS. */
ret = syscall(RR_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog);
35+
if (ret == -1 && errno == ENOSYS) {
36+
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
37+
}
38+
test_assert(ret == 0);
39+
}
40+
41+
/* Test entry point. Maps one anonymous private page, issues an initial
   madvise(MADV_NORMAL) before any filter exists, then sets no_new_privs,
   installs the seccomp filter, and checks that madvise with MADV_DONTNEED
   and MADV_FREE still returns normally. Prints "EXIT-SUCCESS" and
   returns 0 when every assertion holds. */
int main(void) {
42+
int ret;
43+
size_t page_size = sysconf(_SC_PAGE_SIZE);
44+
void* p = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
45+
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
46+
test_assert(p != MAP_FAILED);
47+
48+
/* Trigger syscall patching for madvise. */
49+
test_assert(0 == madvise(p, page_size, MADV_NORMAL));
50+
51+
/* Set no_new_privs and confirm it reads back as 1 before installing
   the filter. */
test_assert(0 == prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
52+
test_assert(1 == prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
53+
install_filter();
54+
55+
/* Test that MADV_DONTNEED (which we rewrite to MADV_COLD)
56+
* doesn't trigger the seccomp filter.
57+
*/
58+
ret = madvise(p, page_size, MADV_DONTNEED);
59+
test_assert(ret == 0);
60+
61+
/* Test that MADV_FREE (which we rewrite to -1 to disallow)
62+
* doesn't trigger the seccomp filter.
63+
*/
64+
ret = madvise(p, page_size, MADV_FREE);
65+
/* Either outcome is accepted: success, or failure with EINVAL
   (presumably the result of the rewritten advice value -- confirm
   against the syscallbuf implementation). */
test_assert(ret == 0 || (ret == -1 && errno == EINVAL));
66+
67+
atomic_puts("EXIT-SUCCESS");
68+
69+
return 0;
70+
}

0 commit comments

Comments
 (0)