Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge arodchen's PR #60 to fix syscall and scheduler deadlock #4

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions misc/list_syscalls.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
# Produces a list of syscalls in the current system
import os, re
# Pick the unistd.h to preprocess: use /usr/include/asm when that directory
# exists, otherwise fall back to the x86_64-linux-gnu include directory.
if os.path.exists("/usr/include/asm"):
    unistdPath = "/usr/include/asm/unistd.h"
else:
    unistdPath = "/usr/include/x86_64-linux-gnu/asm/unistd.h"

# Dump the preprocessor's macro definitions and keep only the __NR_* lines.
syscallCmd = "gcc -E -dD " + unistdPath + " | grep __NR"
syscallDefs = os.popen(syscallCmd).read()

# Raw string so that \d reaches the regex engine as a digit class instead of
# being interpreted (and warned about) as a string escape.
sysList = [(int(numStr), name) for (name, numStr) in re.findall(r"#define __NR_(.*?) (\d+)", syscallDefs)]

# Table indexed by syscall number; numbers with no definition stay "INVALID".
denseList = ["INVALID"]*(max([num for (num, name) in sysList]) + 1)
Expand Down
24 changes: 12 additions & 12 deletions src/ooo_core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,19 +270,19 @@ inline void OOOCore::bbl(Address bblAddr, BblInfo* bblInfo) {
if (addr != ((Address)-1L)) {
reqSatisfiedCycle = l1d->load(addr, dispatchCycle) + L1D_LAT;
cRec.record(curCycle, dispatchCycle, reqSatisfiedCycle);
}

// Enforce st-ld forwarding
uint32_t fwdIdx = (addr>>2) & (FWD_ENTRIES-1);
if (fwdArray[fwdIdx].addr == addr) {
// info("0x%lx FWD %ld %ld", addr, reqSatisfiedCycle, fwdArray[fwdIdx].storeCycle);
/* Take the MAX (see FilterCache's code) Our fwdArray
* imposes more stringent timing constraints than the
* l1d, b/c FilterCache does not change the line's
* availCycle on a store. This allows FilterCache to
* track per-line, not per-word availCycles.
*/
reqSatisfiedCycle = MAX(reqSatisfiedCycle, fwdArray[fwdIdx].storeCycle);
// Enforce st-ld forwarding
uint32_t fwdIdx = (addr>>2) & (FWD_ENTRIES-1);
if (fwdArray[fwdIdx].addr == addr) {
// info("0x%lx FWD %ld %ld", addr, reqSatisfiedCycle, fwdArray[fwdIdx].storeCycle);
/* Take the MAX (see FilterCache's code) Our fwdArray
* imposes more stringent timing constraints than the
* l1d, b/c FilterCache does not change the line's
* availCycle on a store. This allows FilterCache to
* track per-line, not per-word availCycles.
*/
reqSatisfiedCycle = MAX(reqSatisfiedCycle, fwdArray[fwdIdx].storeCycle);
}
}

commitCycle = reqSatisfiedCycle;
Expand Down
6 changes: 3 additions & 3 deletions src/scheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,12 +179,12 @@ void Scheduler::watchdogThreadFunc() {
}

if (lastPhase == curPhase && scheduledThreads == outQueue.size() && !sleepQueue.empty()) {
//info("Watchdog Thread: Sleep dep detected...")
DEBUG_SCHEDULER("Watchdog Thread: Sleep dep detected...")
int64_t wakeupPhase = sleepQueue.front()->wakeupPhase;
int64_t wakeupCycles = (wakeupPhase - curPhase)*zinfo->phaseLength;
int64_t wakeupUsec = (wakeupCycles > 0)? wakeupCycles/zinfo->freqMHz : 0;

//info("Additional usecs of sleep %ld", wakeupUsec);
DEBUG_SCHEDULER("Additional usecs of sleep %ld", wakeupUsec);
if (wakeupUsec > 10*1000*1000) warn("Watchdog sleeping for a long time due to long sleep, %ld secs", wakeupUsec/1000/1000);

futex_unlock(&schedLock);
Expand All @@ -206,7 +206,7 @@ void Scheduler::watchdogThreadFunc() {

if (futex_haswaiters(&schedLock)) {
//happens commonly with multiple sleepers and very contended I/O...
//info("Sched: Threads waiting on advance, startPhase %ld curPhase %ld", lastPhase, curPhase);
DEBUG_SCHEDULER("Sched: Threads waiting on advance, startPhase %ld curPhase %ld", lastPhase, curPhase);
break;
}

Expand Down
59 changes: 40 additions & 19 deletions src/scheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@
#include "stats.h"
#include "zsim.h"

// Scheduler debug tracing hook.
// By default the macro expands to nothing, so tracing is compiled out and the
// arguments of a DEBUG_SCHEDULER(...) call are never evaluated. To enable
// tracing, use the commented-out definition instead, which forwards to info().
// The expansion is intentionally empty (not a statement) because some call
// sites invoke the macro without a trailing semicolon.
// Written with the standard variadic form (... / __VA_ARGS__) rather than the
// GNU named-variadic extension (args...).
//#define DEBUG_SCHEDULER(...) info(__VA_ARGS__)
#define DEBUG_SCHEDULER(...)

/**
* TODO (dsm): This class is due for a heavy pass or rewrite. Some things are more complex than they should:
* - The OUT state is unnecessary. It is done as a weak link between a thread that left and its context to preserve affinity, but
Expand Down Expand Up @@ -147,6 +150,10 @@ class Scheduler : public GlobAlloc, public Callee {
lock_t schedLock;
PAD();

PAD();
lock_t gidMapLock;
PAD();

uint64_t curPhase;
//uint32_t nextVictim;
MTRand rnd;
Expand Down Expand Up @@ -179,6 +186,7 @@ class Scheduler : public GlobAlloc, public Callee {
freeList.push_back(&contexts[i]);
}
schedLock = 0;
gidMapLock = 0;
//nextVictim = 0; //only used when freeList is empty.
curPhase = 0;
scheduledThreads = 0;
Expand Down Expand Up @@ -215,25 +223,29 @@ class Scheduler : public GlobAlloc, public Callee {
void start(uint32_t pid, uint32_t tid, const g_vector<bool>& mask) {
futex_lock(&schedLock);
uint32_t gid = getGid(pid, tid);
//info("[G %d] Start", gid);
DEBUG_SCHEDULER("[G %d] Start", gid);
assert((gidMap.find(gid) == gidMap.end()));
// Get pid and tid straight from the OS
// - SYS_gettid because glibc does not implement gettid()
// - SYS_getpid because after a fork (where zsim calls ThreadStart),
// getpid() returns the parent's pid (getpid() caches, and I'm
// guessing it hasn't flushed its cached pid at this point)
futex_lock(&gidMapLock);
gidMap[gid] = new ThreadInfo(gid, syscall(SYS_getpid), syscall(SYS_gettid), mask);
futex_unlock(&gidMapLock);
threadsCreated.inc();
futex_unlock(&schedLock);
}

void finish(uint32_t pid, uint32_t tid) {
futex_lock(&schedLock);
uint32_t gid = getGid(pid, tid);
//info("[G %d] Finish", gid);
DEBUG_SCHEDULER("[G %d] Finish", gid);
assert((gidMap.find(gid) != gidMap.end()));
ThreadInfo* th = gidMap[gid];
futex_lock(&gidMapLock);
gidMap.erase(gid);
futex_unlock(&gidMapLock);

// Check for suppressed syscall leave(), execute it
if (th->fakeLeave) {
Expand All @@ -251,19 +263,26 @@ class Scheduler : public GlobAlloc, public Callee {
futex_lock(&schedLock);
}

assert_msg(th->state == STARTED /*might be started but in fastFwd*/ ||th->state == OUT || th->state == BLOCKED || th->state == QUEUED, "gid %d finish with state %d", gid, th->state);
assert_msg(th->state == STARTED /*might be started but in fastFwd*/ ||th->state == OUT || th->state == BLOCKED || th->state == SLEEPING || th->state == QUEUED, "gid %d finish with state %d", gid, th->state);
if (th->state == QUEUED) {
assert(th->owner == &runQueue);
runQueue.remove(th);
} else if (th->owner) {
assert(th->owner == &outQueue);
outQueue.remove(th);
ContextInfo* ctx = &contexts[th->cid];
deschedule(th, ctx, BLOCKED);
freeList.push_back(ctx);
//no need to try to schedule anything; this context was already being considered while in outQueue
//assert(runQueue.empty()); need not be the case with masks
//info("[G %d] Removed from outQueue and descheduled", gid);
if (ctx->curThread == th) {
// descheduling finishing thread on the condition that it has been scheduled
assert(th->owner == &outQueue);
outQueue.remove(th);
deschedule(th, ctx, BLOCKED);
freeList.push_back(ctx);
//no need to try to schedule anything; this context was already being considered while in outQueue
//assert(runQueue.empty()); need not be the case with masks
DEBUG_SCHEDULER("[G %d] Removed from outQueue and descheduled", gid);
} else {
assert(th->owner == &sleepQueue);
sleepQueue.remove(th);
DEBUG_SCHEDULER("[G %d] Removed from sleepQueue", gid);
}
}
//At this point noone holds pointer to th, it's out from all queues, and either on OUT or BLOCKED means it's not pending a handoff
delete th;
Expand Down Expand Up @@ -398,7 +417,7 @@ class Scheduler : public GlobAlloc, public Callee {
schedule(dst, ctx);
wakeup(dst, false /*no join needed*/);
handoffEvents.inc();
//info("%d starting handoff cid %d to gid %d", th->gid, ctx->cid, dst->gid);
DEBUG_SCHEDULER("%d starting handoff cid %d to gid %d", th->gid, ctx->cid, dst->gid);

//We're descheduled and have completed the handoff. Now we need to see if we can be scheduled somewhere else.
ctx = schedThread(th);
Expand Down Expand Up @@ -474,15 +493,16 @@ class Scheduler : public GlobAlloc, public Callee {
}

// Returns true when the thread identified by (pid, tid) is in the SLEEPING state.
// The ThreadInfo pointer is fetched from gidMap while gidMapLock is held; the
// state field itself is read after gidMapLock has been released.
// NOTE(review): schedLock and gidMapLock are both taken here, nested in the same
// order as in start() (schedLock outside, gidMapLock inside) — confirm that
// holding schedLock in this query is still intended.
// NOTE(review): the lookup uses gidMap[gid]; if gidMap behaves like a standard
// map, an unregistered gid would yield a null entry and th->state would
// dereference it — presumably callers only pass registered threads; verify.
bool isSleeping(uint32_t pid, uint32_t tid) {
futex_lock(&schedLock);
uint32_t gid = getGid(pid, tid);
futex_lock(&gidMapLock);
ThreadInfo* th = gidMap[gid];
futex_unlock(&gidMapLock);
bool res = th->state == SLEEPING;
futex_unlock(&schedLock);
return res;
}

void notifySleepEnd(uint32_t pid, uint32_t tid) {
// Returns the number of remaining phases to sleep
uint64_t notifySleepEnd(uint32_t pid, uint32_t tid) {
futex_lock(&schedLock);
uint32_t gid = getGid(pid, tid);
ThreadInfo* th = gidMap[gid];
Expand All @@ -496,6 +516,7 @@ class Scheduler : public GlobAlloc, public Callee {
th->state = BLOCKED;
}
futex_unlock(&schedLock);
return th->wakeupPhase - zinfo->numPhases;
}

void printThreadState(uint32_t pid, uint32_t tid) {
Expand Down Expand Up @@ -562,7 +583,7 @@ class Scheduler : public GlobAlloc, public Callee {
ctx->curThread = th;
scheduleEvents.inc();
scheduledThreads++;
//info("Scheduled %d <-> %d", th->gid, ctx->cid);
DEBUG_SCHEDULER("Scheduled %d <-> %d", th->gid, ctx->cid);
zinfo->cores[ctx->cid]->contextSwitch(th->gid);
}

Expand All @@ -581,26 +602,26 @@ class Scheduler : public GlobAlloc, public Callee {
//TODO: we may need more callbacks in the cores, e.g. in schedule(). Revise interface as needed...
zinfo->cores[ctx->cid]->contextSwitch(-1);
zinfo->processStats->notifyDeschedule(ctx->cid, getPid(th->gid));
//info("Descheduled %d <-> %d", th->gid, ctx->cid);
DEBUG_SCHEDULER("Descheduled %d <-> %d", th->gid, ctx->cid);
}

// Blocks the calling thread on th->futexWord until another thread wakes it,
// then performs the core/barrier join if th->needsJoin is set.
// Releases schedLock before waiting, so this assumes the caller holds
// schedLock on entry — TODO confirm against callers.
void waitForContext(ThreadInfo* th) {
// Arm the futex word before dropping the lock; FUTEX_WAIT below only blocks
// while the word still equals 1.
th->futexWord = 1;
waitEvents.inc();
//info("%d waiting to be scheduled", th->gid);
DEBUG_SCHEDULER("%d waiting to be scheduled", th->gid);
//printState();
futex_unlock(&schedLock);
// Retry the wait until it either returns 0 or the futex word is no longer 1;
// any other return with the word still at 1 simply loops and waits again.
while (true) {
int futex_res = syscall(SYS_futex, &th->futexWord, FUTEX_WAIT, 1 /*a racing thread waking us up will change value to 0, and we won't block*/, nullptr, nullptr, 0);
if (futex_res == 0 || th->futexWord != 1) break;
}
//info("%d out of sched wait, got cid = %d, needsJoin = %d", th->gid, th->cid, th->needsJoin);
DEBUG_SCHEDULER("%d out of sched wait, got cid = %d, needsJoin = %d", th->gid, th->cid, th->needsJoin);
// needsJoin is first read without the lock and then re-checked under schedLock.
if (th->needsJoin) {
futex_lock(&schedLock);
assert(th->needsJoin); //re-check after the lock
zinfo->cores[th->cid]->join();
// schedLock is handed to bar.join(); there is no futex_unlock on this path in
// this function — presumably bar.join releases it; verify.
bar.join(th->cid, &schedLock);
//info("%d join done", th->gid);
DEBUG_SCHEDULER("%d join done", th->gid);
}
}

Expand Down
55 changes: 48 additions & 7 deletions src/virt/timeout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,25 +84,30 @@ static bool PrePatchTimeoutSyscall(uint32_t tid, CONTEXT* ctxt, SYSCALL_STANDARD
int* uaddr = (int*) PIN_GetSyscallArgument(ctxt, std, 0);
int op = (int) PIN_GetSyscallArgument(ctxt, std, 1);
const struct timespec* timeout = (const struct timespec*) PIN_GetSyscallArgument(ctxt, std, 3);
uint64_t hostTimeoutNs = 0;

//info("FUTEX op %d waitOp %d uaddr %p ts %p", op, isFutexWaitOp(op), uaddr, timeout);
if (!(uaddr && isFutexWaitOp(op) && timeout)) return false; // not a timeout FUTEX_WAIT

waitNsec = timeout->tv_sec*1000000000L + timeout->tv_nsec;
waitNsec = timespecToNs(*timeout);

if (op & FUTEX_CLOCK_REALTIME) {
// NOTE: FUTEX_CLOCK_REALTIME is not a documented interface AFAIK, but looking at the Linux source code + with some verification, this is the xlat
struct timespec realtime;
uint32_t domain = zinfo->procArray[procIdx]->getClockDomain();
uint64_t simNs = cyclesToNs(zinfo->globPhaseCycles);
uint64_t offsetNs = simNs + zinfo->clockDomainInfo[domain].realtimeOffsetNs;
//info(" REALTIME FUTEX: %ld %ld %ld %ld", waitNsec, simNs, offsetNs, waitNsec-offsetNs);
warn(" REALTIME FUTEX(%d): %ld %ld %ld %ld", op & FUTEX_CLOCK_REALTIME, waitNsec, simNs, offsetNs, waitNsec-offsetNs);
waitNsec = (waitNsec > (int64_t)offsetNs)? (waitNsec - offsetNs) : 0;

clock_gettime(CLOCK_REALTIME, &realtime);
hostTimeoutNs = timespecToNs(realtime);
}

if (waitNsec <= 0) return false; // while technically waiting, this does not block. I'm guessing this is done for trylocks? It's weird.

fakeTimeouts[tid].tv_sec = 0;
fakeTimeouts[tid].tv_nsec = 20*1000*1000; // timeout every 20ms of actual host time
hostTimeoutNs += 20*1000*1000; // timeout every 20ms of actual host time
fakeTimeouts[tid] = nsToTimespec(hostTimeoutNs);
PIN_SetSyscallArgument(ctxt, std, 3, (ADDRINT)&fakeTimeouts[tid]);
} else {
assert(syscall == SYS_epoll_wait || syscall == SYS_epoll_pwait || syscall == SYS_poll);
Expand Down Expand Up @@ -149,18 +154,54 @@ static bool PostPatchTimeoutSyscall(uint32_t tid, CONTEXT* ctxt, SYSCALL_STANDAR
retrySyscall = isSleeping;
}

// Decide whether to retry when transitioning to FF
if (retrySyscall && zinfo->procArray[procIdx]->isInFastForward()) {
warn("[%d] Fast-forwarding started, not retrying timeout syscall (%s)", tid, GetSyscallName(syscall));
retrySyscall = false;
assert(isSleeping);
zinfo->sched->notifySleepEnd(procIdx, tid);
uint64_t waitPhasesToSleep = zinfo->sched->notifySleepEnd(procIdx, tid);
if (waitPhasesToSleep > 0) {
ADDRINT timeoutRemArgVal;
uint64_t waitCycles = waitPhasesToSleep * zinfo->phaseLength;
uint64_t waitNsec = waitCycles * 1000 / zinfo->freqMHz;

if (syscall == SYS_futex) {
int op = (int) PIN_GetSyscallArgument(ctxt, std, 1);
if (op & FUTEX_CLOCK_REALTIME) {
struct timespec realtime;
clock_gettime(CLOCK_REALTIME, &realtime);
uint64_t offsetNs = timespecToNs(realtime);
waitNsec += offsetNs;
warn(" REALTIME FUTEX(%d) fast-forwarding retrial: %ld %ld %ld", op & FUTEX_CLOCK_REALTIME, waitNsec, offsetNs, waitNsec-offsetNs);
}
fakeTimeouts[tid] = nsToTimespec(waitNsec);
timeoutRemArgVal = (ADDRINT) & fakeTimeouts[tid];
} else {
assert(syscall == SYS_epoll_wait || syscall == SYS_epoll_pwait || syscall == SYS_poll);
timeoutRemArgVal = (ADDRINT) waitNsec / (1000 * 1000);
}
warn("[%d] Fast-forwarding started, retrying timeout syscall (%s)", tid, GetSyscallName(syscall));
PIN_SetSyscallArgument(ctxt, std, getTimeoutArg(syscall), timeoutRemArgVal);
} else {
warn("[%d] Fast-forwarding started, not retrying timeout syscall (%s)", tid, GetSyscallName(syscall));
retrySyscall = false;
}
}

if (retrySyscall) {
// ADDRINT curIp = PIN_GetContextReg(ctxt, REG_INST_PTR);
//info("[%d] post-patch, retrying, IP: 0x%lx -> 0x%lx", tid, curIp, prevIp);
PIN_SetContextReg(ctxt, REG_INST_PTR, prevIp);
PIN_SetSyscallNumber(ctxt, std, syscall);
if (syscall == SYS_futex) {
int op = (int) PIN_GetSyscallArgument(ctxt, std, 1);
if (op & FUTEX_CLOCK_REALTIME) {
struct timespec realtime;
uint64_t hostTimeoutNs;

clock_gettime(CLOCK_REALTIME, &realtime);
hostTimeoutNs = timespecToNs(realtime) + 20*1000*1000; // timeout every 20ms of actual host time
fakeTimeouts[tid] = nsToTimespec(hostTimeoutNs);
}
}
} else {
// Restore timeout arg
PIN_SetSyscallArgument(ctxt, std, getTimeoutArg(syscall), timeoutArgVal);
Expand Down
Loading