stanford-mast · grantae · Jul 12, 2015 · Jul 12, 2015 · Jul 13, 2015 · Jul 16, 2015
diff --git a/misc/list_syscalls.py b/misc/list_syscalls.py
@@ -2,6 +2,10 @@
 # Produces a list of syscalls in the current system
 import os, re
 syscallCmd = "gcc -E -dD /usr/include/asm/unistd.h | grep __NR"
+asmHeader = "/usr/include/asm/unistd.h"
+if not os.path.exists(asmHeader):  # probe the header itself, else fall back to the x86_64 multiarch include dir
+    asmHeader = "/usr/include/x86_64-linux-gnu/asm/unistd.h"
+syscallCmd = "gcc -E -dD %s | grep __NR" % asmHeader
 syscallDefs = os.popen(syscallCmd).read()
 sysList = [(int(numStr), name) for (name, numStr) in re.findall("#define __NR_(.*?) (\d+)", syscallDefs)]
 denseList = ["INVALID"]*(max([num for (num, name) in sysList]) + 1)

diff --git a/src/ooo_core.cpp b/src/ooo_core.cpp
@@ -270,19 +270,19 @@ inline void OOOCore::bbl(Address bblAddr, BblInfo* bblInfo) {
                     if (addr != ((Address)-1L)) {
                         reqSatisfiedCycle = l1d->load(addr, dispatchCycle) + L1D_LAT;
                         cRec.record(curCycle, dispatchCycle, reqSatisfiedCycle);
-                    }
 
-                    // Enforce st-ld forwarding
-                    uint32_t fwdIdx = (addr>>2) & (FWD_ENTRIES-1);
-                    if (fwdArray[fwdIdx].addr == addr) {
-                        // info("0x%lx FWD %ld %ld", addr, reqSatisfiedCycle, fwdArray[fwdIdx].storeCycle);
-                        /* Take the MAX (see FilterCache's code) Our fwdArray
-                         * imposes more stringent timing constraints than the
-                         * l1d, b/c FilterCache does not change the line's
-                         * availCycle on a store. This allows FilterCache to
-                         * track per-line, not per-word availCycles.
-                         */
-                        reqSatisfiedCycle = MAX(reqSatisfiedCycle, fwdArray[fwdIdx].storeCycle);
+                        // Enforce st-ld forwarding
+                        uint32_t fwdIdx = (addr>>2) & (FWD_ENTRIES-1);
+                        if (fwdArray[fwdIdx].addr == addr) {
+                            // info("0x%lx FWD %ld %ld", addr, reqSatisfiedCycle, fwdArray[fwdIdx].storeCycle);
+                            /* Take the MAX (see FilterCache's code) Our fwdArray
+                             * imposes more stringent timing constraints than the
+                             * l1d, b/c FilterCache does not change the line's
+                             * availCycle on a store. This allows FilterCache to
+                             * track per-line, not per-word availCycles.
+                             */
+                            reqSatisfiedCycle = MAX(reqSatisfiedCycle, fwdArray[fwdIdx].storeCycle);
+                        }
                     }
 
                     commitCycle = reqSatisfiedCycle;

diff --git a/src/scheduler.cpp b/src/scheduler.cpp
@@ -179,12 +179,12 @@ void Scheduler::watchdogThreadFunc() {
         }
 
         if (lastPhase == curPhase && scheduledThreads == outQueue.size() && !sleepQueue.empty()) {
-            //info("Watchdog Thread: Sleep dep detected...")
+            DEBUG_SCHEDULER("Watchdog Thread: Sleep dep detected...");
             int64_t wakeupPhase = sleepQueue.front()->wakeupPhase;
             int64_t wakeupCycles = (wakeupPhase - curPhase)*zinfo->phaseLength;
             int64_t wakeupUsec = (wakeupCycles > 0)? wakeupCycles/zinfo->freqMHz : 0;
 
-            //info("Additional usecs of sleep %ld", wakeupUsec);
+            DEBUG_SCHEDULER("Additional usecs of sleep %ld", wakeupUsec);
             if (wakeupUsec > 10*1000*1000) warn("Watchdog sleeping for a long time due to long sleep, %ld secs", wakeupUsec/1000/1000);
 
             futex_unlock(&schedLock);
@@ -206,7 +206,7 @@ void Scheduler::watchdogThreadFunc() {
 
                     if (futex_haswaiters(&schedLock)) {
                         //happens commonly with multiple sleepers and very contended I/O...
-                        //info("Sched: Threads waiting on advance, startPhase %ld curPhase %ld", lastPhase, curPhase);
+                        DEBUG_SCHEDULER("Sched: Threads waiting on advance, startPhase %ld curPhase %ld", lastPhase, curPhase);
                         break;
                     }
 

diff --git a/src/scheduler.h b/src/scheduler.h
@@ -44,6 +44,9 @@
 #include "stats.h"
 #include "zsim.h"
 
+//#define DEBUG_SCHEDULER(args...) info(args)
+#define DEBUG_SCHEDULER(args...)
+
 /**
  * TODO (dsm): This class is due for a heavy pass or rewrite. Some things are more complex than they should:
  * - The OUT state is unnecessary. It is done as a weak link between a thread that left and its context to preserve affinity, but
@@ -147,6 +150,10 @@ class Scheduler : public GlobAlloc, public Callee {
         lock_t schedLock;
         PAD();
 
+        PAD();
+        lock_t gidMapLock;
+        PAD();
+
         uint64_t curPhase;
         //uint32_t nextVictim;
         MTRand rnd;
@@ -179,6 +186,7 @@ class Scheduler : public GlobAlloc, public Callee {
                 freeList.push_back(&contexts[i]);
             }
             schedLock = 0;
+            gidMapLock = 0;
             //nextVictim = 0; //only used when freeList is empty.
             curPhase = 0;
             scheduledThreads = 0;
@@ -215,25 +223,29 @@ class Scheduler : public GlobAlloc, public Callee {
         void start(uint32_t pid, uint32_t tid, const g_vector<bool>& mask) {
             futex_lock(&schedLock);
             uint32_t gid = getGid(pid, tid);
-            //info("[G %d] Start", gid);
+            DEBUG_SCHEDULER("[G %d] Start", gid);
             assert((gidMap.find(gid) == gidMap.end()));
             // Get pid and tid straight from the OS
             // - SYS_gettid because glibc does not implement gettid()
             // - SYS_getpid because after a fork (where zsim calls ThreadStart),
             //   getpid() returns the parent's pid (getpid() caches, and I'm
             //   guessing it hasn't flushed its cached pid at this point)
+            futex_lock(&gidMapLock);
             gidMap[gid] = new ThreadInfo(gid, syscall(SYS_getpid), syscall(SYS_gettid), mask);
+            futex_unlock(&gidMapLock);
             threadsCreated.inc();
             futex_unlock(&schedLock);
         }
 
         void finish(uint32_t pid, uint32_t tid) {
             futex_lock(&schedLock);
             uint32_t gid = getGid(pid, tid);
-            //info("[G %d] Finish", gid);
+            DEBUG_SCHEDULER("[G %d] Finish", gid);
             assert((gidMap.find(gid) != gidMap.end()));
             ThreadInfo* th = gidMap[gid];
+            futex_lock(&gidMapLock);
             gidMap.erase(gid);
+            futex_unlock(&gidMapLock);
 
             // Check for suppressed syscall leave(), execute it
             if (th->fakeLeave) {
@@ -251,19 +263,26 @@ class Scheduler : public GlobAlloc, public Callee {
                 futex_lock(&schedLock);
             }
 
-            assert_msg(th->state == STARTED /*might be started but in fastFwd*/ ||th->state == OUT || th->state == BLOCKED || th->state == QUEUED, "gid %d finish with state %d", gid, th->state);
+            assert_msg(th->state == STARTED /*might be started but in fastFwd*/ ||th->state == OUT || th->state == BLOCKED || th->state == SLEEPING || th->state == QUEUED, "gid %d finish with state %d", gid, th->state);
             if (th->state == QUEUED) {
                 assert(th->owner == &runQueue);
                 runQueue.remove(th);
             } else if (th->owner) {
-                assert(th->owner == &outQueue);
-                outQueue.remove(th);
                 ContextInfo* ctx = &contexts[th->cid];
-                deschedule(th, ctx, BLOCKED);
-                freeList.push_back(ctx);
-                //no need to try to schedule anything; this context was already being considered while in outQueue
-                //assert(runQueue.empty()); need not be the case with masks
-                //info("[G %d] Removed from outQueue and descheduled", gid);
+                if (ctx->curThread == th) {
+                    // descheduling finishing thread on the condition that it has been scheduled
+                    assert(th->owner == &outQueue);
+                    outQueue.remove(th);
+                    deschedule(th, ctx, BLOCKED);
+                    freeList.push_back(ctx);
+                    //no need to try to schedule anything; this context was already being considered while in outQueue
+                    //assert(runQueue.empty()); need not be the case with masks
+                    DEBUG_SCHEDULER("[G %d] Removed from outQueue and descheduled", gid);
+                } else {
+                    assert(th->owner == &sleepQueue);
+                    sleepQueue.remove(th);
+                    DEBUG_SCHEDULER("[G %d] Removed from sleepQueue", gid);
+                }
             }
             //At this point noone holds pointer to th, it's out from all queues, and either on OUT or BLOCKED means it's not pending a handoff
             delete th;
@@ -398,7 +417,7 @@ class Scheduler : public GlobAlloc, public Callee {
                 schedule(dst, ctx);
                 wakeup(dst, false /*no join needed*/);
                 handoffEvents.inc();
-                //info("%d starting handoff cid %d to gid %d", th->gid, ctx->cid, dst->gid);
+                DEBUG_SCHEDULER("%d starting handoff cid %d to gid %d", th->gid, ctx->cid, dst->gid);
 
                 //We're descheduled and have completed the handoff. Now we need to see if we can be scheduled somewhere else.
                 ctx = schedThread(th);
@@ -474,15 +493,16 @@ class Scheduler : public GlobAlloc, public Callee {
         }
 
         bool isSleeping(uint32_t pid, uint32_t tid) {
-            futex_lock(&schedLock);
             uint32_t gid = getGid(pid, tid);
+            futex_lock(&gidMapLock);
             ThreadInfo* th = gidMap[gid];
+            futex_unlock(&gidMapLock);
             bool res = th->state == SLEEPING;
-            futex_unlock(&schedLock);
             return res;
         }
 
-        void notifySleepEnd(uint32_t pid, uint32_t tid) {
+        // Returns the number of remaining phases to sleep
+        uint64_t notifySleepEnd(uint32_t pid, uint32_t tid) {
             futex_lock(&schedLock);
             uint32_t gid = getGid(pid, tid);
             ThreadInfo* th = gidMap[gid];
@@ -496,6 +516,7 @@ class Scheduler : public GlobAlloc, public Callee {
                 th->state = BLOCKED;
             }
             futex_unlock(&schedLock);
+            return (th->wakeupPhase > zinfo->numPhases)? (th->wakeupPhase - zinfo->numPhases) : 0;  // clamp: the uint64_t result would wrap if the wakeup phase is not in the future
         }
 
         void printThreadState(uint32_t pid, uint32_t tid) {
@@ -562,7 +583,7 @@ class Scheduler : public GlobAlloc, public Callee {
             ctx->curThread = th;
             scheduleEvents.inc();
             scheduledThreads++;
-            //info("Scheduled %d <-> %d", th->gid, ctx->cid);
+            DEBUG_SCHEDULER("Scheduled %d <-> %d", th->gid, ctx->cid);
             zinfo->cores[ctx->cid]->contextSwitch(th->gid);
         }
 
@@ -581,26 +602,26 @@ class Scheduler : public GlobAlloc, public Callee {
             //TODO: we may need more callbacks in the cores, e.g. in schedule(). Revise interface as needed...
             zinfo->cores[ctx->cid]->contextSwitch(-1);
             zinfo->processStats->notifyDeschedule(ctx->cid, getPid(th->gid));
-            //info("Descheduled %d <-> %d", th->gid, ctx->cid);
+            DEBUG_SCHEDULER("Descheduled %d <-> %d", th->gid, ctx->cid);
         }
 
         void waitForContext(ThreadInfo* th) {
             th->futexWord = 1;
             waitEvents.inc();
-            //info("%d waiting to be scheduled", th->gid);
+            DEBUG_SCHEDULER("%d waiting to be scheduled", th->gid);
             //printState();
             futex_unlock(&schedLock);
             while (true) {
                 int futex_res = syscall(SYS_futex, &th->futexWord, FUTEX_WAIT, 1 /*a racing thread waking us up will change value to 0, and we won't block*/, nullptr, nullptr, 0);
                 if (futex_res == 0 || th->futexWord != 1) break;
             }
-            //info("%d out of sched wait, got cid = %d, needsJoin = %d", th->gid, th->cid, th->needsJoin);
+            DEBUG_SCHEDULER("%d out of sched wait, got cid = %d, needsJoin = %d", th->gid, th->cid, th->needsJoin);
             if (th->needsJoin) {
                 futex_lock(&schedLock);
                 assert(th->needsJoin); //re-check after the lock
                 zinfo->cores[th->cid]->join();
                 bar.join(th->cid, &schedLock);
-                //info("%d join done", th->gid);
+                DEBUG_SCHEDULER("%d join done", th->gid);
             }
         }
 

diff --git a/src/virt/timeout.cpp b/src/virt/timeout.cpp
@@ -84,25 +84,30 @@ static bool PrePatchTimeoutSyscall(uint32_t tid, CONTEXT* ctxt, SYSCALL_STANDARD
         int* uaddr = (int*) PIN_GetSyscallArgument(ctxt, std, 0);
         int op = (int) PIN_GetSyscallArgument(ctxt, std, 1);
         const struct timespec* timeout = (const struct timespec*) PIN_GetSyscallArgument(ctxt, std, 3);
+        uint64_t hostTimeoutNs = 0;
 
         //info("FUTEX op %d  waitOp %d uaddr %p ts %p", op, isFutexWaitOp(op), uaddr, timeout);
         if (!(uaddr && isFutexWaitOp(op) && timeout)) return false;  // not a timeout FUTEX_WAIT
 
-        waitNsec = timeout->tv_sec*1000000000L + timeout->tv_nsec;
+        waitNsec = timespecToNs(*timeout);
 
         if (op & FUTEX_CLOCK_REALTIME) {
             // NOTE: FUTEX_CLOCK_REALTIME is not a documented interface AFAIK, but looking at the Linux source code + with some verification, this is the xlat
+            struct timespec realtime;
             uint32_t domain = zinfo->procArray[procIdx]->getClockDomain();
             uint64_t simNs = cyclesToNs(zinfo->globPhaseCycles);
             uint64_t offsetNs = simNs + zinfo->clockDomainInfo[domain].realtimeOffsetNs;
-            //info(" REALTIME FUTEX: %ld %ld %ld %ld", waitNsec, simNs, offsetNs, waitNsec-offsetNs);
+            warn(" REALTIME FUTEX(%d): %ld %ld %ld %ld", op & FUTEX_CLOCK_REALTIME, waitNsec, simNs, offsetNs, waitNsec-offsetNs);
             waitNsec = (waitNsec > (int64_t)offsetNs)? (waitNsec - offsetNs) : 0;
+
+            clock_gettime(CLOCK_REALTIME, &realtime);
+            hostTimeoutNs = timespecToNs(realtime);
         }
 
         if (waitNsec <= 0) return false;  // while technically waiting, this does not block. I'm guessing this is done for trylocks? It's weird.
 
-        fakeTimeouts[tid].tv_sec = 0;
-        fakeTimeouts[tid].tv_nsec = 20*1000*1000;  // timeout every 20ms of actual host time
+        hostTimeoutNs += 20*1000*1000;  // timeout every 20ms of actual host time
+        fakeTimeouts[tid] = nsToTimespec(hostTimeoutNs);
         PIN_SetSyscallArgument(ctxt, std, 3, (ADDRINT)&fakeTimeouts[tid]);
     } else {
         assert(syscall == SYS_epoll_wait || syscall == SYS_epoll_pwait || syscall == SYS_poll);
@@ -149,18 +154,54 @@ static bool PostPatchTimeoutSyscall(uint32_t tid, CONTEXT* ctxt, SYSCALL_STANDAR
         retrySyscall = isSleeping;
     }
 
+    // Decide whether to retry when transitioning to FF
     if (retrySyscall && zinfo->procArray[procIdx]->isInFastForward()) {
-        warn("[%d] Fast-forwarding started, not retrying timeout syscall (%s)", tid, GetSyscallName(syscall));
-        retrySyscall = false;
         assert(isSleeping);
-        zinfo->sched->notifySleepEnd(procIdx, tid);
+        uint64_t waitPhasesToSleep = zinfo->sched->notifySleepEnd(procIdx, tid);
+        if (waitPhasesToSleep > 0) {
+            ADDRINT timeoutRemArgVal;
+            uint64_t waitCycles = waitPhasesToSleep * zinfo->phaseLength;
+            uint64_t waitNsec = waitCycles * 1000 / zinfo->freqMHz;
+
+            if (syscall == SYS_futex) {
+                int op = (int) PIN_GetSyscallArgument(ctxt, std, 1);
+                if (op & FUTEX_CLOCK_REALTIME) {
+                    struct timespec realtime;
+                    clock_gettime(CLOCK_REALTIME, &realtime);
+                    uint64_t offsetNs = timespecToNs(realtime);
+                    waitNsec += offsetNs;
+                    warn(" REALTIME FUTEX(%d) fast-forwarding retrial: %ld %ld %ld", op & FUTEX_CLOCK_REALTIME, waitNsec, offsetNs, waitNsec-offsetNs);
+                }
+                fakeTimeouts[tid] = nsToTimespec(waitNsec);
+                timeoutRemArgVal = (ADDRINT) & fakeTimeouts[tid];
+            } else {
+                assert(syscall == SYS_epoll_wait || syscall == SYS_epoll_pwait || syscall == SYS_poll);
+                timeoutRemArgVal = (ADDRINT) waitNsec / (1000 * 1000);
+            }
+            warn("[%d] Fast-forwarding started, retrying timeout syscall (%s)", tid, GetSyscallName(syscall));
+            PIN_SetSyscallArgument(ctxt, std, getTimeoutArg(syscall), timeoutRemArgVal);
+        } else {
+            warn("[%d] Fast-forwarding started, not retrying timeout syscall (%s)", tid, GetSyscallName(syscall));
+            retrySyscall = false;
+        }
     }
 
     if (retrySyscall) {
         // ADDRINT curIp = PIN_GetContextReg(ctxt, REG_INST_PTR);
         //info("[%d] post-patch, retrying, IP: 0x%lx -> 0x%lx", tid, curIp, prevIp);
         PIN_SetContextReg(ctxt, REG_INST_PTR, prevIp);
         PIN_SetSyscallNumber(ctxt, std, syscall);
+        if (syscall == SYS_futex) {
+            int op = (int) PIN_GetSyscallArgument(ctxt, std, 1);
+            if (op & FUTEX_CLOCK_REALTIME) {
+                struct timespec realtime;
+                uint64_t hostTimeoutNs;
+
+                clock_gettime(CLOCK_REALTIME, &realtime);
+                hostTimeoutNs = timespecToNs(realtime) + 20*1000*1000;  // timeout every 20ms of actual host time
+                fakeTimeouts[tid] = nsToTimespec(hostTimeoutNs);
+            }
+        }
     } else {
         // Restore timeout arg
         PIN_SetSyscallArgument(ctxt, std, getTimeoutArg(syscall), timeoutArgVal);