Skip to content

Commit

Permalink
Clean up LWP stopping.
Browse files Browse the repository at this point in the history
Only live processes can have threads stopped, but all process types have
disjoint mechanisms for discovering LWPs. Use a new "listLWPs" to list
all the LWPs we can use for backtracing, but for live processes,
maintain the set of LWPs we were able to stop for this operation, while
moving the discovery code (walking /proc/<pid>/task) into stopProcess.

This makes the Lwp collection in the base Process type unused except for
LiveProcess, so move it there.

Also, iterate multiple times over /proc/<pid>/task, in case new tasks
are created as we are suspending the existing set, and improve error
handling.
  • Loading branch information
peadar committed Feb 26, 2024
1 parent 83250ae commit abbca2c
Show file tree
Hide file tree
Showing 6 changed files with 131 additions and 109 deletions.
6 changes: 5 additions & 1 deletion dead.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,18 @@ CoreProcess::CoreProcess(Elf::Object::sptr exec, Elf::Object::sptr core,
if (note.name() == "CORE" && note.type() == NT_PRSTATUS) {
tasks.push_back( note.data()->readObj<prstatus_t>(0) );
prstatus_t &task = tasks.back();
(void)lwps[task.pr_pid];
if (verbose)
*debug << "task " << task.pr_pid << " current sig is " << task.pr_cursig << "\n";
}
#endif
}
}

void CoreProcess::listLWPs(std::function<void(lwpid_t)> cb) {
for (auto &task : tasks)
cb(task.pr_pid);
}

Reader::csptr
CoreProcess::getAUXV() const
{
Expand Down
7 changes: 1 addition & 6 deletions elf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -166,13 +166,8 @@ GnuHash::findSymbol(const char *name) const {
for (;;) {
auto sym = syms->readObj<Sym>(idx * sizeof (Sym));
auto chainhash = hash->readObj<uint32_t>(chainoff(idx - header.symoffset));
if ((chainhash | 1) == (symhash | 1)) {
if (strings->readString(sym.st_name) == name) {
if (verbose >= 2)
*debug << "found '" << name << "' using GNU hash\n";
if ((chainhash | 1) == (symhash | 1) && strings->readString(sym.st_name) == name)
return std::make_pair(idx, sym);
}
}
if (chainhash & 1) {
if (verbose >= 2)
*debug << "failed to find '" << name << "' hit end of hash chain\n";
Expand Down
26 changes: 11 additions & 15 deletions libpstack/proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,18 +178,6 @@ struct ThreadStack {
void unwind(Process &, Elf::CoreRegisters &regs);
};

/*
 * Book-keeping for a single LWP. On linux, since NPTL, an LWP is essentially
 * a thread; old style, userland threads may share a single LWP between all
 * threads.
 */
struct Lwp {
    int stopCount;      // bumped by each stop(), dropped by each resume().
    int ptraceErr;      // 0 if ptrace worked, otherwise, errno.
    timeval stoppedAt;  // time recorded when the LWP was first stopped.

    // A fresh entry is not stopped, has no error, and a zeroed timestamp.
    Lwp()
        : stopCount(0)
        , ptraceErr(0)
        , stoppedAt()
    {}
};

struct PrintableFrame;

struct DevNode {
Expand Down Expand Up @@ -234,7 +222,6 @@ class Process : public ps_prochandle {
public:
PstackOptions options;
Elf::Addr sysent; // for AT_SYSINFO
std::map<pid_t, Lwp> lwps;
Dwarf::ImageCache &imageCache;
std::map<Elf::Addr, Elf::Object::sptr> objects;
virtual Reader::csptr getAUXV() const = 0;
Expand All @@ -254,6 +241,7 @@ class Process : public ps_prochandle {
std::ostream &dumpStackText(std::ostream &, const ThreadStack &) const;
std::ostream &dumpFrameText(std::ostream &, const StackFrame &, int) const;
template <typename T> void listThreads(const T &);
virtual void listLWPs(std::function<void(lwpid_t)>) {};


// find address of named symbol in the process.
Expand Down Expand Up @@ -298,17 +286,24 @@ std::string procname(pid_t pid, const std::string &);
struct LiveThreadList;
class LiveProcess final : public Process {
pid_t pid;

// State kept for each LWP that stop() has been asked to suspend.
struct Lwp {
    int stopCount{0};     // bumped by each stop(), dropped by each resume().
    int ptraceErr{0};     // 0 if ptrace worked, otherwise, errno.
    timeval stoppedAt{};  // zeroed until stop() records the time of the first stop.
};
std::map<pid_t, Lwp> stoppedLWPs;
public:
// attach to existing process.
LiveProcess(Elf::Object::sptr &, pid_t, const PstackOptions &, Dwarf::ImageCache &, bool alreadyStopped=false);

void listLWPs(std::function<void(lwpid_t)>) override;
virtual bool getRegs(lwpid_t pid, Elf::CoreRegisters *reg) override;
virtual void stop(pid_t) override;
virtual void resume(pid_t) override;
void stopProcess() override;
void resumeProcess() override;
virtual Reader::csptr getAUXV() const override;
void findLWPs();
virtual pid_t getPID() const override;
protected:
bool loadSharedObjectsFromFileNote() override;
Expand All @@ -321,7 +316,7 @@ class SelfProcess : public Process {
public:
// attach to existing process.
SelfProcess(const Elf::Object::sptr &, const PstackOptions &, Dwarf::ImageCache &);

void listLWPs(std::function<void(lwpid_t)>) override;
virtual bool getRegs(lwpid_t pid, Elf::CoreRegisters *reg) override;
virtual void stop(pid_t) override;
virtual void resume(pid_t) override;
Expand Down Expand Up @@ -360,6 +355,7 @@ class CoreProcess final : public Process {
void resumeProcess() override { }
virtual Reader::csptr getAUXV() const override;
virtual pid_t getPID() const override;
void listLWPs(std::function<void(lwpid_t)>) override;
protected:
std::vector<prstatus_t> tasks;
bool loadSharedObjectsFromFileNote() override;
Expand Down
186 changes: 104 additions & 82 deletions live.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ LiveProcess::LiveProcess(Elf::Object::sptr &ex, pid_t pid_,
{
(void)ps_getpid(this);
if (alreadyStopped)
lwps[pid].stopCount = 1;
stoppedLWPs[pid].stopCount = 1;
}

Reader::csptr LiveProcess::getAUXV() const {
Expand All @@ -48,69 +48,62 @@ Reader::csptr LiveProcess::getAUXV() const {


bool
LiveProcess::getRegs(lwpid_t pid, Elf::CoreRegisters *reg)
LiveProcess::getRegs(lwpid_t lwpid, Elf::CoreRegisters *reg)
{
#ifdef __FreeBSD__
int rc;
rc = ptrace(PT_GETREGS, pid, (caddr_t)reg, 0);
rc = ptrace(PT_GETREGS, lwpid, (caddr_t)reg, 0);
if (rc == -1) {
warn("failed to trace LWP %d", (int)pid);
warn("failed to trace LWP %d", (int)lwpid);
return false;
}
return true;
#endif
#ifdef __linux__
stop(pid);
stop(lwpid);
iovec iov;
iov.iov_base = reg;
iov.iov_len = sizeof *reg;
int rc = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov);
resume(pid);
int rc = ptrace(PTRACE_GETREGSET, lwpid, NT_PRSTATUS, &iov);
resume(lwpid);
return rc == 0;
#endif
}

void
LiveProcess::resume(lwpid_t pid)
{
auto &tcb = lwps[pid];
assert(tcb.stopCount != 0); // We can't resume an LWP that is not suspended.
if (--tcb.stopCount != 0)
return;
if (tcb.ptraceErr != 0) {
if (verbose)
*debug << "not attempting to resume lwp " << pid << ", as it failed to stop\n";
return;
}
if (ptrace(PT_DETACH, pid, caddr_t(1), 0) != 0)
std::clog << "failed to detach from process " << pid << ": " << strerror(errno) << "\n";
dynamic_cast<CacheReader&>(*io).flush();
if (verbose >= 1) {
timeval tv;
gettimeofday(&tv, nullptr);
intmax_t usecs = (tv.tv_sec - tcb.stoppedAt.tv_sec) * 1000000;
usecs += tv.tv_usec;
usecs -= tcb.stoppedAt.tv_usec;
*debug << "resumed LWP " << pid << ": was stopped for " << std::dec <<
usecs << " microseconds" << std::endl;
}
LiveProcess::resume(lwpid_t lwpid) {
// Undo one earlier stop() of lwpid. The LWP is only detached once its stop
// count drops back to zero; an LWP with no entry in stoppedLWPs is ignored.
auto tcbi = stoppedLWPs.find(lwpid);
if (tcbi == stoppedLWPs.end())
return;
auto &tcb = tcbi->second;
assert(tcb.stopCount != 0); // We can't resume an LWP that is not suspended.
// Nested stops: only the outermost resume actually releases the LWP.
if (--tcb.stopCount != 0)
return;
// If the attach failed in stop(), there is nothing to detach from.
if (tcb.ptraceErr != 0) {
if (verbose)
*debug << "not attempting to resume lwp " << lwpid << ", as it failed to stop\n";
return;
}
// A failed detach is reported on std::clog, and the remaining steps still run.
if (ptrace(PT_DETACH, lwpid, caddr_t(1), 0) != 0)
std::clog << "failed to detach from process " << lwpid << ": " << strerror(errno) << "\n";
// NOTE(review): presumably drops cached target memory now that the LWP can
// run again - confirm against CacheReader. The reference cast throws if io
// is not a CacheReader.
dynamic_cast<CacheReader&>(*io).flush();
if (verbose >= 1) {
// Report how long the LWP was held, measured from tcb.stoppedAt.
timeval tv;
gettimeofday(&tv, nullptr);
intmax_t usecs = (tv.tv_sec - tcb.stoppedAt.tv_sec) * 1000000;
usecs += tv.tv_usec;
usecs -= tcb.stoppedAt.tv_usec;
*debug << "resumed LWP " << lwpid << ": was stopped for " << std::dec <<
usecs << " microseconds" << std::endl;
}
}

void
LiveProcess::findLWPs()
LiveProcess::listLWPs(std::function<void(lwpid_t)> cb)
{
std::string dirName = procname(pid, "task");
DIR *d = opendir(dirName.c_str());
dirent *de;
if (d != nullptr) {
while ((de = readdir(d)) != nullptr) {
char *p;
lwpid_t pid = strtol(de->d_name, &p, 0);
if (*p == 0)
(void)lwps[pid];
}
closedir(d);
}
for (auto &lwp : stoppedLWPs)
if (lwp.second.ptraceErr == 0)
cb(lwp.first);
}

pid_t
Expand All @@ -122,17 +115,43 @@ LiveProcess::getPID() const
void
LiveProcess::stopProcess()
{
stop(pid); // suspend the main process itself first.
findLWPs();
// suspend the main process itself first.
// XXX: Note this can actually fail if the main thread exits before the
// remaining tasks. Other things also fail in that case - eg, opening
// stuff from /proc/pid/fd, etc. Really those operations should use
// /proc/<pid>/task/<tid> of a task we have suspended, rather than the main
// process
std::set<lwpid_t> suspended;
stop(pid);
suspended.insert(pid);

/*
* Stop all LWPs/kernel tasks. Do this before we stop the threads. Stopping the
* threads with thread_db actually just returns an error in linux, but
* stopping everything here ensures that we are not racing the process
* threads to read the thread list later.
* Stop all remaining LWPs/kernel tasks. Do this before we stop the
* threads. Stopping the threads with thread_db actually just returns an
* error in linux, but stopping everything here ensures that we are not
* racing the process threads to read the thread list later.
*/
for (auto &lwp : lwps)
stop(lwp.first);
/*
 * Walk /proc/<pid>/task repeatedly, stopping every task not yet in
 * "suspended". Tasks may be created while we are busy stopping the ones we
 * already know about, so keep rescanning until a complete pass discovers
 * nothing new.
 */
size_t lastStopCount;
do {
    lastStopCount = suspended.size();
    std::string dirName = procname(pid, "task");
    DIR *d = opendir(dirName.c_str());
    if (d != nullptr) {
        for (dirent *de; (de = readdir(d)) != nullptr; ) {
            // Only names that parse completely as a number are task ids;
            // this skips "." and "..".
            char *p;
            lwpid_t tid = strtol(de->d_name, &p, 0);
            if (*p == 0) {
                auto [_, isnew ] = suspended.insert(tid);
                if (isnew)
                    stop(tid);
            }
        }
        // Close only a stream that was really opened: handing a null DIR* to
        // closedir() is not valid.
        closedir(d);
    } else if (verbose) {
        // The directory is unreadable (eg, the process has gone away). No new
        // tasks are found, so the loop terminates after this pass.
        *debug << "failed to open " << dirName << ": " << strerror(errno) << "\n";
    }
    // if we found any threads, log it as debug. If we went around more than once, always log.
    if (lastStopCount != suspended.size() && (verbose >= 2 || lastStopCount != 1))
        *debug << "found " << suspended.size() - lastStopCount << " new LWPs after first " << lastStopCount << "\n";
} while (lastStopCount != suspended.size());

/*
* Attempt to enumerate the threads and suspend with pthread_db. This will
Expand All @@ -141,13 +160,10 @@ LiveProcess::stopProcess()
listThreads([this] (const td_thrhandle_t *thr) {
td_thrinfo_t info;
td_thr_get_info(thr, &info);
(void)lwps[info.ti_lid]; // make sure we have the LWP
if (td_thr_dbsuspend(thr) == TD_NOCAPAB) {
if (verbose >= 3)
*debug << "can't suspend thread " << thr
<< ": will suspend it's LWP " << info.ti_lid << "\n";
}
});
int suspendError = td_thr_dbsuspend(thr);
if (suspendError != 0 && suspendError != TD_NOCAPAB)
*debug << "can't suspend thread " << thr << ": will suspend it's LWP " << info.ti_lid << "\n";
});

if (verbose >= 2)
*debug << "stopped process " << pid << "\n";
Expand All @@ -161,36 +177,42 @@ LiveProcess::resumeProcess()
// this doesn't work in general, but it's ok, we'll suspend the LWP
if (verbose >= 3)
*debug << "can't resume thread " << thr << " (will resume it's LWP)\n";
}
});
}});

for (auto &lwp : lwps)
for (auto &lwp : stoppedLWPs)
resume(lwp.first);

resume(pid);
/*
 * C++17: remove all LWPs that are now fully resumed (stop count back at
 * zero). map::erase returns the iterator following the erased element, so
 * the iterator is only advanced by hand when the entry is kept.
 */
for (auto it = stoppedLWPs.begin(); it != stoppedLWPs.end(); )
if (it->second.stopCount == 0)
it = stoppedLWPs.erase(it);
else
++it;
/* C++20 equivalent (map elements are key/value pairs, hence ".second"):
std::erase_if(stoppedLWPs, [](auto &&entry) { return entry.second.stopCount == 0; } );
*/
}

void
LiveProcess::stop(lwpid_t pid)
{
auto &tcb = lwps[pid];
if (tcb.stopCount++ != 0)
return;

gettimeofday(&tcb.stoppedAt, nullptr);
if (ptrace(PT_ATTACH, pid, 0, 0) != 0) {
tcb.ptraceErr = errno;
*debug << "failed to stop LWP " << pid << ": ptrace failed: " << strerror(errno) << "\n";
return;
}
tcb.ptraceErr = 0;

int status;
pid_t waitedpid = waitpid(pid, &status, pid == this->pid ? 0 : __WCLONE);
if (waitedpid == -1)
*debug << "failed to stop LWP " << pid << ": wait failed: " << strerror(errno) << "\n";
else if (verbose >= 1)
*debug << "suspend LWP " << pid << std::endl;
LiveProcess::stop(lwpid_t tid) {
// Suspend one LWP by attaching to it with ptrace. Stops nest: each call bumps
// the LWP's stop count, and only the first call actually attaches.
// operator[] creates the stoppedLWPs entry if this LWP has not been seen before.
auto &tcb = stoppedLWPs[tid];
if (tcb.stopCount++ != 0)
return;

// Remember when the stop started; resume() uses this to report the duration.
gettimeofday(&tcb.stoppedAt, nullptr);
if (ptrace(PT_ATTACH, tid, 0, 0) != 0) {
// Record the failure so resume() will not try to detach, and so that
// listLWPs() leaves this LWP out.
tcb.ptraceErr = errno;
*debug << "failed to stop LWP " << tid << ": ptrace failed: " << strerror(errno) << "\n";
return;
}
tcb.ptraceErr = 0;

// Wait for the attach to take effect. Tasks other than the main process are
// waited for with __WCLONE.
// Note: a failed wait is only logged; ptraceErr stays 0, so the LWP is still
// treated as stopped.
int status;
pid_t waitedpid = waitpid(tid, &status, tid == this->pid ? 0 : __WCLONE);
if (waitedpid == -1)
*debug << "failed to stop LWP " << tid << ": wait failed: " << strerror(errno) << "\n";
else if (verbose >= 1)
*debug << "suspend LWP " << tid << std::endl;
}

std::vector<AddressRange>
Expand Down
10 changes: 5 additions & 5 deletions process.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1095,15 +1095,15 @@ Process::getStacks() {
* There are no extant linux systems that I'm aware of that use a non-1:1
* thread model, so we can't really test this.
*/
for (auto &lwp : lwps) {
if (tracedLwps.find(lwp.first) == tracedLwps.end()) {
listLWPs([this, &threadStacks, &tracedLwps](lwpid_t lwpid) {
if (tracedLwps.find(lwpid) == tracedLwps.end()) {
threadStacks.push_back(ThreadStack());
threadStacks.back().info.ti_lid = lwp.first;
threadStacks.back().info.ti_lid = lwpid;
Elf::CoreRegisters regs;
getRegs(lwp.first, &regs);
getRegs(lwpid, &regs);
threadStacks.back().unwind(*this, regs);
}
}
});

/*
* if we don't need to print arguments to functions, we now have the full
Expand Down
5 changes: 5 additions & 0 deletions self.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ SelfProcess::getAUXV() const
return loadFile("/proc/self/auxv");
}

void
SelfProcess::listLWPs(std::function<void(lwpid_t)> cb) {
cb(gettid());
}

bool
SelfProcess::getRegs(lwpid_t, Elf::CoreRegisters *reg) // for now, we just support the current thread.
{
Expand Down

0 comments on commit abbca2c

Please sign in to comment.