Skip to content

Commit

Permalink
Fix deadlock related to safepoint sync.
Browse files Browse the repository at this point in the history
`gc_lock` now never does safepoint check.  We always explicitly enter
safepoint via `ThreadBlockInVM`.

The VM companion thread now lets `VM_MMTkSTWOperation` (executed by the
VM thread) transition the state to `_threads_resumed` instead of doing
it after `VMThread::execute` returns.  This works around a problem where
`VMThread::execute` continues to block the companion thread when the VM
thread starts executing other VM operations.

Some minor style changes:

We consistently use the constant `Mutex::_no_safepoint_check_flag`
instead of `true` when acquiring or waiting for the `Monitor`.

Slightly updated comments and logs.  Logging statements no longer
include thread IDs because thread IDs can be turned on by a command line
option.
  • Loading branch information
wks committed May 27, 2024
1 parent b9e3aad commit 4dd2a57
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 53 deletions.
2 changes: 1 addition & 1 deletion openjdk/mmtkHeap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ MMTkHeap::MMTkHeap(MMTkCollectorPolicy* policy) :
_collector_policy(policy),
_num_root_scan_tasks(0),
_n_workers(0),
_gc_lock(new Monitor(Mutex::safepoint, "MMTkHeap::_gc_lock", true, Monitor::_safepoint_check_sometimes)),
_gc_lock(new Monitor(Mutex::safepoint, "MMTkHeap::_gc_lock", true, Monitor::_safepoint_check_never)),
_soft_ref_policy()
{
_heap = this;
Expand Down
45 changes: 26 additions & 19 deletions openjdk/mmtkUpcalls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,19 +74,19 @@ static void mmtk_resume_mutators(void *tls) {
#endif

// Note: we don't have to hold gc_lock to increment the counter.
// The increment has to be done before mutators can be resumed
// otherwise, mutators might see a stale value
// The increment has to be done before mutators can be resumed (from `block_for_gc` or yieldpoints).
// Otherwise, mutators might see an outdated start-the-world count.
Atomic::inc(&mmtk_start_the_world_count);
log_debug(gc)("Incremented start_the_world counter to %zu.", Atomic::load(&mmtk_start_the_world_count));

log_debug(gc)("Requesting the VM to resume all mutators...");
log_debug(gc)("Requesting the companion thread to resume all mutators blocking on yieldpoints...");
MMTkHeap::heap()->companion_thread()->request(MMTkVMCompanionThread::_threads_resumed, true);
log_debug(gc)("Mutators resumed. Now notify any mutators waiting for GC to finish...");

log_debug(gc)("Notifying mutators blocking on the start-the-world counter...");
{
MutexLockerEx locker(MMTkHeap::heap()->gc_lock(), true);
MutexLockerEx locker(MMTkHeap::heap()->gc_lock(), Mutex::_no_safepoint_check_flag);
MMTkHeap::heap()->gc_lock()->notify_all();
}
log_debug(gc)("Mutators notified.");
}

static const int GC_THREAD_KIND_WORKER = 1;
Expand All @@ -111,30 +111,37 @@ static void mmtk_spawn_gc_thread(void* tls, int kind, void* ctx) {

static void mmtk_block_for_gc() {
MMTkHeap::heap()->_last_gc_time = os::javaTimeNanos() / NANOSECS_PER_MILLISEC;
log_debug(gc)("Thread (id=%d) will block waiting for GC to finish.", Thread::current()->osthread()->thread_id());

// We must read the counter before entering safepoint.
// This thread has just triggered GC.
// Before this thread enters safe point, the GC cannot start, and therefore cannot finish,
// and cannot increment the counter mmtk_start_the_world_count.
// Otherwise, if we attempt to acquire the gc_lock first, GC may have triggered stop-the-world
// first, and this thread will be blocked for the entire stop-the-world duration before it can
// get the lock. Once that happens, the current thread will read the counter after the GC, and
// wait for the next non-existing GC forever.
// This thread (or another mutator) has just triggered GC.
// The GC cannot start until all mutators enter safepoint.
// If the GC cannot start, it cannot finish, and cannot increment mmtk_start_the_world_count.
// Otherwise, if we enter safepoint before reading mmtk_start_the_world_count,
// we will allow the GC to start before we read the counter,
// and the GC workers may run so fast that they have finished one whole GC and incremented the
// counter before this mutator reads the counter for the first time.
// Once that happens, the current mutator will wait for the next GC forever,
// which may not happen at all before the program exits.
size_t my_count = Atomic::load(&mmtk_start_the_world_count);
size_t next_count = my_count + 1;

log_debug(gc)("Will block until the start_the_world counter reaches %zu.", next_count);

{
// Once this thread acquires the lock, the VM will consider this thread to be "in safe point".
MutexLocker locker(MMTkHeap::heap()->gc_lock());
// Enter safepoint.
JavaThread* thread = JavaThread::current();
ThreadBlockInVM tbivm(thread);

// No safepoint check. We are already in safepoint.
MutexLockerEx locker(MMTkHeap::heap()->gc_lock(), Mutex::_no_safepoint_check_flag);

while (Atomic::load(&mmtk_start_the_world_count) < next_count) {
// wait() may wake up spuriously, but the authoritative condition for unblocking is
// mmtk_start_the_world_count being incremented.
MMTkHeap::heap()->gc_lock()->wait();
MMTkHeap::heap()->gc_lock()->wait(Mutex::_no_safepoint_check_flag);
}
}
log_debug(gc)("Thread (id=%d) resumed after GC finished.", Thread::current()->osthread()->thread_id());
log_debug(gc)("Resumed after GC finished.");
}

static void mmtk_out_of_memory(void* tls, MMTkAllocationError err_kind) {
Expand Down Expand Up @@ -167,7 +174,7 @@ static bool mmtk_is_mutator(void* tls) {

static void mmtk_get_mutators(MutatorClosure closure) {
JavaThread *thr;
for (JavaThreadIteratorWithHandle jtiwh; thr = jtiwh.next();) {
for (JavaThreadIteratorWithHandle jtiwh; (thr = jtiwh.next());) {
closure.invoke(&thr->third_party_heap_mutator);
}
}
Expand Down
93 changes: 62 additions & 31 deletions openjdk/mmtkVMCompanionThread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ void MMTkVMCompanionThread::run() {
MutexLockerEx locker(_lock, Mutex::_no_safepoint_check_flag);
assert(_reached_state == _threads_resumed, "Threads should be running at this moment.");
while (_desired_state != _threads_suspended) {
_lock->wait(true);
_lock->wait(Mutex::_no_safepoint_check_flag);
}
assert(_reached_state == _threads_resumed, "Threads should still be running at this moment.");
}
Expand All @@ -63,25 +63,9 @@ void MMTkVMCompanionThread::run() {
VM_MMTkSTWOperation op(this);
// VMThread::execute() is blocking. The companion thread will be blocked
// here waiting for the VM thread to execute op, and the VM thread will
// be blocked in reach_suspended_and_wait_for_resume() until a GC thread
// be blocked in do_mmtk_stw_operation() until a GC thread
// calls request(_threads_resumed).
VMThread::execute(&op);

// Tell the waiter thread that the world has resumed.
log_trace(gc)("MMTkVMCompanionThread: Notifying threads resumption...");
{
MutexLockerEx locker(_lock, Mutex::_no_safepoint_check_flag);
assert(_desired_state == _threads_resumed, "start-the-world should be requested.");
assert(_reached_state == _threads_suspended, "Threads should still be suspended at this moment.");
_reached_state = _threads_resumed;
_lock->notify_all();
}
{
MutexLocker x(Heap_lock);
if (Universe::has_reference_pending_list()) {
Heap_lock->notify_all();
}
}
}
}

Expand All @@ -107,7 +91,7 @@ void MMTkVMCompanionThread::request(stw_state desired_state, bool wait_until_rea

if (wait_until_reached) {
while (_reached_state != desired_state) {
_lock->wait(true);
_lock->wait(Mutex::_no_safepoint_check_flag);
}
}
}
Expand All @@ -123,23 +107,70 @@ void MMTkVMCompanionThread::wait_for_reached(stw_state desired_state) {
assert(_desired_state == desired_state, "State %d not requested.", desired_state);

while (_reached_state != desired_state) {
_lock->wait(true);
_lock->wait(Mutex::_no_safepoint_check_flag);
}
}

// Called by the VM thread to indicate that all Java threads have stopped.
// This method will block until the GC requests start-the-world.
void MMTkVMCompanionThread::reach_suspended_and_wait_for_resume() {
assert(Thread::current()->is_VM_thread(), "reach_suspended_and_wait_for_resume can only be executed by the VM thread");
// Called by the VM thread in `VM_MMTkSTWOperation`.
// This method notify that all Java threads have yielded, and will block the VM thread (thereby
// blocking Java threads) until the GC requests start-the-world.
void MMTkVMCompanionThread::do_mmtk_stw_operation() {
assert(Thread::current()->is_VM_thread(), "do_mmtk_stw_operation can only be executed by the VM thread");

MutexLockerEx locker(_lock, Mutex::_no_safepoint_check_flag);
{
MutexLockerEx locker(_lock, Mutex::_no_safepoint_check_flag);

// Tell the waiter thread that the world has stopped.
_reached_state = _threads_suspended;
_lock->notify_all();
// Tell the waiter thread that Java threads have stopped at yieldpoints.
_reached_state = _threads_suspended;
log_trace(gc)("do_mmtk_stw_operation: Reached _thread_suspended state. Notifying...");
_lock->notify_all();

// Wait until resume-the-world is requested
while (_desired_state != _threads_resumed) {
_lock->wait(Mutex::_no_safepoint_check_flag);
}

// Wait until resume-the-world is requested
while (_desired_state != _threads_resumed) {
_lock->wait(true);
// Tell the waiter thread that Java threads will eventually resume from yieldpoints. This
// function will return, and, as soon as the VM thread stops executing safepoint VM operations,
// Java threads will resume from yieldpoints.
//
// Note: We have to notify *now* instead of after `VMThread::execute()`. For reasons unknown
// (likely a bug in OpenJDK 11), the VMThread fails to notify the companion thread after
// evaluating `VM_MMTkSTWOperation`, and continues to execute other VM operations (such as
// `RevokeBias`). This leaves the companion thread blocking on `VMThread::execute()` until the
// VM thread finishes executing the next batch of queued VM operations. If we notify after
// `VMThread::execute` in `run()`, it will cause a deadlock like the following:
//
// - The companion thread is blocked at `VMThread::execute()`, waiting for the next batch of VM
// operations to finish.
// - The VM thread is blocked in `SafepointSynchronize::begin()`, waiting for all mutators to
// reach safepoints.
// - One mutator is allocating too fast and triggers a GC, which requires the `WorkerMonitor`
// lock in mmtk-core.
// - A GC worker is still executing `mmtk_resume_mutator`, holding the `WorkerMutator` (as the
// last parked GC worker). It is asking the companion thread to resume mutators, and is still
// waiting for the companion thread to reach the `_thread_resumed` state. As we see before,
// the companion thread is waiting, too.
//
// By notifying now, we let the companion thread stop waiting, and therefore allowing the last
// parked GC worker to finish `resume_mutators`, breaking the deadlock. When the next GC
// starts, the GC worker running `mmtk_stop_all_mutators` will need to wait a little longer (as
// it always should) until the VM thread finishes executing other VM operations and the
// companion thread is ready to respond to another request from GC workers.
//
// Also note that OpenJDK 17 changed the way the VM thread executes VM operations. The same
// problem may not manifest in OpenJDK 17 or 21.
assert(_desired_state == _threads_resumed, "start-the-world should be requested.");
assert(_reached_state == _threads_suspended, "Threads should still be suspended at this moment.");
_reached_state = _threads_resumed;
log_trace(gc)("do_mmtk_stw_operation: Reached _thread_resumed state. Notifying...");
_lock->notify_all();
}

{
MutexLockerEx x(Heap_lock, Mutex::_no_safepoint_check_flag);
if (Universe::has_reference_pending_list()) {
Heap_lock->notify_all();
}
}
}
2 changes: 1 addition & 1 deletion openjdk/mmtkVMCompanionThread.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class MMTkVMCompanionThread: public NamedThread {
void wait_for_reached(stw_state reached_state);

// Interface for the VM_MMTkSTWOperation
void reach_suspended_and_wait_for_resume();
void do_mmtk_stw_operation();
};

#endif // MMTK_OPENJDK_MMTK_VM_COMPANION_THREAD_HPP
2 changes: 1 addition & 1 deletion openjdk/mmtkVMOperation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,6 @@ VM_MMTkSTWOperation::VM_MMTkSTWOperation(MMTkVMCompanionThread *companion_thread

void VM_MMTkSTWOperation::doit() {
log_trace(vmthread)("Entered VM_MMTkSTWOperation::doit().");
_companion_thread->reach_suspended_and_wait_for_resume();
_companion_thread->do_mmtk_stw_operation();
log_trace(vmthread)("Leaving VM_MMTkSTWOperation::doit()");
}

0 comments on commit 4dd2a57

Please sign in to comment.