From d971d7e57687e51071583cbcb2acaabf8cdb6074 Mon Sep 17 00:00:00 2001
From: Nicolas Geoffray
Date: Thu, 19 Aug 2021 13:55:30 +0100
Subject: Fix missing MsToNs in profile saver.

Test: test.py
Bug: 194880260
Ignore-AOSP-First: cherry pick
Change-Id: I30b5608ef891805754986a0e00207e017280a97e
Merged-In: I30b5608ef891805754986a0e00207e017280a97e
(cherry picked from commit 3bc03531d1ae5cbe5b0a7ac34c10d5c51f20f233)
(cherry picked from commit e806799b94606c0d108de1f3e9f440e0a908a197)
---
 runtime/jit/profile_saver.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runtime/jit/profile_saver.cc b/runtime/jit/profile_saver.cc
index 425eadca41..b86badcba5 100644
--- a/runtime/jit/profile_saver.cc
+++ b/runtime/jit/profile_saver.cc
@@ -189,7 +189,7 @@ void ProfileSaver::Run() {
     // We might have been woken up by a huge number of notifications to guarantee saving.
     // If we didn't meet the minimum saving period go back to sleep (only if missed by
     // a reasonable margin).
-    uint64_t min_save_period_ns = options_.GetMinSavePeriodMs();
+    uint64_t min_save_period_ns = MsToNs(options_.GetMinSavePeriodMs());
    while (min_save_period_ns * 0.9 > sleep_time) {
      {
        MutexLock mu(self, wait_lock_);
--
cgit v1.2.3
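
The one-line fix above addresses a unit mismatch: GetMinSavePeriodMs()
returns milliseconds, while sleep_time is measured in nanoseconds, so the
"go back to sleep if we woke too early" guard compared a value a million
times too small and essentially never fired. A minimal standalone sketch of
the failure mode; MsToNs here is a hand-written stand-in for the ART helper,
and the 40s period is an arbitrary example value:

    #include <cstdint>
    #include <iostream>

    // Hand-written stand-in for ART's MsToNs helper.
    constexpr uint64_t MsToNs(uint64_t ms) { return ms * 1000000ull; }

    int main() {
      const uint64_t min_save_period_ms = 40 * 1000;   // e.g. a 40s minimum period
      const uint64_t sleep_time = 5ull * 1000000000;   // woke after only 5s, in ns

      uint64_t broken = min_save_period_ms;            // ms count used as if it were ns
      uint64_t fixed = MsToNs(min_save_period_ms);     // actual ns count

      std::cout << (broken * 0.9 > sleep_time) << '\n';  // 0: never goes back to sleep
      std::cout << (fixed * 0.9 > sleep_time) << '\n';   // 1: re-sleeps as intended
    }
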
From 1363c1bf72f25f088b1f83250e91a47fc8cc7173 Mon Sep 17 00:00:00 2001
From: Hans Boehm
Date: Fri, 21 May 2021 09:23:38 -0700
Subject: Improve suspension timeout diagnostic and fix race

Fix a data race on state_and_flags. Since the access was volatile and
there are system calls in the loop, this is extremely unlikely to have
caused the bug here, but ...

So, assuming this is still broken, produce more informative output once
we time out.

Remove unused argument from SuspendThreadByPeer(). It made the logic
more complicated and made it harder to reason about correctness.

Remove dead code after LOG(FATAL, ...)

Bug: 181778559
Test: TreeHugger, temporarily paste log message into hotter path.
Merged-In: I6f3455925b3a3f4726a870150aeb54ea60a38d67
(cherry picked from commit 9d27fbc8ced914f4726187920a7794b07eca3e71)
Change-Id: Ia3f04153fb0a4f1b899fb0f68a6121728f89cb91
(cherry picked from commit 116203735734738cbfdffc2163b08b1707089f9c)
---
 openjdkjvm/OpenjdkJvm.cc                |  1 -
 openjdkjvmti/ti_thread.cc               |  1 -
 runtime/native/dalvik_system_VMStack.cc |  1 -
 runtime/native/java_lang_Thread.cc      |  1 -
 runtime/thread.cc                       |  6 +++++
 runtime/thread.h                        |  5 +++-
 runtime/thread_list.cc                  | 30 ++++++++++++----------
 runtime/thread_list.h                   |  5 +---
 .../stack_walk_concurrent.cc            |  2 +-
 9 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/openjdkjvm/OpenjdkJvm.cc b/openjdkjvm/OpenjdkJvm.cc
index d64086d43c..9b514af452 100644
--- a/openjdkjvm/OpenjdkJvm.cc
+++ b/openjdkjvm/OpenjdkJvm.cc
@@ -423,7 +423,6 @@ JNIEXPORT void JVM_SetNativeThreadName(JNIEnv* env, jobject jthread, jstring jav
   art::Thread* thread;
   {
     thread = thread_list->SuspendThreadByPeer(jthread,
-                                              true,
                                               art::SuspendReason::kInternal,
                                               &timed_out);
   }
diff --git a/openjdkjvmti/ti_thread.cc b/openjdkjvmti/ti_thread.cc
index bb8fa3b694..a9a6ee8895 100644
--- a/openjdkjvmti/ti_thread.cc
+++ b/openjdkjvmti/ti_thread.cc
@@ -898,7 +898,6 @@ jvmtiError ThreadUtil::SuspendOther(art::Thread* self,
     bool timeout = true;
     art::Thread* ret_target = art::Runtime::Current()->GetThreadList()->SuspendThreadByPeer(
         target_jthread,
-        /* request_suspension= */ true,
        art::SuspendReason::kForUserCode,
        &timeout);
     if (ret_target == nullptr && !timeout) {
diff --git a/runtime/native/dalvik_system_VMStack.cc b/runtime/native/dalvik_system_VMStack.cc
index 9d2dfac069..e88516e248 100644
--- a/runtime/native/dalvik_system_VMStack.cc
+++ b/runtime/native/dalvik_system_VMStack.cc
@@ -59,7 +59,6 @@ static ResultT GetThreadStack(const ScopedFastNativeObjectAccess& soa,
     ThreadList* thread_list = Runtime::Current()->GetThreadList();
     bool timed_out;
     Thread* thread = thread_list->SuspendThreadByPeer(peer,
-                                                      /* request_suspension= */ true,
                                                       SuspendReason::kInternal,
                                                       &timed_out);
     if (thread != nullptr) {
diff --git a/runtime/native/java_lang_Thread.cc b/runtime/native/java_lang_Thread.cc
index 37b3fe642e..c3b4fe09de 100644
--- a/runtime/native/java_lang_Thread.cc
+++ b/runtime/native/java_lang_Thread.cc
@@ -148,7 +148,6 @@ static void Thread_setNativeName(JNIEnv* env, jobject peer, jstring java_name) {
     bool timed_out;
     // Take suspend thread lock to avoid races with threads trying to suspend this one.
     Thread* thread = thread_list->SuspendThreadByPeer(peer,
-                                                      /* request_suspension= */ true,
                                                       SuspendReason::kInternal,
                                                       &timed_out);
     if (thread != nullptr) {
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 46aa2b59ad..16a5f93be4 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -4468,6 +4468,12 @@ bool Thread::IsSystemDaemon() const {
       WellKnownClasses::java_lang_Thread_systemDaemon)->GetBoolean(GetPeer());
 }

+std::string Thread::StateAndFlagsAsHexString() const {
+  std::stringstream result_stream;
+  result_stream << std::hex << tls32_.state_and_flags.as_atomic_int.load();
+  return result_stream.str();
+}
+
 ScopedExceptionStorage::ScopedExceptionStorage(art::Thread* self)
     : self_(self), hs_(self_), excp_(hs_.NewHandle(self_->GetException())) {
   self_->ClearException();
diff --git a/runtime/thread.h b/runtime/thread.h
index 7a408021c1..676bfd81de 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -254,7 +254,7 @@ class Thread {
   bool IsSuspended() const {
     union StateAndFlags state_and_flags;
-    state_and_flags.as_int = tls32_.state_and_flags.as_int;
+    state_and_flags.as_int = tls32_.state_and_flags.as_atomic_int.load(std::memory_order_relaxed);
     return state_and_flags.as_struct.state != kRunnable &&
         (state_and_flags.as_struct.flags & kSuspendRequest) != 0;
   }
@@ -1517,6 +1517,9 @@ class Thread {
   };
   static_assert(sizeof(StateAndFlags) == sizeof(int32_t), "Weird state_and_flags size");

+  // Format state and flags as a hex string. For diagnostic output.
+  std::string StateAndFlagsAsHexString() const;
+
   static void ThreadExitCallback(void* arg);

   // Maximum number of suspend barriers.
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index f8e99e8f62..84b7384c46 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -874,10 +874,11 @@ static void ThreadSuspendByPeerWarning(Thread* self,
 }

 Thread* ThreadList::SuspendThreadByPeer(jobject peer,
-                                        bool request_suspension,
                                         SuspendReason reason,
                                         bool* timed_out) {
+  bool request_suspension = true;
   const uint64_t start_time = NanoTime();
+  int self_suspend_count = 0;
   useconds_t sleep_us = kThreadSuspendInitialSleepUs;
   *timed_out = false;
   Thread* const self = Thread::Current();
@@ -926,6 +927,7 @@ Thread* ThreadList::SuspendThreadByPeer(jobject peer,
           // We hold the suspend count lock but another thread is trying to suspend us. Its not
           // safe to try to suspend another thread in case we get a cycle. Start the loop again
           // which will allow this thread to be suspended.
+          ++self_suspend_count;
           continue;
         }
         CHECK(suspended_thread == nullptr);
@@ -957,20 +959,22 @@ Thread* ThreadList::SuspendThreadByPeer(jobject peer,
       }
       const uint64_t total_delay = NanoTime() - start_time;
       if (total_delay >= thread_suspend_timeout_ns_) {
-        ThreadSuspendByPeerWarning(self,
-                                   ::android::base::FATAL,
-                                   "Thread suspension timed out",
-                                   peer);
-        if (suspended_thread != nullptr) {
+        if (suspended_thread == nullptr) {
+          ThreadSuspendByPeerWarning(self,
+                                     ::android::base::FATAL,
+                                     "Failed to issue suspend request",
+                                     peer);
+        } else {
           CHECK_EQ(suspended_thread, thread);
-          bool updated = suspended_thread->ModifySuspendCount(soa.Self(),
-                                                              -1,
-                                                              nullptr,
-                                                              reason);
-          DCHECK(updated);
+          LOG(WARNING) << "Suspended thread state_and_flags: "
+                       << suspended_thread->StateAndFlagsAsHexString()
+                       << ", self_suspend_count = " << self_suspend_count;
+          ThreadSuspendByPeerWarning(self,
+                                     ::android::base::FATAL,
+                                     "Thread suspension timed out",
+                                     peer);
         }
-        *timed_out = true;
-        return nullptr;
+        UNREACHABLE();
       } else if (sleep_us == 0 &&
           total_delay > static_cast<uint64_t>(kThreadSuspendMaxYieldUs) * 1000) {
         // We have spun for kThreadSuspendMaxYieldUs time, switch to sleeps to prevent
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index 87a4c8dc61..f5b58a0c54 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -81,11 +81,8 @@ class ThreadList {

   // Suspend a thread using a peer, typically used by the debugger. Returns the thread on success,
   // else null. The peer is used to identify the thread to avoid races with the thread terminating.
-  // If the thread should be suspended then value of request_suspension should be true otherwise
-  // the routine will wait for a previous suspend request. If the suspension times out then *timeout
-  // is set to true.
+  // If the suspension times out then *timeout is set to true.
   Thread* SuspendThreadByPeer(jobject peer,
-                              bool request_suspension,
                               SuspendReason reason,
                               bool* timed_out)
       REQUIRES(!Locks::mutator_lock_,
diff --git a/test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc b/test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc
index a185446ca5..a10fe2e905 100644
--- a/test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc
+++ b/test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc
@@ -81,7 +81,7 @@ extern "C" JNIEXPORT void JNICALL Java_Main_waitAndDeopt(JNIEnv*, jobject, jobje
   }
   bool timed_out = false;
   Thread* other = Runtime::Current()->GetThreadList()->SuspendThreadByPeer(
-      target, true, SuspendReason::kInternal, &timed_out);
+      target, SuspendReason::kInternal, &timed_out);
   CHECK(!timed_out);
   CHECK(other != nullptr);
   ScopedSuspendAll ssa(__FUNCTION__);
--
cgit v1.2.3
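
The race fixed in IsSuspended() above is a plain, non-atomic read of a word
that other threads update concurrently, which C++ treats as a data race even
when every write is atomic. A reduced sketch of the after-the-fix pattern,
with names loosely modeled on Thread::StateAndFlags (illustrative only, not
the real class):

    #include <atomic>
    #include <cstdint>
    #include <sstream>
    #include <string>

    // Illustrative stand-in for Thread::tls32_.state_and_flags.
    struct StateWord {
      std::atomic<int32_t> as_atomic_int{0};

      // After the fix: an explicit relaxed load. Relaxed suffices because
      // the caller only needs a coherent snapshot of the word, not any
      // ordering with respect to other memory.
      bool SuspendRequestSet(int32_t suspend_request_bit) const {
        int32_t snapshot = as_atomic_int.load(std::memory_order_relaxed);
        return (snapshot & suspend_request_bit) != 0;
      }

      // Mirrors the new StateAndFlagsAsHexString() diagnostic: dump the raw
      // word in hex so a timeout report shows exactly what was observed.
      std::string AsHexString() const {
        std::stringstream result_stream;
        result_stream << std::hex << as_atomic_int.load();
        return result_stream.str();
      }
    };
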
From d4da905ba12bfcd9b967a47583f875bbcafeeda2 Mon Sep 17 00:00:00 2001
From: Hans Boehm
Date: Wed, 11 Aug 2021 12:11:19 -0700
Subject: Do not acquire runtime_shutdown_lock_ in Abort()

Abort can be called, particularly in OOM situations, when we already
hold the lock. Abort() should minimize the locks it acquires.

This is intended to be a minimal, low-risk change. Generated code should
be essentially unchanged, except in Abort(). This does not address the
question of whether IsShuttingDown really needs to lock at all.

Test: Build and boot AOSP.
Bug: 195884830
Merged-In: I0ee4a7ca7348153436fec0fecc1d1f2ca1f7a30c
(cherry picked from commit 70aa29e2d93ba66e71a8ff88c9210719efae1c31)
Change-Id: I9d7dca18bc480a37197bca3205834da13321cc58
(cherry picked from commit aefbed79a1e37d6901228b0a7e03ce63c2495703)
---
 runtime/runtime.cc |  4 ++--
 runtime/runtime.h  | 14 +++++++++++---
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 433f564b2d..6c99c1fa8d 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -410,7 +410,7 @@ Runtime::~Runtime() {
     while (threads_being_born_ > 0) {
       shutdown_cond_->Wait(self);
     }
-    shutting_down_ = true;
+    SetShuttingDown();
   }
   // Shutdown and wait for the daemons.
   CHECK(self != nullptr);
@@ -641,7 +641,7 @@ void Runtime::Abort(const char* msg) {
   // May be coming from an unattached thread.
   if (Thread::Current() == nullptr) {
     Runtime* current = Runtime::Current();
-    if (current != nullptr && current->IsStarted() && !current->IsShuttingDown(nullptr)) {
+    if (current != nullptr && current->IsStarted() && !current->IsShuttingDownUnsafe()) {
       // We do not flag this to the unexpected-signal handler so that that may dump the stack.
       abort();
       UNREACHABLE();
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 68456cd37b..b2093a303c 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -222,7 +222,13 @@ class Runtime {
   bool IsShuttingDown(Thread* self);
   bool IsShuttingDownLocked() const REQUIRES(Locks::runtime_shutdown_lock_) {
-    return shutting_down_;
+    return shutting_down_.load(std::memory_order_relaxed);
+  }
+  bool IsShuttingDownUnsafe() const {
+    return shutting_down_.load(std::memory_order_relaxed);
+  }
+  void SetShuttingDown() REQUIRES(Locks::runtime_shutdown_lock_) {
+    shutting_down_.store(true, std::memory_order_relaxed);
   }

   size_t NumberOfThreadsBeingBorn() const REQUIRES(Locks::runtime_shutdown_lock_) {
@@ -1190,8 +1196,10 @@ class Runtime {
   // Waited upon until no threads are being born.
   std::unique_ptr<ConditionVariable> shutdown_cond_ GUARDED_BY(Locks::runtime_shutdown_lock_);

-  // Set when runtime shutdown is past the point that new threads may attach.
-  bool shutting_down_ GUARDED_BY(Locks::runtime_shutdown_lock_);
+  // Set when runtime shutdown is past the point that new threads may attach. Usually
+  // GUARDED_BY(Locks::runtime_shutdown_lock_). But we need to check it in Abort without the
+  // lock, because we may already own it.
+  std::atomic<bool> shutting_down_;

   // The runtime is starting to shutdown but is blocked waiting on shutdown_cond_.
   bool shutting_down_started_ GUARDED_BY(Locks::runtime_shutdown_lock_);
--
cgit v1.2.3
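
The shape of the change above: shutting_down_ stays conceptually
lock-guarded for writers, but becomes a relaxed atomic so Abort() can read
it even when the current thread may already own runtime_shutdown_lock_. A
minimal sketch of that pattern using standard-library types (the real code
uses ART's Mutex and the GUARDED_BY annotations shown in the diff):

    #include <atomic>
    #include <mutex>

    class ShutdownState {  // illustrative; not the real Runtime class
     public:
      // Writers keep taking the lock, so all existing invariants still hold.
      void SetShuttingDown() {
        std::lock_guard<std::mutex> lock(shutdown_lock_);
        shutting_down_.store(true, std::memory_order_relaxed);
      }

      // Lock-free reader for abort paths: never blocks, so it is safe to
      // call while already holding shutdown_lock_, where re-acquiring the
      // lock would self-deadlock.
      bool IsShuttingDownUnsafe() const {
        return shutting_down_.load(std::memory_order_relaxed);
      }

     private:
      std::mutex shutdown_lock_;
      std::atomic<bool> shutting_down_{false};
    };
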
From 8992f92aab381f0dc85cfd4003d935263f8451c8 Mon Sep 17 00:00:00 2001
From: Lokesh Gidra
Date: Tue, 17 Aug 2021 15:55:40 -0700
Subject: Replace weak-ref access disable checkpoint with STW pause

Disabling weak-ref access in the ConcurrentCopying collector can lead to
deadlocks. For instance, suppose mutator M1 acquires the W1 mutex,
participates in the checkpoint, and then gets blocked in getReferent(),
waiting for the GC thread to finish reference processing. Mutator M2
waits for M1 to release W1 so that it can acquire the mutex before
participating in the checkpoint. On the other hand, the GC thread waits
for M2 to finish the checkpoint.

A STW pause avoids the deadlock by ensuring that mutators are not
blocked on weak-ref access before the pause, and that the GC thread can
make progress on reference processing after the pause.

Bug: 195336624
Bug: 195261575
Test: art/test/testrunner/testrunner.py
Merged-In: I03d6bcd4d53f37ec84064edd8292951d30f48eaf
Change-Id: I03d6bcd4d53f37ec84064edd8292951d30f48eaf
(cherry picked from commit 555eefef9a27995ef341cdf44ed60c61953e2e3f)
(cherry picked from commit 16f1ef2d09e82d419a2a51ac3d7f7fb7e9553dd1)
---
 runtime/gc/collector/concurrent_copying.cc | 50 +++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 538d7bef7c..936b9199b5 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -1630,11 +1630,16 @@ void ConcurrentCopying::CopyingPhase() {
     // for the last time before transitioning to the shared mark stack mode, which would process new
     // refs that may have been concurrently pushed onto the mark stack during the ProcessMarkStack()
     // call above. At the same time, disable weak ref accesses using a per-thread flag. It's
-    // important to do these together in a single checkpoint so that we can ensure that mutators
-    // won't newly gray objects and push new refs onto the mark stack due to weak ref accesses and
+    // important to do these together so that we can ensure that mutators won't
+    // newly gray objects and push new refs onto the mark stack due to weak ref accesses and
     // mutators safely transition to the shared mark stack mode (without leaving unprocessed refs on
     // the thread-local mark stacks), without a race. This is why we use a thread-local weak ref
     // access flag Thread::tls32_.weak_ref_access_enabled_ instead of the global ones.
+    // We must use a stop-the-world pause to disable weak ref access. A checkpoint may lead to a
+    // deadlock if one mutator acquires a low-level mutex and then gets blocked while accessing
+    // a weak-ref (after participating in the checkpoint), and another mutator indefinitely waits
+    // for the mutex before it participates in the checkpoint. Consequently, the gc-thread blocks
+    // forever as the checkpoint never finishes (See runtime/mutator_gc_coord.md).
     SwitchToSharedMarkStackMode();
     CHECK(!self->GetWeakRefAccessEnabled());
     // Now that weak refs accesses are disabled, once we exhaust the shared mark stack again here
@@ -2044,21 +2049,36 @@ class ConcurrentCopying::AssertToSpaceInvariantFieldVisitor {

 void ConcurrentCopying::RevokeThreadLocalMarkStacks(bool disable_weak_ref_access,
                                                     Closure* checkpoint_callback) {
   Thread* self = Thread::Current();
-  RevokeThreadLocalMarkStackCheckpoint check_point(this, disable_weak_ref_access);
+  Locks::mutator_lock_->AssertSharedHeld(self);
   ThreadList* thread_list = Runtime::Current()->GetThreadList();
-  gc_barrier_->Init(self, 0);
-  size_t barrier_count = thread_list->RunCheckpoint(&check_point, checkpoint_callback);
-  // If there are no threads to wait which implys that all the checkpoint functions are finished,
-  // then no need to release the mutator lock.
-  if (barrier_count == 0) {
-    return;
-  }
-  Locks::mutator_lock_->SharedUnlock(self);
-  {
-    ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
-    gc_barrier_->Increment(self, barrier_count);
+  RevokeThreadLocalMarkStackCheckpoint check_point(this, disable_weak_ref_access);
+  if (disable_weak_ref_access) {
+    // We're the only thread that could possibly ask for exclusive access here.
+    Locks::mutator_lock_->SharedUnlock(self);
+    {
+      ScopedPause pause(this);
+      MutexLock mu(self, *Locks::thread_list_lock_);
+      checkpoint_callback->Run(self);
+      for (Thread* thread : thread_list->GetList()) {
+        check_point.Run(thread);
+      }
+    }
+    Locks::mutator_lock_->SharedLock(self);
+  } else {
+    gc_barrier_->Init(self, 0);
+    size_t barrier_count = thread_list->RunCheckpoint(&check_point, checkpoint_callback);
+    // If there are no threads to wait which implys that all the checkpoint functions are finished,
+    // then no need to release the mutator lock.
+    if (barrier_count == 0) {
+      return;
+    }
+    Locks::mutator_lock_->SharedUnlock(self);
+    {
+      ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
+      gc_barrier_->Increment(self, barrier_count);
+    }
+    Locks::mutator_lock_->SharedLock(self);
   }
-  Locks::mutator_lock_->SharedLock(self);
 }

 void ConcurrentCopying::RevokeThreadLocalMarkStack(Thread* thread) {
--
cgit v1.2.3
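
The deadlock the commit message describes reduces to standard primitives.
In this sketch (illustrative names, not ART code) the GC thread is
represented only by the fact that nobody re-enables weak-ref access: it is
still waiting for both mutators' checkpoints. Running both functions on two
threads would deadlock by construction:

    #include <condition_variable>
    #include <mutex>

    std::mutex w1;                          // the low-level mutex "W1"
    std::mutex coord;
    std::condition_variable weak_ref_cv;
    bool weak_ref_access_enabled = false;   // the GC would re-enable this only
    int checkpoints_pending = 2;            // after both checkpoints have run.

    void RunCheckpoint() {
      std::lock_guard<std::mutex> lock(coord);
      --checkpoints_pending;
    }

    void Mutator1() {
      std::lock_guard<std::mutex> hold_w1(w1);   // acquires W1...
      RunCheckpoint();                           // ...participates in the checkpoint...
      std::unique_lock<std::mutex> lock(coord);  // ...then blocks in getReferent():
      weak_ref_cv.wait(lock, [] { return weak_ref_access_enabled; });
    }  // W1 is never released; the GC never finishes reference processing.

    void Mutator2() {
      std::lock_guard<std::mutex> hold_w1(w1);   // blocks forever on W1, so its
      RunCheckpoint();                           // checkpoint never runs and the
    }                                            // GC thread waits indefinitely.

A stop-the-world pause breaks the cycle because no mutator can be parked
inside a weak-ref access while it is suspended at the pause, which is
exactly what the new disable_weak_ref_access branch above implements.
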