Description
This bug was observed in the CI for a PR, but it is reproducible on the master branch: https://github.com/mmtk/mmtk-core/actions/runs/8548408967/job/23453824271?pr=1067

The bug manifests as GC hanging during the first GC, with 100% CPU usage.

The bug is related to the native MarkSweep plan and eager sweeping. The `eager_sweeping` feature must be enabled to trigger this bug; the `vo_bit` feature implies `eager_sweeping`.

To reproduce this bug, we need to compile mmtk-openjdk with the VO bit enabled (which will also enable `eager_sweeping`):
```bash
export DEBUG_LEVEL=fastdebug
make CONF=linux-x86_64-normal-server-$DEBUG_LEVEL THIRD_PARTY_HEAP=$PWD/../mmtk-openjdk/openjdk MMTK_VO_BIT=1
```
One benchmark that is known to reproduce this error is `antlr` in `dacapo-2006-10-MR2.jar`, with the following command line:
```bash
MMTK_THREADS=128 MMTK_PLAN=MarkSweep /path/to/openjdk/build/linux-x86_64-normal-server-fastdebug/jdk/bin/java -XX:+UseThirdPartyHeap -server -XX:MetaspaceSize=100M -Xms40M -Xmx40M -jar dacapo-2006-10-MR2.jar antlr
```
When the error occurs, it will hang during the very first GC.
Here is a backtrace captured using GDB:
```
#0 0x00007ffff5414956 in mmtk::util::metadata::side_metadata::helpers::meta_byte_mask (metadata_spec=0x7ffff5a6a508)
at src/util/metadata/side_metadata/helpers.rs:135
#1 0x00007ffff5459b5b in mmtk::util::metadata::side_metadata::global::{impl#0}::store_atomic::{closure#0}<u8> ()
at src/util/metadata/side_metadata/global.rs:592
#2 0x00007ffff5458c78 in mmtk::util::metadata::side_metadata::global::SideMetadataSpec::side_metadata_access<u8, (), mmtk::util::metadata::side_metadata::global::{impl#0}::store_atomic::{closure_env#0}<u8>, mmtk::util::metadata::side_metadata::global::{impl#0}::store_atomic::{closure_env#1}<u8>> (self=0x7ffff5a6a508, data_addr=..., input=..., access_func=..., verify_func=...)
at src/util/metadata/side_metadata/global.rs:483
#3 0x00007ffff5459908 in mmtk::util::metadata::side_metadata::global::SideMetadataSpec::store_atomic<u8> (self=0x7ffff5a6a508,
data_addr=..., metadata=0, order=core::sync::atomic::Ordering::SeqCst) at src/util/metadata/side_metadata/global.rs:584
#4 0x00007ffff512b3ca in mmtk::util::metadata::vo_bit::unset_vo_bit_nocheck<mmtk_openjdk::OpenJDK<true>> (object=...)
at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/util/metadata/vo_bit/mod.rs:81
#5 0x00007ffff5188066 in mmtk::policy::marksweepspace::native_ms::block::Block::simple_sweep<mmtk_openjdk::OpenJDK<true>> (
self=0x7fff839f2580)
at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/policy/marksweepspace/native_ms/block.rs:298
#6 0x00007ffff51899af in mmtk::policy::marksweepspace::native_ms::block::Block::sweep<mmtk_openjdk::OpenJDK<true>> (
self=0x7fff839f2580)
at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/policy/marksweepspace/native_ms/block.rs:273
#7 0x00007ffff51a91be in mmtk::policy::marksweepspace::native_ms::block_list::BlockList::sweep_blocks<mmtk_openjdk::OpenJDK<true>>
(self=0x7ffff02ceea0, space=0x7ffff02c83f0)
at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/policy/marksweepspace/native_ms/block_list.rs:171
#8 0x00007ffff518601e in mmtk::util::alloc::free_list_allocator::FreeListAllocator<mmtk_openjdk::OpenJDK<true>>::reset<mmtk_openjdk::OpenJDK<true>> (self=0x7ffff0032ca8)
at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/util/alloc/free_list_allocator.rs:467
#9 0x00007ffff51865aa in mmtk::util::alloc::free_list_allocator::FreeListAllocator<mmtk_openjdk::OpenJDK<true>>::release<mmtk_openjdk::OpenJDK<true>> (self=0x7ffff0032ca8)
at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/util/alloc/free_list_allocator.rs:422
#10 0x00007ffff5136d5b in mmtk::plan::marksweep::mutator::native_mark_sweep::ms_mutator_release<mmtk_openjdk::OpenJDK<true>> (
mutator=0x7ffff0032ae8, _tls=...)
at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/plan/marksweep/mutator.rs:78
#11 0x00007ffff536c850 in core::ops::function::Fn::call<fn(&mut mmtk::plan::mutator_context::Mutator<mmtk_openjdk::OpenJDK<true>>, mmtk::util::opaque_pointer::VMWorkerThread), (&mut mmtk::plan::mutator_context::Mutator<mmtk_openjdk::OpenJDK<true>>, mmtk::util::opaque_pointer::VMWorkerThread)> () at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/core/src/ops/function.rs:79
#12 0x00007ffff5294fd9 in mmtk::plan::mutator_context::{impl#1}::release<mmtk_openjdk::OpenJDK<true>> (self=0x7ffff0032ae8,
tls=...) at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/plan/mutator_context.rs:105
#13 0x00007ffff51bad52 in mmtk::scheduler::gc_work::{impl#11}::do_work<mmtk_openjdk::OpenJDK<true>> (self=0x7ffedc00b980,
worker=0x7ffff0583bd0,
_mmtk=0x7ffff5ad9010 <<mmtk_openjdk::SINGLETON_COMPRESSED as core::ops::deref::Deref>::deref::__stability::LAZY>)
at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/scheduler/gc_work.rs:172
#14 0x00007ffff51fbe9d in mmtk::scheduler::work::GCWork::do_work_with_stat<mmtk::scheduler::gc_work::ReleaseMutator<mmtk_openjdk::OpenJDK<true>>, mmtk_openjdk::OpenJDK<true>> (self=0x7ffedc00b980, worker=0x7ffff0583bd0,
mmtk=0x7ffff5ad9010 <<mmtk_openjdk::SINGLETON_COMPRESSED as core::ops::deref::Deref>::deref::__stability::LAZY>)
at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/scheduler/work.rs:45
#15 0x00007ffff53b8146 in mmtk::scheduler::worker::GCWorker<mmtk_openjdk::OpenJDK<true>>::run<mmtk_openjdk::OpenJDK<true>> (
self=0x7ffff0583bd0, tls=...,
mmtk=0x7ffff5ad9010 <<mmtk_openjdk::SINGLETON_COMPRESSED as core::ops::deref::Deref>::deref::__stability::LAZY>)
at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/scheduler/worker.rs:244
#16 0x00007ffff50c6395 in mmtk::memory_manager::start_worker<mmtk_openjdk::OpenJDK<true>> (
mmtk=0x7ffff5ad9010 <<mmtk_openjdk::SINGLETON_COMPRESSED as core::ops::deref::Deref>::deref::__stability::LAZY>, tls=...,
worker=0x7ffff0583bd0) at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/memory_manager.rs:470
#17 0x00007ffff53ca7be in mmtk_openjdk::api::start_worker (tls=..., worker=0x7ffff0583bd0) at src/api.rs:210
#18 0x00007ffff7553f47 in Thread::call_run (this=0x7ffff061b800)
at /home/wks/projects/mmtk-github/parallels/feature/fork/openjdk/src/hotspot/share/runtime/thread.cpp:402
#19 0x00007ffff72403e6 in thread_native_entry (thread=0x7ffff061b800)
at /home/wks/projects/mmtk-github/parallels/feature/fork/openjdk/src/hotspot/os/linux/os_linux.cpp:826
#20 0x00007ffff7df455a in ?? () from /usr/lib/libc.so.6
#21 0x00007ffff7e71a3c in ?? () from /usr/lib/libc.so.6
```
Frames 0-4 are not important. The problem lies in frame 5. Inside the function `Block::simple_sweep`, the variable `cell_size` can sometimes be 0. If this happens, the following `while` loop will never exit.
```rust
fn simple_sweep<VM: VMBinding>(&self) {
    let cell_size = self.load_block_cell_size(); // This may be zero.
    let mut cell = self.start();
    // ...
    while cell + cell_size <= self.start() + Block::BYTES {
        // ...
        cell += cell_size; // If `cell_size` is 0, `cell` will be stuck at the same value forever.
    }
    self.store_free_list(last);
}
```
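To see the non-termination in isolation, here is a minimal, self-contained sketch of the same loop shape. It uses plain `usize` values instead of MMTk's `Address` type, with illustrative block parameters and a watchdog counter added so the demo terminates (none of these names are mmtk-core APIs):

```rust
/// Same loop shape as `simple_sweep`. With a zero `cell_size`, `cell` never
/// advances, so the condition `cell + cell_size <= limit` stays true forever.
fn sweep_cells(start: usize, block_bytes: usize, cell_size: usize) {
    let limit = start + block_bytes;
    let mut cell = start;
    let mut iterations: u64 = 0;
    while cell + cell_size <= limit {
        iterations += 1;
        // Watchdog so this demo terminates; the real code has no such check.
        if iterations > 1_000_000 {
            println!("cell_size = {cell_size}: stuck at cell {cell:#x}");
            return;
        }
        cell += cell_size; // no progress when `cell_size == 0`
    }
    println!("cell_size = {cell_size}: finished after {iterations} iterations");
}

fn main() {
    sweep_cells(0x1000, 64 * 1024, 16); // terminates normally
    sweep_cells(0x1000, 64 * 1024, 0);  // hits the watchdog: would spin forever
}
```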
Presumably, the cell size should never be zero. If it is zero, it means the block is either uninitialized, or there is a race such that another thread overwrote the block's cell size with 0 before the current thread attempted to sweep it.
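For intuition on the "uninitialized" hypothesis: side metadata starts out as zero-filled memory, so loading the cell size of a block that was never initialized would yield 0. A toy model of the per-block size table (not the real side-metadata API):

```rust
/// Toy model of a per-block cell-size table. Fresh side-metadata pages are
/// zero-filled, so a block whose size was never stored reads back as 0.
struct SizeTable {
    entries: Vec<usize>, // all zero until `store` is called
}

impl SizeTable {
    fn new(num_blocks: usize) -> Self {
        Self { entries: vec![0; num_blocks] }
    }
    fn store(&mut self, block: usize, cell_size: usize) {
        self.entries[block] = cell_size;
    }
    fn load(&self, block: usize) -> usize {
        self.entries[block]
    }
}

fn main() {
    let mut table = SizeTable::new(4);
    table.store(0, 16);
    assert_eq!(table.load(0), 16);
    assert_eq!(table.load(2), 0); // never initialized: reads as zero
}
```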
I have not found the root cause yet, but the following patch can help us reproduce the error. The `debug_assert_ne!` will make it panic immediately if the cell size is zero, and the `std::process::exit(0);` statement will make the program exit as soon as the first GC completes, because the error is known to occur during the first GC.
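Note that `debug_assert_ne!` only fires in builds with debug assertions enabled; without them it compiles to nothing. A sketch of the intended effect, using a hypothetical wrapper function:

```rust
/// Hypothetical wrapper illustrating what the patch does: with debug
/// assertions enabled, storing a zero size panics immediately at the writer,
/// instead of the sweeper spinning forever much later.
fn store_cell_size_checked(size: usize) {
    debug_assert_ne!(size, 0, "attempted to store a zero cell size");
    // ... the actual side-metadata store would go here ...
}

fn main() {
    store_cell_size_checked(16); // fine
    store_cell_size_checked(0);  // panics when debug assertions are on
}
```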
Patch:
```diff
diff --git a/src/policy/marksweepspace/native_ms/block.rs b/src/policy/marksweepspace/native_ms/block.rs
index 625a82d85..57f64ce1f 100644
--- a/src/policy/marksweepspace/native_ms/block.rs
+++ b/src/policy/marksweepspace/native_ms/block.rs
@@ -188,6 +188,7 @@ impl Block {
     }
 
     pub fn store_block_cell_size(&self, size: usize) {
+        debug_assert_ne!(size, 0);
         unsafe { Block::SIZE_TABLE.store::<usize>(self.start(), size) }
     }
 
@@ -282,6 +283,7 @@ impl Block {
     /// that we need to use this method correctly.
     fn simple_sweep<VM: VMBinding>(&self) {
         let cell_size = self.load_block_cell_size();
+        debug_assert_ne!(cell_size, 0);
         let mut cell = self.start();
         let mut last = unsafe { Address::zero() };
         while cell + cell_size <= self.start() + Block::BYTES {
diff --git a/src/scheduler/controller.rs b/src/scheduler/controller.rs
index 3608f1ebf..df9a0fe6f 100644
--- a/src/scheduler/controller.rs
+++ b/src/scheduler/controller.rs
@@ -143,6 +143,9 @@ impl<VM: VMBinding> GCController<VM> {
         };
 
         end_of_gc.do_work_with_stat(&mut self.coordinator_worker, self.mmtk);
+        println!("First GC finished without error.");
+        std::process::exit(0);
+
         self.scheduler.debug_assert_all_buckets_deactivated();
     }
 }
diff --git a/src/util/alloc/free_list_allocator.rs b/src/util/alloc/free_list_allocator.rs
index 50ce36bd2..1b9abde54 100644
--- a/src/util/alloc/free_list_allocator.rs
+++ b/src/util/alloc/free_list_allocator.rs
@@ -333,6 +333,7 @@ impl<VM: VMBinding> FreeListAllocator<VM> {
     }
 
     fn init_block(&self, block: Block, cell_size: usize) {
+        debug_assert_ne!(cell_size, 0);
         self.space.record_new_block(block);
 
         // construct free list
```
Then use the following command to repeat the experiment:
```bash
while MMTK_THREADS=128 MMTK_PLAN=MarkSweep ~/projects/mmtk-github/openjdk/build/linux-x86_64-normal-server-fastdebug/jdk/bin/java -XX:+UseThirdPartyHeap -server -XX:MetaspaceSize=100M -Xms40M -Xmx40M -jar dacapo-2006-10-MR2.jar antlr; do true; done
```
It usually takes about 30-40 attempts before it panics.