Skip to content

Block cell size is sometimes zero in native MarkSweep #1103

@wks

Description

@wks

This bug was observed in the CI for a PR, but it is also reproducible on the master branch. https://github.com/mmtk/mmtk-core/actions/runs/8548408967/job/23453824271?pr=1067

The bug manifests as GC hanging during the first GC, with 100% CPU usage.

The bug is related to the native MarkSweep plan and eager sweeping. The "eager_sweeping" feature must be enabled to trigger this bug. The "vo_bit" feature implies "eager_sweeping".

To reproduce this bug, we need to compile mmtk-openjdk with VO-bit enabled (which will enable "eager_sweeping").

export DEBUG_LEVEL=fastdebug
make CONF=linux-x86_64-normal-server-$DEBUG_LEVEL THIRD_PARTY_HEAP=$PWD/../mmtk-openjdk/openjdk MMTK_VO_BIT=1

One benchmark that is known to reproduce this error is antlr in dacapo-2006-10-MR2.jar, with the following command line:

MMTK_THREADS=128 MMTK_PLAN=MarkSweep /path/to/openjdk/build/linux-x86_64-normal-server-fastdebug/jdk/bin/java -XX:+UseThirdPartyHeap -server -XX:MetaspaceSize=100M -Xms40M -Xmx40M -jar dacapo-2006-10-MR2.jar antlr

When the error occurs, it will hang during the very first GC.

Here is a backtrace captured using GDB:

#0  0x00007ffff5414956 in mmtk::util::metadata::side_metadata::helpers::meta_byte_mask (metadata_spec=0x7ffff5a6a508)
    at src/util/metadata/side_metadata/helpers.rs:135
#1  0x00007ffff5459b5b in mmtk::util::metadata::side_metadata::global::{impl#0}::store_atomic::{closure#0}<u8> ()
    at src/util/metadata/side_metadata/global.rs:592
#2  0x00007ffff5458c78 in mmtk::util::metadata::side_metadata::global::SideMetadataSpec::side_metadata_access<u8, (), mmtk::util::metadata::side_metadata::global::{impl#0}::store_atomic::{closure_env#0}<u8>, mmtk::util::metadata::side_metadata::global::{impl#0}::store_atomic::{closure_env#1}<u8>> (self=0x7ffff5a6a508, data_addr=..., input=..., access_func=..., verify_func=...)
    at src/util/metadata/side_metadata/global.rs:483
#3  0x00007ffff5459908 in mmtk::util::metadata::side_metadata::global::SideMetadataSpec::store_atomic<u8> (self=0x7ffff5a6a508,
    data_addr=..., metadata=0, order=core::sync::atomic::Ordering::SeqCst) at src/util/metadata/side_metadata/global.rs:584
#4  0x00007ffff512b3ca in mmtk::util::metadata::vo_bit::unset_vo_bit_nocheck<mmtk_openjdk::OpenJDK<true>> (object=...)
    at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/util/metadata/vo_bit/mod.rs:81
#5  0x00007ffff5188066 in mmtk::policy::marksweepspace::native_ms::block::Block::simple_sweep<mmtk_openjdk::OpenJDK<true>> (
    self=0x7fff839f2580)
    at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/policy/marksweepspace/native_ms/block.rs:298
#6  0x00007ffff51899af in mmtk::policy::marksweepspace::native_ms::block::Block::sweep<mmtk_openjdk::OpenJDK<true>> (
    self=0x7fff839f2580)
    at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/policy/marksweepspace/native_ms/block.rs:273
#7  0x00007ffff51a91be in mmtk::policy::marksweepspace::native_ms::block_list::BlockList::sweep_blocks<mmtk_openjdk::OpenJDK<true>>
    (self=0x7ffff02ceea0, space=0x7ffff02c83f0)
    at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/policy/marksweepspace/native_ms/block_list.rs:171
#8  0x00007ffff518601e in mmtk::util::alloc::free_list_allocator::FreeListAllocator<mmtk_openjdk::OpenJDK<true>>::reset<mmtk_openjdk::OpenJDK<true>> (self=0x7ffff0032ca8)
    at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/util/alloc/free_list_allocator.rs:467
#9  0x00007ffff51865aa in mmtk::util::alloc::free_list_allocator::FreeListAllocator<mmtk_openjdk::OpenJDK<true>>::release<mmtk_openjdk::OpenJDK<true>> (self=0x7ffff0032ca8)
    at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/util/alloc/free_list_allocator.rs:422
#10 0x00007ffff5136d5b in mmtk::plan::marksweep::mutator::native_mark_sweep::ms_mutator_release<mmtk_openjdk::OpenJDK<true>> (
    mutator=0x7ffff0032ae8, _tls=...)
    at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/plan/marksweep/mutator.rs:78
#11 0x00007ffff536c850 in core::ops::function::Fn::call<fn(&mut mmtk::plan::mutator_context::Mutator<mmtk_openjdk::OpenJDK<true>>, mmtk::util::opaque_pointer::VMWorkerThread), (&mut mmtk::plan::mutator_context::Mutator<mmtk_openjdk::OpenJDK<true>>, mmtk::util::opaque_pointer::VMWorkerThread)> () at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/core/src/ops/function.rs:79
#12 0x00007ffff5294fd9 in mmtk::plan::mutator_context::{impl#1}::release<mmtk_openjdk::OpenJDK<true>> (self=0x7ffff0032ae8,
    tls=...) at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/plan/mutator_context.rs:105
#13 0x00007ffff51bad52 in mmtk::scheduler::gc_work::{impl#11}::do_work<mmtk_openjdk::OpenJDK<true>> (self=0x7ffedc00b980,
    worker=0x7ffff0583bd0,
    _mmtk=0x7ffff5ad9010 <<mmtk_openjdk::SINGLETON_COMPRESSED as core::ops::deref::Deref>::deref::__stability::LAZY>)
    at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/scheduler/gc_work.rs:172
#14 0x00007ffff51fbe9d in mmtk::scheduler::work::GCWork::do_work_with_stat<mmtk::scheduler::gc_work::ReleaseMutator<mmtk_openjdk::OpenJDK<true>>, mmtk_openjdk::OpenJDK<true>> (self=0x7ffedc00b980, worker=0x7ffff0583bd0,
    mmtk=0x7ffff5ad9010 <<mmtk_openjdk::SINGLETON_COMPRESSED as core::ops::deref::Deref>::deref::__stability::LAZY>)
    at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/scheduler/work.rs:45
#15 0x00007ffff53b8146 in mmtk::scheduler::worker::GCWorker<mmtk_openjdk::OpenJDK<true>>::run<mmtk_openjdk::OpenJDK<true>> (
    self=0x7ffff0583bd0, tls=...,
    mmtk=0x7ffff5ad9010 <<mmtk_openjdk::SINGLETON_COMPRESSED as core::ops::deref::Deref>::deref::__stability::LAZY>)
    at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/scheduler/worker.rs:244
#16 0x00007ffff50c6395 in mmtk::memory_manager::start_worker<mmtk_openjdk::OpenJDK<true>> (
    mmtk=0x7ffff5ad9010 <<mmtk_openjdk::SINGLETON_COMPRESSED as core::ops::deref::Deref>::deref::__stability::LAZY>, tls=...,
    worker=0x7ffff0583bd0) at /home/wks/projects/mmtk-github/parallels/feature/fork/mmtk-core/src/memory_manager.rs:470
#17 0x00007ffff53ca7be in mmtk_openjdk::api::start_worker (tls=..., worker=0x7ffff0583bd0) at src/api.rs:210
#18 0x00007ffff7553f47 in Thread::call_run (this=0x7ffff061b800)
    at /home/wks/projects/mmtk-github/parallels/feature/fork/openjdk/src/hotspot/share/runtime/thread.cpp:402
#19 0x00007ffff72403e6 in thread_native_entry (thread=0x7ffff061b800)
    at /home/wks/projects/mmtk-github/parallels/feature/fork/openjdk/src/hotspot/os/linux/os_linux.cpp:826
#20 0x00007ffff7df455a in ?? () from /usr/lib/libc.so.6
#21 0x00007ffff7e71a3c in ?? () from /usr/lib/libc.so.6

Frames 0-4 are not important. The problem lies in frame 5. Inside the function Block::simple_sweep, the variable cell_size can sometimes be 0. If this happens, the following while loop will never exit.

    fn simple_sweep<VM: VMBinding>(&self) {
        let cell_size = self.load_block_cell_size(); // This may be zero.
        let mut cell = self.start();
        // ...
        while cell + cell_size <= self.start() + Block::BYTES {
            // ...
            cell += cell_size; // If `cell_size` is 0, `cell` will be stuck at the same value forever.
        }

        self.store_free_list(last);
    }

Presumably, the cell size should never be zero. If the cell size is zero, it means the block is either uninitialized, or there is a race such that another thread overwrote the block's cell size with 0 before the current thread attempted to sweep it.

I have not found the root cause yet, but the following patch can help us reproduce the error. The debug_assert_ne will make it panic immediately if the cell size is zero, and the std::process::exit(0); statement will make the program exit right after the first GC, because the error is known to occur during the first GC.

Patch:

diff --git a/src/policy/marksweepspace/native_ms/block.rs b/src/policy/marksweepspace/native_ms/block.rs
index 625a82d85..57f64ce1f 100644
--- a/src/policy/marksweepspace/native_ms/block.rs
+++ b/src/policy/marksweepspace/native_ms/block.rs
@@ -188,6 +188,7 @@ impl Block {
     }
 
     pub fn store_block_cell_size(&self, size: usize) {
+        debug_assert_ne!(size, 0);
         unsafe { Block::SIZE_TABLE.store::<usize>(self.start(), size) }
     }
 
@@ -282,6 +283,7 @@ impl Block {
     /// that we need to use this method correctly.
     fn simple_sweep<VM: VMBinding>(&self) {
         let cell_size = self.load_block_cell_size();
+        debug_assert_ne!(cell_size, 0);
         let mut cell = self.start();
         let mut last = unsafe { Address::zero() };
         while cell + cell_size <= self.start() + Block::BYTES {
diff --git a/src/scheduler/controller.rs b/src/scheduler/controller.rs
index 3608f1ebf..df9a0fe6f 100644
--- a/src/scheduler/controller.rs
+++ b/src/scheduler/controller.rs
@@ -143,6 +143,9 @@ impl<VM: VMBinding> GCController<VM> {
         };
         end_of_gc.do_work_with_stat(&mut self.coordinator_worker, self.mmtk);
 
+        println!("First GC finished without error.");
+        std::process::exit(0);
+
         self.scheduler.debug_assert_all_buckets_deactivated();
     }
 }
diff --git a/src/util/alloc/free_list_allocator.rs b/src/util/alloc/free_list_allocator.rs
index 50ce36bd2..1b9abde54 100644
--- a/src/util/alloc/free_list_allocator.rs
+++ b/src/util/alloc/free_list_allocator.rs
@@ -333,6 +333,7 @@ impl<VM: VMBinding> FreeListAllocator<VM> {
     }
 
     fn init_block(&self, block: Block, cell_size: usize) {
+        debug_assert_ne!(cell_size, 0);
         self.space.record_new_block(block);
 
         // construct free list

Then use the following command to repeat the experiment:

while MMTK_THREADS=128 MMTK_PLAN=MarkSweep ~/projects/mmtk-github/openjdk/build/linux-x86_64-normal-server-fastdebug/jdk/bin/java -XX:+UseThirdPartyHeap -server -XX:MetaspaceSize=100M -Xms40M -Xmx40M -jar dacapo-2006-10-MR2.jar antlr; do true; done

It usually takes about 30-40 attempts before it panics.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions