
Commit 9b1e9b0

lazy stack: ensure next stack page statically
This patch modifies all relevant places where we statically know that both
interrupts and preemption should be enabled, so that they unconditionally
pre-fault the stack. These are the places in code right before:

- a WITH_LOCK(preempt_lock) block
- a WITH_LOCK(rcu_read_lock) block
- a sched::preempt_disable() call
- a WITH_LOCK(irq_lock) block, in one case

In general, these are the places that follow the assumption that most of the
time preemption and interrupts are enabled. The functions and methods below
are called directly or indirectly by an application, and there is no other
kernel code above them in that application call stack that also disables
interrupts or preemption.

One good example is the memory allocation code in mempool.cc, which disables
preemption in quite a few places. Many of those use the
WITH_LOCK/DROP_LOCK(preempt_lock) combination, which implies they were
intended to be called with preemption enabled; otherwise the code inside
DROP_LOCK would not work. Also, all of them end up calling some form of the
thread::wait() method, which asserts that both interrupts and preemption are
enabled.

To validate this reasoning, we add the relevant invariant assertions before we
pre-fault the stack.

Signed-off-by: Waldemar Kozaczuk <[email protected]>
1 parent fabacc4 commit 9b1e9b0
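For context, the pattern this commit adds is always the same: assert the
invariants (interrupts enabled, preemption enabled), pre-fault the next stack
page, and only then enter the region that disables preemption or interrupts.
The actual implementation of arch::ensure_next_stack_page() is not part of
this diff; the sketch below is an illustration only, and assumes the lazy-stack
design in which touching one byte a page below the current stack pointer is
enough to get the next stack page mapped in while the page-fault handler can
still run safely.

// Illustrative sketch only -- not the code from this commit. Assumes a
// 4096-byte page size and a lazily-mapped stack that is populated on fault.
inline void ensure_next_stack_page_sketch()
{
    char probe;                        // approximates the current stack pointer
    // Read one byte a page below the current frame so that any fault on the
    // next stack page happens now, while interrupts and preemption are still
    // enabled, and not later inside a WITH_LOCK(preempt_lock) block.
    volatile char *next_page = &probe - 4096;
    (void)*next_page;
}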

13 files changed: 202 additions & 1 deletion

bsd/porting/uma_stub.cc

Lines changed: 12 additions & 0 deletions
@@ -34,6 +34,12 @@ void * uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
 {
     void * ptr;
 
+#if CONF_lazy_stack_invariant
+    assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     WITH_LOCK(preempt_lock) {
         ptr = (*zone->percpu_cache)->alloc();
     }
@@ -101,6 +107,12 @@ void uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
         zone->uz_dtor(item, zone->uz_size, udata);
     }
 
+#if CONF_lazy_stack_invariant
+    assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     WITH_LOCK(preempt_lock) {
         if ((*zone->percpu_cache)->free(item)) {
             return;

bsd/sys/net/routecache.hh

Lines changed: 6 additions & 0 deletions
@@ -162,6 +162,12 @@ public:
         // route.cc).
         assert(fibnum == 0);
 
+#if CONF_lazy_stack_invariant
+        assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+        arch::ensure_next_stack_page();
+#endif
         WITH_LOCK(osv::rcu_read_lock) {
             auto *c = cache.read();
             auto entry = c->search(dst->sin_addr.s_addr);

bsd/sys/netinet/arpcache.hh

Lines changed: 7 additions & 0 deletions
@@ -16,6 +16,7 @@
 #include <osv/rcu.hh>
 #include <osv/types.h>
 #include <osv/mutex.h>
+#include <osv/sched.hh>
 
 #include <boost/optional.hpp>
 #include <functional>
@@ -98,6 +99,12 @@ struct arp_cache {
 
     boost::optional<entry> lookup(const in_addr ip)
     {
+#if CONF_lazy_stack_invariant
+        assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+        arch::ensure_next_stack_page();
+#endif
         WITH_LOCK(osv::rcu_read_lock) {
             auto i = _entries.reader_find(ip, std::hash<in_addr>(), entry_compare());
             return boost::optional<entry>(!!i, *i);

core/async.cc

Lines changed: 18 additions & 0 deletions
@@ -104,6 +104,12 @@ class async_worker {
 
     void insert(percpu_timer_task& task)
     {
+#if CONF_lazy_stack_invariant
+        assert(arch::irq_enabled() && sched::preemptable());
+#endif
+#if CONF_lazy_stack
+        arch::ensure_next_stack_page();
+#endif
         WITH_LOCK(preempt_lock) {
             trace_async_timer_task_insert(this, &task);
 
@@ -125,6 +131,12 @@ class async_worker {
 
     percpu_timer_task& borrow_task()
    {
+#if CONF_lazy_stack_invariant
+        assert(arch::irq_enabled() && sched::preemptable());
+#endif
+#if CONF_lazy_stack
+        arch::ensure_next_stack_page();
+#endif
         WITH_LOCK(preempt_lock) {
             auto task = released_timer_tasks.pop();
             if (task) {
@@ -142,6 +154,12 @@ class async_worker {
     {
         auto task = new one_shot_task(std::move(callback));
 
+#if CONF_lazy_stack_invariant
+        assert(arch::irq_enabled() && sched::preemptable());
+#endif
+#if CONF_lazy_stack
+        arch::ensure_next_stack_page();
+#endif
         WITH_LOCK(preempt_lock) {
             if (_queue.empty()) {
                 _thread->wake();

core/condvar.cc

Lines changed: 7 additions & 0 deletions
@@ -35,6 +35,13 @@ int condvar::wait(mutex* user_mutex, sched::timer* tmr)
     _user_mutex = user_mutex;
     // This preempt_disable() is just an optimization, to avoid context
     // switch between the two unlocks.
+#if CONF_lazy_stack_invariant
+    assert(arch::irq_enabled());
+    assert(sched::preemptable());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     sched::preempt_disable();
     user_mutex->unlock();
     _m.unlock();

core/elf.cc

Lines changed: 12 additions & 0 deletions
@@ -1654,6 +1654,12 @@ object *program::object_containing_addr(const void *addr)
 {
     object *ret = nullptr;
     module_delete_disable();
+#if CONF_lazy_stack_invariant
+    assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     WITH_LOCK(osv::rcu_read_lock) {
         const auto &modules = _modules_rcu.read()->objects;
         for (object *module : modules) {
@@ -1819,6 +1825,12 @@ void program::free_dtv(object* obj)
 program::modules_list program::modules_get() const
 {
     modules_list ret;
+#if CONF_lazy_stack_invariant
+    assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     WITH_LOCK(osv::rcu_read_lock) {
         auto modules = _modules_rcu.read();
         auto needed = modules->objects.size();

core/mempool.cc

Lines changed: 49 additions & 1 deletion
@@ -216,6 +216,12 @@ TRACEPOINT(trace_pool_free_different_cpu, "this=%p, obj=%p, obj_cpu=%d", void*,
 void* pool::alloc()
 {
     void * ret = nullptr;
+#if CONF_lazy_stack_invariant
+    assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     WITH_LOCK(preempt_lock) {
 
         // We enable preemption because add_page() may take a Mutex.
@@ -255,8 +261,14 @@ void pool::add_page()
 {
     // FIXME: this function allocated a page and set it up but on rare cases
     // we may add this page to the free list of a different cpu, due to the
-    // enablment of preemption
+    // enablement of preemption
     void* page = untracked_alloc_page();
+#if CONF_lazy_stack_invariant
+    assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     WITH_LOCK(preempt_lock) {
         page_header* header = new (page) page_header;
         header->cpu_id = mempool_cpuid();
@@ -321,6 +333,12 @@ void pool::free(void* object)
 {
     trace_pool_free(this, object);
 
+#if CONF_lazy_stack_invariant
+    assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     WITH_LOCK(preempt_lock) {
 
         free_object* obj = static_cast<free_object*>(object);
@@ -1260,6 +1278,9 @@ class l2 {
         while (!(pb = try_alloc_page_batch())) {
             WITH_LOCK(migration_lock) {
                 DROP_LOCK(preempt_lock) {
+#if CONF_lazy_stack_invariant
+                    assert(sched::preemptable());
+#endif
                     refill();
                 }
             }
@@ -1272,6 +1293,9 @@ class l2 {
         while (!try_free_page_batch(pb)) {
             WITH_LOCK(migration_lock) {
                 DROP_LOCK(preempt_lock) {
+#if CONF_lazy_stack_invariant
+                    assert(sched::preemptable());
+#endif
                     unfill();
                 }
             }
@@ -1377,6 +1401,12 @@ void l1::fill_thread()
 
 void l1::refill()
 {
+#if CONF_lazy_stack_invariant
+    assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     SCOPE_LOCK(preempt_lock);
     auto& pbuf = get_l1();
     if (pbuf.nr + page_batch::nr_pages < pbuf.max / 2) {
@@ -1398,6 +1428,12 @@ void l1::refill()
 
 void l1::unfill()
 {
+#if CONF_lazy_stack_invariant
+    assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     SCOPE_LOCK(preempt_lock);
     auto& pbuf = get_l1();
     if (pbuf.nr > page_batch::nr_pages + pbuf.max / 2) {
@@ -1411,6 +1447,12 @@ void l1::unfill()
 
 void* l1::alloc_page_local()
 {
+#if CONF_lazy_stack_invariant
+    assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     SCOPE_LOCK(preempt_lock);
     auto& pbuf = get_l1();
     if (pbuf.nr < pbuf.watermark_lo) {
@@ -1424,6 +1466,12 @@ void* l1::alloc_page_local()
 
 bool l1::free_page_local(void* v)
 {
+#if CONF_lazy_stack_invariant
+    assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     SCOPE_LOCK(preempt_lock);
     auto& pbuf = get_l1();
     if (pbuf.nr > pbuf.watermark_hi) {
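Note that the asserts added inside the DROP_LOCK(preempt_lock) blocks above
check only the invariant and do not pre-fault the stack: within a DROP_LOCK
block the lock is temporarily released, so preemption is enabled again there,
which is exactly the property the commit message relies on when it argues
these paths are meant to run with preemption enabled. A minimal sketch of the
WITH_LOCK/DROP_LOCK nesting, using std::mutex as a simplified stand-in for
OSv's preempt_lock and its macros (an assumption for illustration only, not
OSv's actual implementation):

#include <mutex>

void page_batch_path_sketch(std::mutex& preempt_lock_like)
{
    std::unique_lock<std::mutex> guard(preempt_lock_like); // WITH_LOCK(preempt_lock)
    // ... fast path that must not be preempted ...
    {
        guard.unlock();   // DROP_LOCK(preempt_lock): "preemption" enabled again,
                          // so it is safe to assert that and to sleep/allocate
        // ... slow path, e.g. refilling or unfilling a page batch ...
        guard.lock();     // end of DROP_LOCK block: lock re-acquired
    }
    // WITH_LOCK scope ends: lock released by unique_lock's destructor
}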

core/rcu.cc

Lines changed: 6 additions & 0 deletions
@@ -196,6 +196,12 @@ using namespace rcu;
 
 void rcu_defer(std::function<void ()>&& func)
 {
+#if CONF_lazy_stack_invariant
+    assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     WITH_LOCK(preempt_lock) {
         auto p = &*percpu_callbacks;
         while (p->ncallbacks[p->buf] == p->callbacks[p->buf].size()) {

core/sched.cc

Lines changed: 52 additions & 0 deletions
@@ -704,6 +704,12 @@ void thread::unpin()
     // to pin, unpin, or migrate the same thread, we need to run the actual
     // unpinning code on the same CPU as the target thread.
     if (this == current()) {
+#if CONF_lazy_stack_invariant
+        assert(arch::irq_enabled() && sched::preemptable());
+#endif
+#if CONF_lazy_stack
+        arch::ensure_next_stack_page();
+#endif
         WITH_LOCK(preempt_lock) {
             if (_pinned) {
                 _pinned = false;
@@ -882,6 +888,12 @@ static thread_runtime::duration total_app_time_exited(0);
 
 thread_runtime::duration thread::thread_clock() {
     if (this == current()) {
+#if CONF_lazy_stack_invariant
+        assert(arch::irq_enabled() && sched::preemptable());
+#endif
+#if CONF_lazy_stack
+        arch::ensure_next_stack_page();
+#endif
         WITH_LOCK (preempt_lock) {
             // Inside preempt_lock, we are running and the scheduler can't
             // intervene and change _total_cpu_time or _running_since
@@ -1158,6 +1170,13 @@ void thread::prepare_wait()
 {
     // After setting the thread's status to "waiting", we must not preempt it,
     // as it is no longer in "running" state and therefore will not return.
+#if CONF_lazy_stack_invariant
+    assert(arch::irq_enabled());
+    assert(sched::preemptable());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     preempt_disable();
     assert(_detached_state->st.load() == status::running);
     _detached_state->st.store(status::waiting);
@@ -1257,6 +1276,12 @@ void thread::wake_with_irq_disabled()
 void thread::wake_lock(mutex* mtx, wait_record* wr)
 {
     // must be called with mtx held
+#if CONF_lazy_stack_invariant
+    assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     WITH_LOCK(rcu_read_lock) {
         auto st = _detached_state.get();
         // We want to send_lock() to this thread, but we want to be sure we're the only
@@ -1283,6 +1308,12 @@ void thread::wake_lock(mutex* mtx, wait_record* wr)
 
 bool thread::unsafe_stop()
 {
+#if CONF_lazy_stack_invariant
+    assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     WITH_LOCK(rcu_read_lock) {
         auto st = _detached_state.get();
         auto expected = status::waiting;
@@ -1338,6 +1369,13 @@ void thread::complete()
     }
     // If this thread gets preempted after changing status it will never be
     // scheduled again to set terminating_thread. So must disable preemption.
+#if CONF_lazy_stack_invariant
+    assert(arch::irq_enabled());
+    assert(preemptable());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     preempt_disable();
     _detached_state->st.store(status::terminating);
     // We want to run destroy() here, but can't because it cause the stack we're
@@ -1462,6 +1500,13 @@ void thread::sleep_impl(timer &t)
 
 void thread_handle::wake()
 {
+#if CONF_lazy_stack_invariant
+    assert(arch::irq_enabled());
+    assert(sched::preemptable());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     WITH_LOCK(rcu_read_lock) {
         thread::detached_state* ds = _t.read();
         if (ds) {
@@ -1589,6 +1634,13 @@ void timer_base::set_with_irq_disabled(osv::clock::uptime::time_point time)
 void timer_base::set(osv::clock::uptime::time_point time)
 {
     trace_timer_set(this, time.time_since_epoch().count());
+#if CONF_lazy_stack_invariant
+    assert(arch::irq_enabled());
+    assert(sched::preemptable());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     irq_save_lock_type irq_lock;
     WITH_LOCK(irq_lock) {
         _state = state::armed;

fs/vfs/kern_descrip.cc

Lines changed: 6 additions & 0 deletions
@@ -145,6 +145,12 @@ int fget(int fd, struct file **out_fp)
     if (fd < 0 || fd >= FDMAX)
         return EBADF;
 
+#if CONF_lazy_stack_invariant
+    assert(sched::preemptable() && arch::irq_enabled());
+#endif
+#if CONF_lazy_stack
+    arch::ensure_next_stack_page();
+#endif
     WITH_LOCK(rcu_read_lock) {
         fp = gfdt[fd].read();
         if (fp == nullptr) {
