Unfortunately, after more extensive testing (even though all unit tests
pass), I have realized this patch has some major, but hopefully repairable,
flaws.
I tried to run the http test (cd modules/httpserver-api && make
check-http) and the app hung. After some debugging, I found this stack
trace that explains why:
#0 sched::thread::switch_to (this=0xffff8000009c8040,
this@entry=0xffff80007fee1040) at arch/x64/arch-switch.hh:108
#1 0x000000004040c57a in sched::cpu::reschedule_from_interrupt
(this=0xffff80000001f040, called_from_yield=called_from_yield@entry=false,
preempt_after=..., preempt_after@entry=...) at core/sched.cc:339
#2 0x000000004040d2e8 in sched::cpu::schedule () at
include/osv/sched.hh:1316
#3 0x000000004040d406 in sched::thread::wait
(this=this@entry=0xffff800003fa8040) at core/sched.cc:1216
#4 0x000000004043a856 in sched::thread::do_wait_for<lockfree::mutex,
sched::wait_object<waitqueue> > (mtx=...) at include/osv/mutex.h:41
#5 sched::thread::wait_for<waitqueue&> (mtx=...) at
include/osv/sched.hh:1226
#6 waitqueue::wait (this=this@entry=0x408f04d0 <mmu::vma_list_mutex+48>,
mtx=...) at core/waitqueue.cc:56
#7 0x00000000403ea41b in rwlock::reader_wait_lockable (this=<optimized
out>) at core/rwlock.cc:174
#8 rwlock::rlock (this=this@entry=0x408f04a0 <mmu::vma_list_mutex>) at
core/rwlock.cc:29
#9 0x000000004034cbac in rwlock_for_read::lock (this=0x408f04a0
<mmu::vma_list_mutex>) at include/osv/rwlock.h:113
#10 std::lock_guard<rwlock_for_read&>::lock_guard (__m=..., this=<synthetic
pointer>) at /usr/include/c++/9/bits/std_mutex.h:159
#11 lock_guard_for_with_lock<rwlock_for_read&>::lock_guard_for_with_lock
(lock=..., this=<synthetic pointer>) at include/osv/mutex.h:89
#12 mmu::vm_fault (addr=35184666537984, addr@entry=35184666541728,
ef=ef@entry=0xffff800003fad068) at core/mmu.cc:1333
#13 0x00000000403ad539 in page_fault (ef=0xffff800003fad068) at
arch/x64/mmu.cc:42
#14 <signal handler called>
#15 arch::ensure_next_stack_page () at arch/x64/arch.hh:37
#16 sched::preempt_disable () at include/osv/sched.hh:1008
#17 preempt_lock_t::lock (this=<optimized out>) at
include/osv/preempt-lock.hh:15
#18 std::lock_guard<preempt_lock_t>::lock_guard (__m=..., this=<synthetic
pointer>) at /usr/include/c++/9/bits/std_mutex.h:159
#19 lock_guard_for_with_lock<preempt_lock_t>::lock_guard_for_with_lock
(lock=..., this=<synthetic pointer>) at include/osv/mutex.h:89
#20 memory::pool::alloc (this=0x40907608 <memory::malloc_pools+168>) at
core/mempool.cc:214
#21 0x00000000403f936f in std_malloc (size=80, alignment=16) at
core/mempool.cc:1679
#22 0x00000000403f97db in malloc (size=80) at core/mempool.cc:1887
#23 0x00000000404c0089 in operator new(unsigned long) ()
#24 0x000000004034c9d2 in mmu::anon_vma::split (this=0xffffa00001f69e80,
edge=4136108032) at include/osv/addr_range.hh:16
#25 0x000000004034ef93 in mmu::evacuate (start=<optimized out>,
end=<optimized out>) at /usr/include/boost/move/detail/meta_utils.hpp:267
#26 0x000000004035007c in mmu::allocate (v=v@entry=0xffffa000050e4600,
start=start@entry=4127195136, size=size@entry=8912896,
search=search@entry=false)
at core/mmu.cc:1116
#27 0x0000000040350e87 in mmu::map_anon (addr=addr@entry=0xf6000000,
size=size@entry=8912896, flags=flags@entry=1, perm=perm@entry=3) at
core/mmu.cc:1219
#28 0x000000004047d9f0 in mmap (addr=0xf6000000, length=8912896,
prot=<optimized out>, flags=<optimized out>, fd=<optimized out>, offset=0)
at libc/mman.cc:152
#29 0x0000100000f2dcef in os::Linux::commit_memory_impl(char*, unsigned
long, bool) ()
#30 0x0000100000f2dfe9 in os::pd_commit_memory(char*, unsigned long,
unsigned long, bool) ()
#31 0x0000100000f22a4a in os::commit_memory(char*, unsigned long, unsigned
long, bool) ()
#32 0x0000100000f956db in PSVirtualSpace::expand_by(unsigned long) ()
#33 0x0000100000f96998 in PSYoungGen::resize_generation(unsigned long,
unsigned long) ()
#34 0x0000100000f95ad2 in PSYoungGen::resize(unsigned long, unsigned long)
()
#35 0x0000100000f92e9f in PSScavenge::invoke_no_policy() ()
#36 0x0000100000f93673 in PSScavenge::invoke() ()
#37 0x0000100000f4de5d in
ParallelScavengeHeap::failed_mem_allocate(unsigned long) ()
#38 0x00001000010cfd97 in VM_ParallelGCFailedAllocation::doit() ()
#39 0x00001000010d7572 in VM_Operation::evaluate() ()
#40 0x00001000010d526b in VMThread::evaluate_operation(VM_Operation*) ()
#41 0x00001000010d65ab in VMThread::loop() ()
#42 0x00001000010d6a13 in VMThread::run() ()
#43 0x0000100000f2e152 in java_start(Thread*) ()
#44 0x00000000404773ba in pthread_private::pthread::<lambda()>::operator()
(__closure=0xffffa00002ddbd00) at libc/pthread.cc:115
#45 std::_Function_handler<void(), pthread_private::pthread::pthread(void*
(*)(void*), void*, sigset_t, const
pthread_private::thread_attr*)::<lambda()> >::_M_invoke(const
std::_Any_data &) (__functor=...) at
/usr/include/c++/9/bits/std_function.h:300
#46 0x000000004040da1c in sched::thread_main_c (t=0xffff800003fa8040) at
arch/x64/arch-switch.hh:326
#47 0x00000000403ad2f3 in thread_main () at arch/x64/entry.S:113
First of all, I realized that reschedule_from_interrupt() happens within the
context of irq_lock, which ends up calling the irq disable/enable methods
that increment/decrement the thread-local irq counter. This means that, at
least in this case where irq_lock is used (and I think in most others), the
counter gets incremented on a different thread than the one that decrements
it. It looks like I have just been lucky that all other tests have worked so
far - or possibly, in most places where interrupts are disabled/enabled, not
much stack is used. But clearly I have to either replace this counter
mechanism with something else or adjust it.
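To illustrate the problem, here is a minimal sketch of mine (not actual OSv
code - the names are stand-ins for arch::irq_disable()/irq_enable() and
arch::irq_preempt_counters.irq) of how a thread-local counter goes out of
sync when a context switch happens between the increment and the matching
decrement:
```cpp
// Thread-local counter analogous to the one added by the patch.
thread_local unsigned irq_counter = 0;

void irq_disable_sketch() { ++irq_counter; /* cli */ }
void irq_enable_sketch()  { /* sti */ --irq_counter; }

void reschedule_sketch()
{
    irq_disable_sketch();   // running as thread A: A's counter goes 0 -> 1
    // switch_to(next);     // reschedule_from_interrupt() switches threads here;
                            // from this point we execute as thread B, so the
                            // thread_local above now resolves to B's copy
    irq_enable_sketch();    // decrements B's copy (possibly underflowing it)
                            // while A's copy stays stuck at 1
}
```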
The first bug, however, is not really the reason for the deadlock you see
above between frame 27 (core/mmu.cc:1219) and frame 12 (core/mmu.cc:1333). I
think we have a situation where an ongoing memory allocation (see
memory::pool::alloc) requires disabling preemption, which calls
ensure_next_stack_page(), which triggers a fault whose handling ends up
trying to acquire the same lock - mmu::vma_list_mutex - that the mmap path
higher up the stack already holds.
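Condensing the backtrace (and assuming, as the trace suggests, that the mmap
path already holds mmu::vma_list_mutex for write), the self-deadlock looks
roughly like this:
```
mmap()                                       // frame 28, libc/mman.cc
 -> mmu::map_anon() -> mmu::allocate()       // frames 27/26, vma_list_mutex taken for write
  -> mmu::evacuate() -> anon_vma::split()    // frames 25/24
   -> operator new -> memory::pool::alloc()  // frames 23..20
    -> preempt_disable()                     // frame 16
     -> arch::ensure_next_stack_page()       // frame 15, reads -4096(%rsp)
      -> page fault -> mmu::vm_fault()       // frames 13/12
       -> rwlock::rlock(vma_list_mutex)      // frames 9..7, blocks forever on the
                                             // lock this same thread already holds
```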
I am not sure what the right way to fix it is. One way to prevent this
deadlock could be to prevent the fault itself. I experimented a bit, and
adding these 3 lines to mmap() to trigger the fault earlier and ensure
2 pages of stack made the deadlock go away:
--- a/libc/mman.cc
+++ b/libc/mman.cc
@@ -154,6 +149,9 @@ void *mmap(void *addr, size_t length, int prot, int flags,
         }
     }
     try {
+        char i;
+        asm volatile("movb -4096(%%rsp), %0" : "=r"(i));
+        asm volatile("movb -8096(%%rsp), %0" : "=r"(i));
         ret = mmu::map_anon(addr, length, mmap_flags, mmap_perm);
     } catch (error& err) {
         err.to_libc(); // sets errno
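If pre-faulting turns out to be the right approach in more places, those two
reads could be factored into a small helper along these lines (just a sketch
of mine, not part of the patch; the name is made up, and note that the -8096
in the snippet above should probably have been -8192, i.e. two full 4K pages):
```cpp
// Hypothetical helper: touch one byte in each of the next N stack pages so
// that any fault happens here, before any lock the fault handler needs
// (e.g. mmu::vma_list_mutex) is taken further down the call chain.
static inline void prefault_stack_pages(unsigned pages)
{
    char* sp;
    asm volatile("movq %%rsp, %0" : "=r"(sp));   // current stack pointer
    for (unsigned i = 1; i <= pages; i++) {
        volatile char c = *(sp - i * 4096);      // one read per 4K page below rsp
        (void)c;
    }
}
```
mmap() would then call prefault_stack_pages(2) instead of the two
hand-written movb reads.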
Now the most troubling question is whether this is an example of a situation
where memory allocation causes a page fault via ensure_next_stack_page() on
disabling preemption or interrupts, and we just need to pre-fault deeper in
all such cases (hopefully not too many), or whether there are many other
types of scenarios we have not foreseen.
Waldek
On Wednesday, January 29, 2020 at 8:14:30 AM UTC-5, Waldek Kozaczuk wrote:
>
> As the issue #143 explains, currently application stacks are eagerly
> allocated (aka pre-populated), which may lead to substantial memory waste.
> We want those stacks to be lazily allocated instead, so that physical
> memory behind the stack gets allocated as needed by the standard fault
> handling mechanism.
>
> The page fault handling logic requires that both interrupts and preemption
> are enabled. Therefore this patch implements a simple strategy: read a
> single byte on the stack one page (4K) deeper before disabling interrupts
> or preemption.
>
> It turns out we cannot read that single byte "blindly", because in some
> scenarios interrupts might already be disabled when we are about to disable
> preemption, and vice versa. Also, similarly (though this is more
> speculative), disabling preemption (and possibly interrupts) may nest.
> Therefore we need a "conditional" read of that byte on the stack; more
> specifically, we should ONLY read the byte IF both interrupts and
> preemption are enabled.
>
> This patch achieves this by adding an extra thread-local counter to track
> interrupt disabling/enabling that operates in a similar way to the
> preemption counter. So every time interrupts are disabled we increment the
> counter, and we decrement it every time interrupts are enabled back. Then,
> every time before we disable interrupts or preemption, we check if both
> counters are 0 and only then read a byte to potentially trigger a fault.
>
> Please note that performance-wise this patch is slightly more expensive
> than originally hoped. As the disassembled snippets illustrate, it costs an
> extra 3-4 instructions every time we disable preemption or interrupts.
>
> A. Disabling preemption in __tls_get_addr before the patch:
> ```
> 4035cae8: e8 83 f6 ff ff callq 4035c170 <elf::get_program()>
> 4035caed: 4c 8b 2d 6c 40 51 00 mov 0x51406c(%rip),%r13 # 40870b60 <sched::preempt_counter+0x40870b60>
> 4035caf4: 4c 8b 33 mov (%rbx),%r14
> 4035caf7: 64 41 83 45 00 01 addl $0x1,%fs:0x0(%r13)
> 4035cafd: e8 6e f6 ff ff callq 4035c170 <elf::get_program()>
> ```
>
> B. Disabling preemption in __tls_get_addr after the patch:
> ```
> 4035fdd8: e8 63 f6 ff ff callq 4035f440 <elf::get_program()>
> 4035fddd: 4c 8b 2d a4 4d 51 00 mov 0x514da4(%rip),%r13 # 40874b88 <arch::irq_preempt_counters+0x40874b88>
> 4035fde4: 4c 8b 33 mov (%rbx),%r14
> 4035fde7: 64 49 83 7d 00 00 cmpq $0x0,%fs:0x0(%r13)
> 4035fded: 75 07 jne 4035fdf6 <__tls_get_addr+0x76>
> 4035fdef: 8a 84 24 00 f0 ff ff mov -0x1000(%rsp),%al
> 4035fdf6: 64 41 83 45 00 01 addl $0x1,%fs:0x0(%r13)
> 4035fdfc: e8 3f f6 ff ff callq 4035f440 <elf::get_program()>
> ```
>
> As an example of the memory savings, tomcat using ~300 threads ended up
> using 365MB instead of the 620MB before the patch.
>
> Fixes #143
>
> Signed-off-by: Matthew Pabst <[email protected]>
> Signed-off-by: Waldemar Kozaczuk <[email protected]>
> ---
> arch/aarch64/arch.hh | 6 ++++++
> arch/x64/arch-switch.hh | 10 ++++++++++
> arch/x64/arch.hh | 20 ++++++++++++++++++++
> core/sched.cc | 5 ++++-
> include/osv/counters.hh | 31 +++++++++++++++++++++++++++++++
> include/osv/irqlock.hh | 2 ++
> include/osv/mmu-defs.hh | 3 +--
> include/osv/sched.hh | 9 +++++----
> libc/mman.cc | 9 ++++-----
> libc/pthread.cc | 5 +++++
> 10 files changed, 88 insertions(+), 12 deletions(-)
> create mode 100644 include/osv/counters.hh
>
> diff --git a/arch/aarch64/arch.hh b/arch/aarch64/arch.hh
> index 855f1987..7dae53f5 100644
> --- a/arch/aarch64/arch.hh
> +++ b/arch/aarch64/arch.hh
> @@ -11,6 +11,8 @@
> #define ARCH_HH_
>
> #include "processor.hh"
> +#include <osv/barrier.hh>
> +#include <osv/counters.hh>
>
> // architecture independent interface for architecture dependent operations
>
> @@ -20,6 +22,10 @@ namespace arch {
> #define INSTR_SIZE_MIN 4
> #define ELF_IMAGE_START (OSV_KERNEL_BASE + 0x10000)
>
> +inline void ensure_next_stack_page() {
> + //TODO: Implement lazy stack check for AARCH64
> +}
> +
> inline void irq_disable()
> {
> processor::irq_disable();
> diff --git a/arch/x64/arch-switch.hh b/arch/x64/arch-switch.hh
> index 6803498f..048beb6f 100644
> --- a/arch/x64/arch-switch.hh
> +++ b/arch/x64/arch-switch.hh
> @@ -148,6 +148,13 @@ void thread::init_stack()
> _state.rip = reinterpret_cast<void*>(thread_main);
> _state.rsp = stacktop;
> _state.exception_stack = _arch.exception_stack + sizeof(_arch.exception_stack);
> +
> + if (stack.lazy) {
> + // If the thread stack is setup to get lazily allocated and given the thread initially starts
> + // running with preemption disabled, we need to pre-fault the first stack page.
> + volatile char r = *((char *) (stacktop - 1));
> + (void) r; // trick the compiler into thinking that r is used
> + }
> }
>
> void thread::setup_tcb()
> @@ -311,6 +318,9 @@ void thread::free_tcb()
>
> void thread_main_c(thread* t)
> {
> + if (t->get_stack_info().lazy) {
> + arch::irq_preempt_counters.irq = arch::irq_counter_lazy_stack_init_value;
> + }
> arch::irq_enable();
> #ifdef CONF_preempt
> preempt_enable();
> diff --git a/arch/x64/arch.hh b/arch/x64/arch.hh
> index 17df5f5c..5cadc562 100644
> --- a/arch/x64/arch.hh
> +++ b/arch/x64/arch.hh
> @@ -10,6 +10,8 @@
>
> #include "processor.hh"
> #include "msr.hh"
> +#include <osv/barrier.hh>
> +#include <osv/counters.hh>
>
> // namespace arch - architecture independent interface for architecture
> // dependent operations (e.g. irq_disable vs. cli)
> @@ -20,8 +22,19 @@ namespace arch {
> #define INSTR_SIZE_MIN 1
> #define ELF_IMAGE_START OSV_KERNEL_BASE
>
> +inline void ensure_next_stack_page() {
> + if (irq_preempt_counters.any_is_on) {
> + return;
> + }
> +
> + char i;
> + asm volatile("movb -4096(%%rsp), %0" : "=r"(i));
> +}
> +
> inline void irq_disable()
> {
> + ensure_next_stack_page();
> + ++irq_preempt_counters.irq;
> processor::cli();
> }
>
> @@ -36,11 +49,15 @@ inline void irq_disable_notrace()
> inline void irq_enable()
> {
> processor::sti();
> + --irq_preempt_counters.irq;
> + barrier();
> }
>
> inline void wait_for_interrupt()
> {
> processor::sti_hlt();
> + --irq_preempt_counters.irq;
> + barrier();
> }
>
> inline void halt_no_interrupts()
> @@ -78,12 +95,15 @@ private:
>
> inline void irq_flag_notrace::save()
> {
> + ensure_next_stack_page();
> + ++irq_preempt_counters.irq;
> asm volatile("sub $128, %%rsp; pushfq; popq %0; add $128, %%rsp" :
> "=r"(_rflags));
> }
>
> inline void irq_flag_notrace::restore()
> {
> asm volatile("sub $128, %%rsp; pushq %0; popfq; add $128, %%rsp" : :
> "r"(_rflags));
> + --irq_preempt_counters.irq;
> }
>
> inline bool irq_flag_notrace::enabled() const
> diff --git a/core/sched.cc b/core/sched.cc
> index 06f849d1..295e3de0 100644
> --- a/core/sched.cc
> +++ b/core/sched.cc
> @@ -42,6 +42,10 @@ extern char _percpu_start[], _percpu_end[];
> using namespace osv;
> using namespace osv::clock::literals;
>
> +namespace arch {
> +counters __thread irq_preempt_counters = {{1, irq_counter_default_init_value}};
> +}
> +
> namespace sched {
>
> TRACEPOINT(trace_sched_idle, "");
> @@ -69,7 +73,6 @@ std::vector<cpu*> cpus
> __attribute__((init_priority((int)init_prio::cpus)));
> thread __thread * s_current;
> cpu __thread * current_cpu;
>
> -unsigned __thread preempt_counter = 1;
> bool __thread need_reschedule = false;
>
> elf::tls_data tls;
> diff --git a/include/osv/counters.hh b/include/osv/counters.hh
> new file mode 100644
> index 00000000..38d40fb6
> --- /dev/null
> +++ b/include/osv/counters.hh
> @@ -0,0 +1,31 @@
> +/*
> + * Copyright (C) 2020 Waldemar Kozaczuk
> + *
> + * This work is open source software, licensed under the terms of the
> + * BSD license as described in the LICENSE file in the top-level
> directory.
> + */
> +
> +#ifndef OSV_COUNTERS_HH
> +#define OSV_COUNTERS_HH
> +
> +namespace arch {
> +
> +// Both preempt and irq counters are colocated next to each other in this
> +// union by design to let compiler optimize ensure_next_stack_page() so
> +// that it can check if any counter is non-zero by checking single 64-bit any_is_on field
> +union counters {
> + struct {
> + unsigned preempt;
> + unsigned irq;
> + };
> + uint64_t any_is_on;
> +};
> +
> +extern counters __thread irq_preempt_counters;
> +
> +constexpr unsigned irq_counter_default_init_value = 11;
> +constexpr unsigned irq_counter_lazy_stack_init_value = 1;
> +
> +}
> +
> +#endif //OSV_COUNTERS_HH
> diff --git a/include/osv/irqlock.hh b/include/osv/irqlock.hh
> index 2d5372fc..1a163e45 100644
> --- a/include/osv/irqlock.hh
> +++ b/include/osv/irqlock.hh
> @@ -34,6 +34,8 @@ inline void irq_save_lock_type::lock()
> inline void irq_save_lock_type::unlock()
> {
> _flags.restore();
> + --arch::irq_preempt_counters.irq;
> + barrier();
> }
>
> extern irq_lock_type irq_lock;
> diff --git a/include/osv/mmu-defs.hh b/include/osv/mmu-defs.hh
> index 18edf441..a731ed6e 100644
> --- a/include/osv/mmu-defs.hh
> +++ b/include/osv/mmu-defs.hh
> @@ -15,8 +15,6 @@
>
> struct exception_frame;
>
> -struct exception_frame;
> -
> namespace mmu {
>
> constexpr uintptr_t page_size = 4096;
> @@ -84,6 +82,7 @@ enum {
> mmap_small = 1ul << 5,
> mmap_jvm_balloon = 1ul << 6,
> mmap_file = 1ul << 7,
> + mmap_stack = 1ul << 8,
> };
>
> enum {
> diff --git a/include/osv/sched.hh b/include/osv/sched.hh
> index 0fb44f77..d0e9b00f 100644
> --- a/include/osv/sched.hh
> +++ b/include/osv/sched.hh
> @@ -335,6 +335,7 @@ public:
> void* begin;
> size_t size;
> void (*deleter)(stack_info si); // null: don't delete
> + bool lazy = false;
> static void default_deleter(stack_info si);
> };
> struct attr {
> @@ -978,14 +979,13 @@ inline void release(mutex_t* mtx)
> }
> }
>
> -extern unsigned __thread preempt_counter;
> extern bool __thread need_reschedule;
>
> #ifdef __OSV_CORE__
> inline unsigned int get_preempt_counter()
> {
> barrier();
> - return preempt_counter;
> + return arch::irq_preempt_counters.preempt;
> }
>
> inline bool preemptable()
> @@ -1005,14 +1005,15 @@ inline void preempt()
>
> inline void preempt_disable()
> {
> - ++preempt_counter;
> + arch::ensure_next_stack_page();
> + ++arch::irq_preempt_counters.preempt;
> barrier();
> }
>
> inline void preempt_enable()
> {
> barrier();
> - --preempt_counter;
> + --arch::irq_preempt_counters.preempt;
> if (preemptable() && need_reschedule && arch::irq_enabled()) {
> cpu::schedule();
> }
> diff --git a/libc/mman.cc b/libc/mman.cc
> index d0803ac4..d00befc3 100644
> --- a/libc/mman.cc
> +++ b/libc/mman.cc
> @@ -38,12 +38,11 @@ unsigned libc_flags_to_mmap(int flags)
> mmap_flags |= mmu::mmap_populate;
> }
> if (flags & MAP_STACK) {
> - // OSv currently requires that stacks be pinned (see issue #143). So
> - // if an application wants to mmap() a stack for pthread_attr_setstack
> - // and did us the courtesy of telling this to ue (via MAP_STACK),
> - // let's return the courtesy by returning pre-faulted memory.
> - // FIXME: If issue #143 is fixed, this workaround should be removed.
> +#ifdef AARCH64_PORT_STUB
> mmap_flags |= mmu::mmap_populate;
> +#else
> + mmap_flags |= mmu::mmap_stack;
> +#endif
> }
> if (flags & MAP_SHARED) {
> mmap_flags |= mmu::mmap_shared;
> diff --git a/libc/pthread.cc b/libc/pthread.cc
> index 8c976bf6..5c00e8b9 100644
> --- a/libc/pthread.cc
> +++ b/libc/pthread.cc
> @@ -140,10 +140,15 @@ namespace pthread_private {
> return {attr.stack_begin, attr.stack_size};
> }
> size_t size = attr.stack_size;
> +#ifdef AARCH64_PORT_STUB
> void *addr = mmu::map_anon(nullptr, size, mmu::mmap_populate, mmu::perm_rw);
> +#else
> + void *addr = mmu::map_anon(nullptr, size, mmu::mmap_stack, mmu::perm_rw);
> +#endif
> mmu::mprotect(addr, attr.guard_size, 0);
> sched::thread::stack_info si{addr, size};
> si.deleter = free_stack;
> + si.lazy = true;
> return si;
> }
>
> --
> 2.20.1
>
>