Please note this is an optimized and final version of the option 1 patch sent earlier.
On Wednesday, January 29, 2020 at 8:14:30 AM UTC-5, Waldek Kozaczuk wrote:
>
> As issue #143 explains, application stacks are currently eagerly allocated
> (aka pre-populated), which may lead to substantial memory waste. We want
> those stacks to be lazily allocated instead, so that the physical memory
> behind a stack gets allocated as needed by the standard fault handling
> mechanism.
>
> The page fault handling logic requires that both interrupts and preemption
> are enabled. Therefore this patch implements a simple strategy: read a
> single byte on the stack one page (4K) deeper before disabling interrupts
> or preemption.
>
> It turns out we cannot read that single byte "blindly", because in some
> scenarios interrupts might already be disabled when we are about to disable
> preemption, and vice versa. Similarly, and more speculatively, disabling
> preemption (and possibly interrupts) may nest. Therefore we need a
> "conditional" read of that byte on the stack. More specifically, we should
> ONLY read the byte IF both interrupts and preemption are enabled.
>
> This patch achieves that by adding an extra thread-local counter to track
> interrupt disabling/enabling, which operates the same way the preemption
> counter does. Every time interrupts are disabled we increment the counter,
> and we decrement it every time interrupts are re-enabled. Then, before we
> disable interrupts or preemption, we check whether both counters are 0 and
> only then try to read a byte to potentially trigger a fault.
>
> Please note that performance-wise this patch is slightly more expensive
> than originally hoped. As the disassembled snippets illustrate, it costs an
> extra 3-4 instructions every time we disable preemption or interrupts.
>
> A. Disabling preemption in __tls_get_addr before the patch:
> ```
> 4035cae8: e8 83 f6 ff ff          callq  4035c170 <elf::get_program()>
> 4035caed: 4c 8b 2d 6c 40 51 00    mov    0x51406c(%rip),%r13   # 40870b60 <sched::preempt_counter+0x40870b60>
> 4035caf4: 4c 8b 33                mov    (%rbx),%r14
> 4035caf7: 64 41 83 45 00 01       addl   $0x1,%fs:0x0(%r13)
> 4035cafd: e8 6e f6 ff ff          callq  4035c170 <elf::get_program()>
> ```
>
> B. Disabling preemption in __tls_get_addr after the patch:
> ```
> 4035fdd8: e8 63 f6 ff ff          callq  4035f440 <elf::get_program()>
> 4035fddd: 4c 8b 2d a4 4d 51 00    mov    0x514da4(%rip),%r13   # 40874b88 <arch::irq_preempt_counters+0x40874b88>
> 4035fde4: 4c 8b 33                mov    (%rbx),%r14
> 4035fde7: 64 49 83 7d 00 00       cmpq   $0x0,%fs:0x0(%r13)
> 4035fded: 75 07                   jne    4035fdf6 <__tls_get_addr+0x76>
> 4035fdef: 8a 84 24 00 f0 ff ff    mov    -0x1000(%rsp),%al
> 4035fdf6: 64 41 83 45 00 01       addl   $0x1,%fs:0x0(%r13)
> 4035fdfc: e8 3f f6 ff ff          callq  4035f440 <elf::get_program()>
> ```
>
> As an example of the memory savings, tomcat with ~300 threads ended up
> using 365MB instead of 620MB before the patch.
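To make the idea above easier to follow before diving into the diff, here is a minimal standalone sketch of the counter scheme (my simplification, not the actual OSv code -- the names counters, any_is_on, irq_preempt_counters and ensure_next_stack_page mirror the patch below, everything else is illustrative):
```
// Standalone sketch (not the actual OSv code -- see the diff below).
// Assumes x86-64 and the GCC/Clang union-aliasing behaviour the patch relies on.
#include <cstdint>

namespace sketch {

// Colocating both 32-bit counters in one union lets a single 64-bit
// load of any_is_on answer "is either counter non-zero?".
union counters {
    struct {
        unsigned preempt;   // non-zero while preemption is disabled
        unsigned irq;       // non-zero while interrupts are disabled
    };
    uint64_t any_is_on;     // overlays both counters
};

// For brevity both counters start at zero here; the real patch starts them
// at non-zero defaults and lowers them per thread (see
// irq_counter_default_init_value / irq_counter_lazy_stack_init_value below).
thread_local counters irq_preempt_counters = {{0, 0}};

// Touch the stack one page (4K) below the current stack pointer, but only
// when a page fault could still be serviced, i.e. when neither interrupts
// nor preemption are currently disabled.
inline void ensure_next_stack_page()
{
    if (irq_preempt_counters.any_is_on) {
        return;
    }
    char probe;
    asm volatile("movb -4096(%%rsp), %0" : "=r"(probe));
    (void)probe;
}

inline void preempt_disable_sketch()
{
    ensure_next_stack_page();        // pre-fault before it is too late
    ++irq_preempt_counters.preempt;  // lazy-stack probing now off
}

inline void preempt_enable_sketch()
{
    --irq_preempt_counters.preempt;  // probing possible again once zero
}

} // namespace sketch
```
Because the 64-bit any_is_on field overlays both 32-bit counters, it is non-zero exactly when at least one counter is, which is what allows the single cmpq/jne pair visible in snippet B above.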
>
> Fixes #143
>
> Signed-off-by: Matthew Pabst <[email protected]>
> Signed-off-by: Waldemar Kozaczuk <[email protected]>
> ---
>  arch/aarch64/arch.hh    |  6 ++++++
>  arch/x64/arch-switch.hh | 10 ++++++++++
>  arch/x64/arch.hh        | 20 ++++++++++++++++++++
>  core/sched.cc           |  5 ++++-
>  include/osv/counters.hh | 31 +++++++++++++++++++++++++++++++
>  include/osv/irqlock.hh  |  2 ++
>  include/osv/mmu-defs.hh |  3 +--
>  include/osv/sched.hh    |  9 +++++----
>  libc/mman.cc            |  9 ++++-----
>  libc/pthread.cc         |  5 +++++
>  10 files changed, 88 insertions(+), 12 deletions(-)
>  create mode 100644 include/osv/counters.hh
>
> diff --git a/arch/aarch64/arch.hh b/arch/aarch64/arch.hh
> index 855f1987..7dae53f5 100644
> --- a/arch/aarch64/arch.hh
> +++ b/arch/aarch64/arch.hh
> @@ -11,6 +11,8 @@
>  #define ARCH_HH_
>
>  #include "processor.hh"
> +#include <osv/barrier.hh>
> +#include <osv/counters.hh>
>
>  // architecture independent interface for architecture dependent operations
>
> @@ -20,6 +22,10 @@ namespace arch {
>  #define INSTR_SIZE_MIN 4
>  #define ELF_IMAGE_START (OSV_KERNEL_BASE + 0x10000)
>
> +inline void ensure_next_stack_page() {
> +    //TODO: Implement lazy stack check for AARCH64
> +}
> +
>  inline void irq_disable()
>  {
>      processor::irq_disable();
> diff --git a/arch/x64/arch-switch.hh b/arch/x64/arch-switch.hh
> index 6803498f..048beb6f 100644
> --- a/arch/x64/arch-switch.hh
> +++ b/arch/x64/arch-switch.hh
> @@ -148,6 +148,13 @@ void thread::init_stack()
>      _state.rip = reinterpret_cast<void*>(thread_main);
>      _state.rsp = stacktop;
>      _state.exception_stack = _arch.exception_stack + sizeof(_arch.exception_stack);
> +
> +    if (stack.lazy) {
> +        // If the thread stack is setup to get lazily allocated and given the thread initially starts
> +        // running with preemption disabled, we need to pre-fault the first stack page.
> +        volatile char r = *((char *) (stacktop - 1));
> +        (void) r; // trick the compiler into thinking that r is used
> +    }
>  }
>
>  void thread::setup_tcb()
> @@ -311,6 +318,9 @@ void thread::free_tcb()
>
>  void thread_main_c(thread* t)
>  {
> +    if (t->get_stack_info().lazy) {
> +        arch::irq_preempt_counters.irq = arch::irq_counter_lazy_stack_init_value;
> +    }
>      arch::irq_enable();
>  #ifdef CONF_preempt
>      preempt_enable();
> diff --git a/arch/x64/arch.hh b/arch/x64/arch.hh
> index 17df5f5c..5cadc562 100644
> --- a/arch/x64/arch.hh
> +++ b/arch/x64/arch.hh
> @@ -10,6 +10,8 @@
>
>  #include "processor.hh"
>  #include "msr.hh"
> +#include <osv/barrier.hh>
> +#include <osv/counters.hh>
>
>  // namespace arch - architecture independent interface for architecture
>  // dependent operations (e.g. irq_disable vs. cli)
> @@ -20,8 +22,19 @@ namespace arch {
>  #define INSTR_SIZE_MIN 1
>  #define ELF_IMAGE_START OSV_KERNEL_BASE
>
> +inline void ensure_next_stack_page() {
> +    if (irq_preempt_counters.any_is_on) {
> +        return;
> +    }
> +
> +    char i;
> +    asm volatile("movb -4096(%%rsp), %0" : "=r"(i));
> +}
> +
>  inline void irq_disable()
>  {
> +    ensure_next_stack_page();
> +    ++irq_preempt_counters.irq;
>      processor::cli();
>  }
>
> @@ -36,11 +49,15 @@ inline void irq_disable_notrace()
>  inline void irq_enable()
>  {
>      processor::sti();
> +    --irq_preempt_counters.irq;
> +    barrier();
>  }
>
>  inline void wait_for_interrupt()
>  {
>      processor::sti_hlt();
> +    --irq_preempt_counters.irq;
> +    barrier();
>  }
>
>  inline void halt_no_interrupts()
> @@ -78,12 +95,15 @@ private:
>
>  inline void irq_flag_notrace::save()
>  {
> +    ensure_next_stack_page();
> +    ++irq_preempt_counters.irq;
>      asm volatile("sub $128, %%rsp; pushfq; popq %0; add $128, %%rsp" : "=r"(_rflags));
>  }
>
>  inline void irq_flag_notrace::restore()
>  {
>      asm volatile("sub $128, %%rsp; pushq %0; popfq; add $128, %%rsp" : : "r"(_rflags));
> +    --irq_preempt_counters.irq;
>  }
>
>  inline bool irq_flag_notrace::enabled() const
> diff --git a/core/sched.cc b/core/sched.cc
> index 06f849d1..295e3de0 100644
> --- a/core/sched.cc
> +++ b/core/sched.cc
> @@ -42,6 +42,10 @@ extern char _percpu_start[], _percpu_end[];
>  using namespace osv;
>  using namespace osv::clock::literals;
>
> +namespace arch {
> +counters __thread irq_preempt_counters = {{1, irq_counter_default_init_value}};
> +}
> +
>  namespace sched {
>
>  TRACEPOINT(trace_sched_idle, "");
> @@ -69,7 +73,6 @@ std::vector<cpu*> cpus __attribute__((init_priority((int)init_prio::cpus)));
>  thread __thread * s_current;
>  cpu __thread * current_cpu;
>
> -unsigned __thread preempt_counter = 1;
>  bool __thread need_reschedule = false;
>
>  elf::tls_data tls;
> diff --git a/include/osv/counters.hh b/include/osv/counters.hh
> new file mode 100644
> index 00000000..38d40fb6
> --- /dev/null
> +++ b/include/osv/counters.hh
> @@ -0,0 +1,31 @@
> +/*
> + * Copyright (C) 2020 Waldemar Kozaczuk
> + *
> + * This work is open source software, licensed under the terms of the
> + * BSD license as described in the LICENSE file in the top-level directory.
> + */
> +
> +#ifndef OSV_COUNTERS_HH
> +#define OSV_COUNTERS_HH
> +
> +namespace arch {
> +
> +// Both preempt and irq counters are colocated next to each other in this
> +// union by design to let compiler optimize ensure_next_stack_page() so
> +// that it can check if any counter is non-zero by checking single 64-bit any_is_on field
> +union counters {
> +    struct {
> +        unsigned preempt;
> +        unsigned irq;
> +    };
> +    uint64_t any_is_on;
> +};
> +
> +extern counters __thread irq_preempt_counters;
> +
> +constexpr unsigned irq_counter_default_init_value = 11;
> +constexpr unsigned irq_counter_lazy_stack_init_value = 1;
> +
> +}
> +
> +#endif //OSV_COUNTERS_HH
> diff --git a/include/osv/irqlock.hh b/include/osv/irqlock.hh
> index 2d5372fc..1a163e45 100644
> --- a/include/osv/irqlock.hh
> +++ b/include/osv/irqlock.hh
> @@ -34,6 +34,8 @@ inline void irq_save_lock_type::lock()
>  inline void irq_save_lock_type::unlock()
>  {
>      _flags.restore();
> +    --arch::irq_preempt_counters.irq;
> +    barrier();
>  }
>
>  extern irq_lock_type irq_lock;
> diff --git a/include/osv/mmu-defs.hh b/include/osv/mmu-defs.hh
> index 18edf441..a731ed6e 100644
> --- a/include/osv/mmu-defs.hh
> +++ b/include/osv/mmu-defs.hh
> @@ -15,8 +15,6 @@
>
>  struct exception_frame;
>
> -struct exception_frame;
> -
>  namespace mmu {
>
>  constexpr uintptr_t page_size = 4096;
> @@ -84,6 +82,7 @@ enum {
>      mmap_small       = 1ul << 5,
>      mmap_jvm_balloon = 1ul << 6,
>      mmap_file        = 1ul << 7,
> +    mmap_stack       = 1ul << 8,
>  };
>
>  enum {
> diff --git a/include/osv/sched.hh b/include/osv/sched.hh
> index 0fb44f77..d0e9b00f 100644
> --- a/include/osv/sched.hh
> +++ b/include/osv/sched.hh
> @@ -335,6 +335,7 @@ public:
>          void* begin;
>          size_t size;
>          void (*deleter)(stack_info si); // null: don't delete
> +        bool lazy = false;
>          static void default_deleter(stack_info si);
>      };
>      struct attr {
> @@ -978,14 +979,13 @@ inline void release(mutex_t* mtx)
>      }
>  }
>
> -extern unsigned __thread preempt_counter;
>  extern bool __thread need_reschedule;
>
>  #ifdef __OSV_CORE__
>  inline unsigned int get_preempt_counter()
>  {
>      barrier();
> -    return preempt_counter;
> +    return arch::irq_preempt_counters.preempt;
>  }
>
>  inline bool preemptable()
> @@ -1005,14 +1005,15 @@ inline void preempt()
>
>  inline void preempt_disable()
>  {
> -    ++preempt_counter;
> +    arch::ensure_next_stack_page();
> +    ++arch::irq_preempt_counters.preempt;
>      barrier();
>  }
>
>  inline void preempt_enable()
>  {
>      barrier();
> -    --preempt_counter;
> +    --arch::irq_preempt_counters.preempt;
>      if (preemptable() && need_reschedule && arch::irq_enabled()) {
>          cpu::schedule();
>      }
> diff --git a/libc/mman.cc b/libc/mman.cc
> index d0803ac4..d00befc3 100644
> --- a/libc/mman.cc
> +++ b/libc/mman.cc
> @@ -38,12 +38,11 @@ unsigned libc_flags_to_mmap(int flags)
>          mmap_flags |= mmu::mmap_populate;
>      }
>      if (flags & MAP_STACK) {
> -        // OSv currently requires that stacks be pinned (see issue #143). So
> -        // if an application wants to mmap() a stack for pthread_attr_setstack
> -        // and did us the courtesy of telling this to ue (via MAP_STACK),
> -        // let's return the courtesy by returning pre-faulted memory.
> -        // FIXME: If issue #143 is fixed, this workaround should be removed.
> +#ifdef AARCH64_PORT_STUB
>          mmap_flags |= mmu::mmap_populate;
> +#else
> +        mmap_flags |= mmu::mmap_stack;
> +#endif
>      }
>      if (flags & MAP_SHARED) {
>          mmap_flags |= mmu::mmap_shared;
> diff --git a/libc/pthread.cc b/libc/pthread.cc
> index 8c976bf6..5c00e8b9 100644
> --- a/libc/pthread.cc
> +++ b/libc/pthread.cc
> @@ -140,10 +140,15 @@ namespace pthread_private {
>              return {attr.stack_begin, attr.stack_size};
>          }
>          size_t size = attr.stack_size;
> +#ifdef AARCH64_PORT_STUB
>          void *addr = mmu::map_anon(nullptr, size, mmu::mmap_populate, mmu::perm_rw);
> +#else
> +        void *addr = mmu::map_anon(nullptr, size, mmu::mmap_stack, mmu::perm_rw);
> +#endif
>          mmu::mprotect(addr, attr.guard_size, 0);
>          sched::thread::stack_info si{addr, size};
>          si.deleter = free_stack;
> +        si.lazy = true;
>          return si;
>      }
>
> --
> 2.20.1
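On the application-visible side of the libc/mman.cc and libc/pthread.cc hunks: a program that maps its own stack with MAP_STACK and passes it to pthread_attr_setstack() now gets a lazily backed mapping on x86-64 (mmu::mmap_stack) instead of a pre-populated one. A small hypothetical snippet of that pattern (illustration only, not part of the patch or its tests):
```
// Hypothetical guest application snippet -- not part of the patch.
// With this patch, on x86-64, the MAP_STACK mapping below is backed lazily,
// so only the stack pages the thread actually touches consume physical memory.
#include <pthread.h>
#include <sys/mman.h>
#include <cstdio>

static void* worker(void*)
{
    char buf[8192];
    buf[0] = 1;          // touches (and faults in) only a page or two
    return nullptr;
}

int main()
{
    const size_t stack_size = 1024 * 1024;   // 1MB stack
    void* stack = mmap(nullptr, stack_size, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
    if (stack == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    pthread_attr_t attr;
    pthread_attr_init(&attr);
    pthread_attr_setstack(&attr, stack, stack_size);

    pthread_t t;
    if (pthread_create(&t, &attr, worker, nullptr) != 0) {
        std::fprintf(stderr, "pthread_create failed\n");
        return 1;
    }
    pthread_join(t, nullptr);
    pthread_attr_destroy(&attr);
    munmap(stack, stack_size);
    return 0;
}
```
Threads created with default pthread attributes take the libc/pthread.cc path instead, where the stack is mapped with mmu::mmap_stack and marked si.lazy = true, so they get the same lazy behavior without any application changes.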
