As issue #143 explains, application stacks are currently allocated eagerly (in other words, pre-populated), which may lead to substantial memory waste. We want those stacks to be allocated lazily instead, so that physical memory behind a stack gets allocated on demand by the standard page fault handling mechanism.
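In mmap terms, the intended difference looks roughly like the sketch below. This is an illustration only, not part of the patch; it uses mmu::map_anon and the new mmu::mmap_stack flag introduced by the diff further down:

```
// Eager (status quo): the whole stack is backed by physical memory up front.
void *eager_stack = mmu::map_anon(nullptr, size, mmu::mmap_populate, mmu::perm_rw);

// Lazy (this patch): only virtual address space is reserved; the page fault
// handler allocates physical pages one at a time as the stack grows down.
void *lazy_stack = mmu::map_anon(nullptr, size, mmu::mmap_stack, mmu::perm_rw);
```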
The page fault handling logic requires that both interrupts and preemption are enabled. Therefore this patch implements a simple strategy: read a single byte on the stack one page (4K) deeper before disabling interrupts or preemption. It turns out we cannot read that byte "blindly", because in some scenarios interrupts may already be disabled when we are about to disable preemption, and vice versa. Similarly, and more speculatively, disabling preemption (and possibly interrupts) may nest. Therefore we need a "conditional" read of that byte on the stack: we should ONLY read it IF both interrupts and preemption are enabled. This patch achieves that by adding an extra thread-local counter that tracks interrupt disabling/enabling and operates the same way the preemption counter does: every time interrupts are disabled we increment the counter, and we decrement it every time interrupts are enabled again. Then, each time before we disable interrupts or preemption, we check whether both counters are 0 and only then read the byte, potentially triggering a fault.

Please note that performance-wise this patch is slightly more expensive than originally hoped. As the disassembled snippets below illustrate, it costs an extra 3-4 instructions every time we disable preemption or interrupts.

A. Disabling preemption in __tls_get_addr before the patch:

```
4035cae8: e8 83 f6 ff ff        callq  4035c170 <elf::get_program()>
4035caed: 4c 8b 2d 6c 40 51 00  mov    0x51406c(%rip),%r13  # 40870b60 <sched::preempt_counter+0x40870b60>
4035caf4: 4c 8b 33              mov    (%rbx),%r14
4035caf7: 64 41 83 45 00 01     addl   $0x1,%fs:0x0(%r13)
4035cafd: e8 6e f6 ff ff        callq  4035c170 <elf::get_program()>
```

B. Disabling preemption in __tls_get_addr after the patch:

```
4035fdd8: e8 63 f6 ff ff        callq  4035f440 <elf::get_program()>
4035fddd: 4c 8b 2d a4 4d 51 00  mov    0x514da4(%rip),%r13  # 40874b88 <arch::irq_preempt_counters+0x40874b88>
4035fde4: 4c 8b 33              mov    (%rbx),%r14
4035fde7: 64 49 83 7d 00 00     cmpq   $0x0,%fs:0x0(%r13)
4035fded: 75 07                 jne    4035fdf6 <__tls_get_addr+0x76>
4035fdef: 8a 84 24 00 f0 ff ff  mov    -0x1000(%rsp),%al
4035fdf6: 64 41 83 45 00 01     addl   $0x1,%fs:0x0(%r13)
4035fdfc: e8 3f f6 ff ff        callq  4035f440 <elf::get_program()>
```

As an example of the memory savings, Tomcat running ~300 threads ended up using 365MB, compared to 620MB before the patch.
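Condensed, the x64 mechanism looks like the sketch below. This is a simplified rendering of the diff that follows (namespace qualifiers dropped for brevity); the real code lives in arch/x64/arch.hh, include/osv/sched.hh and the new include/osv/counters.hh:

```
// Both counters share one 64-bit word so a single comparison can tell
// whether interrupts or preemption (or both) are currently disabled.
union counters {
    struct {
        unsigned preempt;
        unsigned irq;
    };
    uint64_t any_is_on;
};
extern counters __thread irq_preempt_counters;

inline void ensure_next_stack_page() {
    if (irq_preempt_counters.any_is_on) {
        return; // unsafe to fault now: interrupts or preemption disabled
    }
    // Touch the byte one page (4K) below %rsp to pre-fault the next stack page.
    char i;
    asm volatile("movb -4096(%%rsp), %0" : "=r"(i));
}

inline void preempt_disable() {
    ensure_next_stack_page(); // must run before the counter bump
    ++irq_preempt_counters.preempt;
    barrier(); // compiler barrier from <osv/barrier.hh>
}
```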
Fixes #143

Signed-off-by: Matthew Pabst <[email protected]>
Signed-off-by: Waldemar Kozaczuk <[email protected]>
---
 arch/aarch64/arch.hh    |  6 ++++++
 arch/x64/arch-switch.hh | 10 ++++++++++
 arch/x64/arch.hh        | 20 ++++++++++++++++++++
 core/sched.cc           |  5 ++++-
 include/osv/counters.hh | 31 +++++++++++++++++++++++++++++++
 include/osv/irqlock.hh  |  2 ++
 include/osv/mmu-defs.hh |  3 +--
 include/osv/sched.hh    |  9 +++++----
 libc/mman.cc            |  9 ++++-----
 libc/pthread.cc         |  5 +++++
 10 files changed, 88 insertions(+), 12 deletions(-)
 create mode 100644 include/osv/counters.hh

diff --git a/arch/aarch64/arch.hh b/arch/aarch64/arch.hh
index 855f1987..7dae53f5 100644
--- a/arch/aarch64/arch.hh
+++ b/arch/aarch64/arch.hh
@@ -11,6 +11,8 @@
 #define ARCH_HH_
 
 #include "processor.hh"
+#include <osv/barrier.hh>
+#include <osv/counters.hh>
 
 // architecture independent interface for architecture dependent operations
 
@@ -20,6 +22,10 @@ namespace arch {
 #define INSTR_SIZE_MIN 4
 #define ELF_IMAGE_START (OSV_KERNEL_BASE + 0x10000)
 
+inline void ensure_next_stack_page() {
+    //TODO: Implement lazy stack check for AARCH64
+}
+
 inline void irq_disable()
 {
     processor::irq_disable();
diff --git a/arch/x64/arch-switch.hh b/arch/x64/arch-switch.hh
index 6803498f..048beb6f 100644
--- a/arch/x64/arch-switch.hh
+++ b/arch/x64/arch-switch.hh
@@ -148,6 +148,13 @@ void thread::init_stack()
     _state.rip = reinterpret_cast<void*>(thread_main);
     _state.rsp = stacktop;
     _state.exception_stack = _arch.exception_stack + sizeof(_arch.exception_stack);
+
+    if (stack.lazy) {
+        // If the thread stack is setup to get lazily allocated and given the thread initially starts
+        // running with preemption disabled, we need to pre-fault the first stack page.
+        volatile char r = *((char *) (stacktop - 1));
+        (void) r; // trick the compiler into thinking that r is used
+    }
 }
 
 void thread::setup_tcb()
@@ -311,6 +318,9 @@ void thread::free_tcb()
 
 void thread_main_c(thread* t)
 {
+    if (t->get_stack_info().lazy) {
+        arch::irq_preempt_counters.irq = arch::irq_counter_lazy_stack_init_value;
+    }
     arch::irq_enable();
 #ifdef CONF_preempt
     preempt_enable();
diff --git a/arch/x64/arch.hh b/arch/x64/arch.hh
index 17df5f5c..5cadc562 100644
--- a/arch/x64/arch.hh
+++ b/arch/x64/arch.hh
@@ -10,6 +10,8 @@
 
 #include "processor.hh"
 #include "msr.hh"
+#include <osv/barrier.hh>
+#include <osv/counters.hh>
 
 // namespace arch - architecture independent interface for architecture
 // dependent operations (e.g. irq_disable vs. cli)
@@ -20,8 +22,19 @@ namespace arch {
 #define INSTR_SIZE_MIN 1
 #define ELF_IMAGE_START OSV_KERNEL_BASE
 
+inline void ensure_next_stack_page() {
+    if (irq_preempt_counters.any_is_on) {
+        return;
+    }
+
+    char i;
+    asm volatile("movb -4096(%%rsp), %0" : "=r"(i));
+}
+
 inline void irq_disable()
 {
+    ensure_next_stack_page();
+    ++irq_preempt_counters.irq;
     processor::cli();
 }
 
@@ -36,11 +49,15 @@ inline void irq_disable_notrace()
 inline void irq_enable()
 {
     processor::sti();
+    --irq_preempt_counters.irq;
+    barrier();
 }
 
 inline void wait_for_interrupt()
 {
     processor::sti_hlt();
+    --irq_preempt_counters.irq;
+    barrier();
 }
 
 inline void halt_no_interrupts()
@@ -78,12 +95,15 @@ private:
 
 inline void irq_flag_notrace::save()
 {
+    ensure_next_stack_page();
+    ++irq_preempt_counters.irq;
     asm volatile("sub $128, %%rsp; pushfq; popq %0; add $128, %%rsp" : "=r"(_rflags));
 }
 
 inline void irq_flag_notrace::restore()
 {
     asm volatile("sub $128, %%rsp; pushq %0; popfq; add $128, %%rsp" : : "r"(_rflags));
+    --irq_preempt_counters.irq;
 }
 
 inline bool irq_flag_notrace::enabled() const
diff --git a/core/sched.cc b/core/sched.cc
index 06f849d1..295e3de0 100644
--- a/core/sched.cc
+++ b/core/sched.cc
@@ -42,6 +42,10 @@ extern char _percpu_start[], _percpu_end[];
 using namespace osv;
 using namespace osv::clock::literals;
 
+namespace arch {
+counters __thread irq_preempt_counters = {{1, irq_counter_default_init_value}};
+}
+
 namespace sched {
 
 TRACEPOINT(trace_sched_idle, "");
@@ -69,7 +73,6 @@ std::vector<cpu*> cpus __attribute__((init_priority((int)init_prio::cpus)));
 
 thread __thread * s_current;
 cpu __thread * current_cpu;
-unsigned __thread preempt_counter = 1;
 bool __thread need_reschedule = false;
 
 elf::tls_data tls;
diff --git a/include/osv/counters.hh b/include/osv/counters.hh
new file mode 100644
index 00000000..38d40fb6
--- /dev/null
+++ b/include/osv/counters.hh
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2020 Waldemar Kozaczuk
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#ifndef OSV_COUNTERS_HH
+#define OSV_COUNTERS_HH
+
+namespace arch {
+
+// Both preempt and irq counters are colocated next to each other in this
+// union by design to let compiler optimize ensure_next_stack_page() so
+// that it can check if any counter is non-zero by checking single 64-bit any_is_on field
+union counters {
+    struct {
+        unsigned preempt;
+        unsigned irq;
+    };
+    uint64_t any_is_on;
+};
+
+extern counters __thread irq_preempt_counters;
+
+constexpr unsigned irq_counter_default_init_value = 11;
+constexpr unsigned irq_counter_lazy_stack_init_value = 1;
+
+}
+
+#endif //OSV_COUNTERS_HH
diff --git a/include/osv/irqlock.hh b/include/osv/irqlock.hh
index 2d5372fc..1a163e45 100644
--- a/include/osv/irqlock.hh
+++ b/include/osv/irqlock.hh
@@ -34,6 +34,8 @@ inline void irq_save_lock_type::lock()
 inline void irq_save_lock_type::unlock()
 {
     _flags.restore();
+    --arch::irq_preempt_counters.irq;
+    barrier();
 }
 
 extern irq_lock_type irq_lock;
diff --git a/include/osv/mmu-defs.hh b/include/osv/mmu-defs.hh
index 18edf441..a731ed6e 100644
--- a/include/osv/mmu-defs.hh
+++ b/include/osv/mmu-defs.hh
@@ -15,8 +15,6 @@
 
 struct exception_frame;
 
-struct exception_frame;
-
 namespace mmu {
 
 constexpr uintptr_t page_size = 4096;
@@ -84,6 +82,7 @@ enum {
     mmap_small = 1ul << 5,
     mmap_jvm_balloon = 1ul << 6,
     mmap_file = 1ul << 7,
+    mmap_stack = 1ul << 8,
 };
 
 enum {
diff --git a/include/osv/sched.hh b/include/osv/sched.hh
index 0fb44f77..d0e9b00f 100644
--- a/include/osv/sched.hh
+++ b/include/osv/sched.hh
@@ -335,6 +335,7 @@ public:
         void* begin;
         size_t size;
         void (*deleter)(stack_info si);  // null: don't delete
+        bool lazy = false;
         static void default_deleter(stack_info si);
     };
     struct attr {
@@ -978,14 +979,13 @@ inline void release(mutex_t* mtx)
     }
 }
 
-extern unsigned __thread preempt_counter;
 extern bool __thread need_reschedule;
 
 #ifdef __OSV_CORE__
 inline unsigned int get_preempt_counter()
 {
     barrier();
-    return preempt_counter;
+    return arch::irq_preempt_counters.preempt;
 }
 
 inline bool preemptable()
@@ -1005,14 +1005,15 @@ inline void preempt()
 
 inline void preempt_disable()
 {
-    ++preempt_counter;
+    arch::ensure_next_stack_page();
+    ++arch::irq_preempt_counters.preempt;
     barrier();
 }
 
 inline void preempt_enable()
 {
     barrier();
-    --preempt_counter;
+    --arch::irq_preempt_counters.preempt;
     if (preemptable() && need_reschedule && arch::irq_enabled()) {
         cpu::schedule();
     }
diff --git a/libc/mman.cc b/libc/mman.cc
index d0803ac4..d00befc3 100644
--- a/libc/mman.cc
+++ b/libc/mman.cc
@@ -38,12 +38,11 @@ unsigned libc_flags_to_mmap(int flags)
         mmap_flags |= mmu::mmap_populate;
     }
     if (flags & MAP_STACK) {
-        // OSv currently requires that stacks be pinned (see issue #143). So
-        // if an application wants to mmap() a stack for pthread_attr_setstack
-        // and did us the courtesy of telling this to ue (via MAP_STACK),
-        // let's return the courtesy by returning pre-faulted memory.
-        // FIXME: If issue #143 is fixed, this workaround should be removed.
+#ifdef AARCH64_PORT_STUB
         mmap_flags |= mmu::mmap_populate;
+#else
+        mmap_flags |= mmu::mmap_stack;
+#endif
     }
     if (flags & MAP_SHARED) {
         mmap_flags |= mmu::mmap_shared;
diff --git a/libc/pthread.cc b/libc/pthread.cc
index 8c976bf6..5c00e8b9 100644
--- a/libc/pthread.cc
+++ b/libc/pthread.cc
@@ -140,10 +140,15 @@ namespace pthread_private {
             return {attr.stack_begin, attr.stack_size};
         }
         size_t size = attr.stack_size;
+#ifdef AARCH64_PORT_STUB
         void *addr = mmu::map_anon(nullptr, size, mmu::mmap_populate, mmu::perm_rw);
+#else
+        void *addr = mmu::map_anon(nullptr, size, mmu::mmap_stack, mmu::perm_rw);
+#endif
         mmu::mprotect(addr, attr.guard_size, 0);
         sched::thread::stack_info si{addr, size};
         si.deleter = free_stack;
+        si.lazy = true;
         return si;
     }
-- 
2.20.1
