Please note this is an optimized and final version of the option 1 patch 
sent earlier.

On Wednesday, January 29, 2020 at 8:14:30 AM UTC-5, Waldek Kozaczuk wrote:
>
> As issue #143 explains, application stacks are currently eagerly 
> allocated (aka pre-populated), which may lead to substantial memory waste. 
> We want those stacks to be lazily allocated instead, so that the physical 
> memory behind a stack gets allocated as needed by the standard fault 
> handling mechanism. 
>
> The page fault handling logic requires that both interrupts and preemption 
> are enabled. Therefore this patch implements a simple strategy: read a 
> single byte on the stack one page (4K) deeper before disabling interrupts 
> or preemption. 
>
> It turns out we cannot read that single byte "blindly", because in some 
> scenarios interrupts might already be disabled when we are about to 
> disable preemption, and vice versa. Similarly, and more speculatively, 
> disabling preemption (and possibly interrupts) may nest. Therefore we need 
> a "conditional" read of that byte on the stack; more specifically, we 
> should ONLY read the byte IF both interrupts and preemption are enabled. 
>
> This patch achieves this by adding an extra thread-local counter that 
> tracks interrupt disabling/enabling and operates much like the preemption 
> counter does. Every time interrupts are disabled we increment the counter, 
> and we decrement it every time interrupts are enabled again. Then, before 
> we disable interrupts or preemption, we check whether both counters are 0 
> and only then read a byte on the stack to potentially trigger a fault. 
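>
> To make that check cheap, the two counters are laid out as a union sharing 
> storage with a single 64-bit field (see the new include/osv/counters.hh in 
> the diff below), so the "both counters are 0" test compiles down to one 
> comparison. Roughly: 
>
> ``` 
> // Layout sketch matching the patch: the preempt and irq counters share 
> // storage with a 64-bit any_is_on field, which is non-zero iff either 
> // counter is non-zero. 
> union counters { 
>     struct { 
>         unsigned preempt;  // bumped by preempt_disable()/preempt_enable() 
>         unsigned irq;      // bumped by irq_disable()/irq_enable() etc. 
>     }; 
>     uint64_t any_is_on; 
> }; 
> ``` 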
>
> Please note that performance-wise this patch is slightly more expensive 
> than originally hoped. As the disassembled snippets below illustrate, it 
> costs an extra 3-4 instructions every time we disable preemption or 
> interrupts. 
>
> A. Disabling preemption in __tls_get_addr before the patch: 
> ``` 
>  4035cae8:   e8 83 f6 ff ff          callq  4035c170 <elf::get_program()> 
>  4035caed:   4c 8b 2d 6c 40 51 00    mov    0x51406c(%rip),%r13        # 40870b60 <sched::preempt_counter+0x40870b60> 
>  4035caf4:   4c 8b 33                mov    (%rbx),%r14 
>  4035caf7:   64 41 83 45 00 01       addl   $0x1,%fs:0x0(%r13) 
>  4035cafd:   e8 6e f6 ff ff          callq  4035c170 <elf::get_program()> 
> ``` 
>
> B. Disabling preemption in __tls_get_addr after the patch: 
> ``` 
>  4035fdd8:   e8 63 f6 ff ff          callq  4035f440 <elf::get_program()> 
>  4035fddd:   4c 8b 2d a4 4d 51 00    mov    0x514da4(%rip),%r13        # 40874b88 <arch::irq_preempt_counters+0x40874b88> 
>  4035fde4:   4c 8b 33                mov    (%rbx),%r14 
>  4035fde7:   64 49 83 7d 00 00       cmpq   $0x0,%fs:0x0(%r13) 
>  4035fded:   75 07                   jne    4035fdf6 <__tls_get_addr+0x76> 
>  4035fdef:   8a 84 24 00 f0 ff ff    mov    -0x1000(%rsp),%al 
>  4035fdf6:   64 41 83 45 00 01       addl   $0x1,%fs:0x0(%r13) 
>  4035fdfc:   e8 3f f6 ff ff          callq  4035f440 <elf::get_program()> 
> ``` 
>
> As an example of the memory savings, tomcat running ~300 threads ended up 
> using 365MB instead of the 620MB it used before the patch. 
>
> Fixes #143 
>
> Signed-off-by: Matthew Pabst <[email protected]> 
> Signed-off-by: Waldemar Kozaczuk <[email protected]> 
> --- 
>  arch/aarch64/arch.hh    |  6 ++++++ 
>  arch/x64/arch-switch.hh | 10 ++++++++++ 
>  arch/x64/arch.hh        | 20 ++++++++++++++++++++ 
>  core/sched.cc           |  5 ++++- 
>  include/osv/counters.hh | 31 +++++++++++++++++++++++++++++++ 
>  include/osv/irqlock.hh  |  2 ++ 
>  include/osv/mmu-defs.hh |  3 +-- 
>  include/osv/sched.hh    |  9 +++++---- 
>  libc/mman.cc            |  9 ++++----- 
>  libc/pthread.cc         |  5 +++++ 
>  10 files changed, 88 insertions(+), 12 deletions(-) 
>  create mode 100644 include/osv/counters.hh 
>
> diff --git a/arch/aarch64/arch.hh b/arch/aarch64/arch.hh 
> index 855f1987..7dae53f5 100644 
> --- a/arch/aarch64/arch.hh 
> +++ b/arch/aarch64/arch.hh 
> @@ -11,6 +11,8 @@ 
>  #define ARCH_HH_ 
>   
>  #include "processor.hh" 
> +#include <osv/barrier.hh> 
> +#include <osv/counters.hh> 
>   
>  // architecture independent interface for architecture dependent operations 
>   
> @@ -20,6 +22,10 @@ namespace arch { 
>  #define INSTR_SIZE_MIN 4 
>  #define ELF_IMAGE_START (OSV_KERNEL_BASE + 0x10000) 
>   
> +inline void ensure_next_stack_page() { 
> +    //TODO: Implement lazy stack check for AARCH64 
> +} 
> + 
>  inline void irq_disable() 
>  { 
>      processor::irq_disable(); 
> diff --git a/arch/x64/arch-switch.hh b/arch/x64/arch-switch.hh 
> index 6803498f..048beb6f 100644 
> --- a/arch/x64/arch-switch.hh 
> +++ b/arch/x64/arch-switch.hh 
> @@ -148,6 +148,13 @@ void thread::init_stack() 
>      _state.rip = reinterpret_cast<void*>(thread_main); 
>      _state.rsp = stacktop; 
>      _state.exception_stack = _arch.exception_stack + sizeof(_arch.exception_stack); 
> + 
> +    if (stack.lazy) { 
> +        // If the thread stack is set up to be lazily allocated and given the thread initially starts 
> +        // running with preemption disabled, we need to pre-fault the first stack page. 
> +        volatile char r = *((char *) (stacktop - 1)); 
> +        (void) r; // trick the compiler into thinking that r is used 
> +    } 
>  } 
>   
>  void thread::setup_tcb() 
> @@ -311,6 +318,9 @@ void thread::free_tcb() 
>   
>  void thread_main_c(thread* t) 
>  { 
> +    if (t->get_stack_info().lazy) { 
> +        arch::irq_preempt_counters.irq = arch::irq_counter_lazy_stack_init_value; 
> +    } 
>      arch::irq_enable(); 
>  #ifdef CONF_preempt 
>      preempt_enable(); 
> diff --git a/arch/x64/arch.hh b/arch/x64/arch.hh 
> index 17df5f5c..5cadc562 100644 
> --- a/arch/x64/arch.hh 
> +++ b/arch/x64/arch.hh 
> @@ -10,6 +10,8 @@ 
>   
>  #include "processor.hh" 
>  #include "msr.hh" 
> +#include <osv/barrier.hh> 
> +#include <osv/counters.hh> 
>   
>  // namespace arch - architecture independent interface for architecture 
>  //                  dependent operations (e.g. irq_disable vs. cli) 
> @@ -20,8 +22,19 @@ namespace arch { 
>  #define INSTR_SIZE_MIN 1 
>  #define ELF_IMAGE_START OSV_KERNEL_BASE 
>   
> +inline void ensure_next_stack_page() { 
> +    if (irq_preempt_counters.any_is_on) { 
> +        return; 
> +    } 
> + 
> +    char i; 
> +    asm volatile("movb -4096(%%rsp), %0" : "=r"(i)); 
> +} 
> + 
>  inline void irq_disable() 
>  { 
> +    ensure_next_stack_page(); 
> +    ++irq_preempt_counters.irq; 
>      processor::cli(); 
>  } 
>   
> @@ -36,11 +49,15 @@ inline void irq_disable_notrace() 
>  inline void irq_enable() 
>  { 
>      processor::sti(); 
> +    --irq_preempt_counters.irq; 
> +    barrier(); 
>  } 
>   
>  inline void wait_for_interrupt() 
>  { 
>      processor::sti_hlt(); 
> +    --irq_preempt_counters.irq; 
> +    barrier(); 
>  } 
>   
>  inline void halt_no_interrupts() 
> @@ -78,12 +95,15 @@ private: 
>   
>  inline void irq_flag_notrace::save() 
>  { 
> +    ensure_next_stack_page(); 
> +    ++irq_preempt_counters.irq; 
>      asm volatile("sub $128, %%rsp; pushfq; popq %0; add $128, %%rsp" : 
> "=r"(_rflags)); 
>  } 
>   
>  inline void irq_flag_notrace::restore() 
>  { 
>      asm volatile("sub $128, %%rsp; pushq %0; popfq; add $128, %%rsp" : : 
> "r"(_rflags)); 
> +    --irq_preempt_counters.irq; 
>  } 
>   
>  inline bool irq_flag_notrace::enabled() const 
> diff --git a/core/sched.cc b/core/sched.cc 
> index 06f849d1..295e3de0 100644 
> --- a/core/sched.cc 
> +++ b/core/sched.cc 
> @@ -42,6 +42,10 @@ extern char _percpu_start[], _percpu_end[]; 
>  using namespace osv; 
>  using namespace osv::clock::literals; 
>   
> +namespace arch { 
> +counters __thread irq_preempt_counters = {{1, irq_counter_default_init_value}}; 
> +} 
> + 
>  namespace sched { 
>   
>  TRACEPOINT(trace_sched_idle, ""); 
> @@ -69,7 +73,6 @@ std::vector<cpu*> cpus __attribute__((init_priority((int)init_prio::cpus))); 
>  thread __thread * s_current; 
>  cpu __thread * current_cpu; 
>   
> -unsigned __thread preempt_counter = 1; 
>  bool __thread need_reschedule = false; 
>   
>  elf::tls_data tls; 
> diff --git a/include/osv/counters.hh b/include/osv/counters.hh 
> new file mode 100644 
> index 00000000..38d40fb6 
> --- /dev/null 
> +++ b/include/osv/counters.hh 
> @@ -0,0 +1,31 @@ 
> +/* 
> + * Copyright (C) 2020 Waldemar Kozaczuk 
> + * 
> + * This work is open source software, licensed under the terms of the 
> + * BSD license as described in the LICENSE file in the top-level directory. 
> + */ 
> + 
> +#ifndef OSV_COUNTERS_HH 
> +#define OSV_COUNTERS_HH 
> + 
> +namespace arch { 
> + 
> +// Both preempt and irq counters are colocated next to each other in this 
> +// union by design to let the compiler optimize ensure_next_stack_page() so 
> +// that it can check if any counter is non-zero via the single 64-bit any_is_on field 
> +union counters { 
> +    struct { 
> +        unsigned preempt; 
> +        unsigned irq; 
> +    }; 
> +    uint64_t any_is_on; 
> +}; 
> + 
> +extern counters __thread irq_preempt_counters; 
> + 
> +constexpr unsigned irq_counter_default_init_value = 11; 
> +constexpr unsigned irq_counter_lazy_stack_init_value = 1; 
> + 
> +} 
> + 
> +#endif //OSV_COUNTERS_HH 
> diff --git a/include/osv/irqlock.hh b/include/osv/irqlock.hh 
> index 2d5372fc..1a163e45 100644 
> --- a/include/osv/irqlock.hh 
> +++ b/include/osv/irqlock.hh 
> @@ -34,6 +34,8 @@ inline void irq_save_lock_type::lock() 
>  inline void irq_save_lock_type::unlock() 
>  { 
>      _flags.restore(); 
> +    --arch::irq_preempt_counters.irq; 
> +    barrier(); 
>  } 
>   
>  extern irq_lock_type irq_lock; 
> diff --git a/include/osv/mmu-defs.hh b/include/osv/mmu-defs.hh 
> index 18edf441..a731ed6e 100644 
> --- a/include/osv/mmu-defs.hh 
> +++ b/include/osv/mmu-defs.hh 
> @@ -15,8 +15,6 @@ 
>   
>  struct exception_frame; 
>   
> -struct exception_frame; 
> - 
>  namespace mmu { 
>   
>  constexpr uintptr_t page_size = 4096; 
> @@ -84,6 +82,7 @@ enum { 
>      mmap_small       = 1ul << 5, 
>      mmap_jvm_balloon = 1ul << 6, 
>      mmap_file        = 1ul << 7, 
> +    mmap_stack       = 1ul << 8, 
>  }; 
>   
>  enum { 
> diff --git a/include/osv/sched.hh b/include/osv/sched.hh 
> index 0fb44f77..d0e9b00f 100644 
> --- a/include/osv/sched.hh 
> +++ b/include/osv/sched.hh 
> @@ -335,6 +335,7 @@ public: 
>          void* begin; 
>          size_t size; 
>          void (*deleter)(stack_info si);  // null: don't delete 
> +        bool lazy = false; 
>          static void default_deleter(stack_info si); 
>      }; 
>      struct attr { 
> @@ -978,14 +979,13 @@ inline void release(mutex_t* mtx) 
>      } 
>  } 
>   
> -extern unsigned __thread preempt_counter; 
>  extern bool __thread need_reschedule; 
>   
>  #ifdef __OSV_CORE__ 
>  inline unsigned int get_preempt_counter() 
>  { 
>      barrier(); 
> -    return preempt_counter; 
> +    return arch::irq_preempt_counters.preempt; 
>  } 
>   
>  inline bool preemptable() 
> @@ -1005,14 +1005,15 @@ inline void preempt() 
>   
>  inline void preempt_disable() 
>  { 
> -    ++preempt_counter; 
> +    arch::ensure_next_stack_page(); 
> +    ++arch::irq_preempt_counters.preempt; 
>      barrier(); 
>  } 
>   
>  inline void preempt_enable() 
>  { 
>      barrier(); 
> -    --preempt_counter; 
> +    --arch::irq_preempt_counters.preempt; 
>      if (preemptable() && need_reschedule && arch::irq_enabled()) { 
>          cpu::schedule(); 
>      } 
> diff --git a/libc/mman.cc b/libc/mman.cc 
> index d0803ac4..d00befc3 100644 
> --- a/libc/mman.cc 
> +++ b/libc/mman.cc 
> @@ -38,12 +38,11 @@ unsigned libc_flags_to_mmap(int flags) 
>          mmap_flags |= mmu::mmap_populate; 
>      } 
>      if (flags & MAP_STACK) { 
> -        // OSv currently requires that stacks be pinned (see issue #143). So 
> -        // if an application wants to mmap() a stack for pthread_attr_setstack 
> -        // and did us the courtesy of telling this to ue (via MAP_STACK), 
> -        // let's return the courtesy by returning pre-faulted memory. 
> -        // FIXME: If issue #143 is fixed, this workaround should be removed. 
> +#ifdef AARCH64_PORT_STUB 
>          mmap_flags |= mmu::mmap_populate; 
> +#else 
> +        mmap_flags |= mmu::mmap_stack; 
> +#endif 
>      } 
>      if (flags & MAP_SHARED) { 
>          mmap_flags |= mmu::mmap_shared; 
> diff --git a/libc/pthread.cc b/libc/pthread.cc 
> index 8c976bf6..5c00e8b9 100644 
> --- a/libc/pthread.cc 
> +++ b/libc/pthread.cc 
> @@ -140,10 +140,15 @@ namespace pthread_private { 
>              return {attr.stack_begin, attr.stack_size}; 
>          } 
>          size_t size = attr.stack_size; 
> +#ifdef AARCH64_PORT_STUB 
>          void *addr = mmu::map_anon(nullptr, size, mmu::mmap_populate, mmu::perm_rw); 
> +#else 
> +        void *addr = mmu::map_anon(nullptr, size, mmu::mmap_stack, mmu::perm_rw); 
> +#endif 
>          mmu::mprotect(addr, attr.guard_size, 0); 
>          sched::thread::stack_info si{addr, size}; 
>          si.deleter = free_stack; 
> +        si.lazy = true; 
>          return si; 
>      } 
>   
> -- 
> 2.20.1 
>
>
