On Mon, Mar 30, 2020 at 7:01 AM Waldek Kozaczuk <jwkozac...@gmail.com>
wrote:

>
>
> On Sunday, March 29, 2020 at 3:34:25 PM UTC-4, Nadav Har'El wrote:
>>
>> On Sun, Mar 29, 2020 at 7:56 PM Waldemar Kozaczuk <jwkoz...@gmail.com>
>> wrote:
>>
>>> This patch mostly addresses issue #854. In essence, it changes
>>> malloc_large() to use mmu::map_anon() if the requested memory is greater
>>> than the size of a "huge page" and does not have to be contiguous in
>>> physical memory. It also falls back to map_anon()-based allocation for
>>> requests > 4K and < 2MB that cannot be satisfied when memory is heavily
>>> fragmented.
>>>
>>> It is intended to make OSv memory allocation behave better when memory
>>> in free_page_ranges is heavily fragmented and large allocations (>=2MB)
>>> cannot be satisfied with a single contiguous page range. While working
>>> on this patch it also became clear that applications that heavily use
>>> realloc(), which allocates new chunks of memory and frees old ones, are
>>> quite likely to encounter heavy memory fragmentation. Hopefully this
>>> patch helps in those scenarios.
>>>
>>> In addition, object_size() and large_object_size() needed to be
>>> adjusted to work with the new malloc_large() implementation, so other
>>> related issues are fixed by this patch as well.
>>>
>>> Lastly, this patch adds a new test verifying realloc() and
>>> malloc_usable_size().
>>>
>>> Fixes #784
>>> Fixes #854
>>> Fixes #1077
>>>
>>> Signed-off-by: Waldemar Kozaczuk <jwkoz...@gmail.com>
>>> ---
>>>  core/mempool.cc        | 81 ++++++++++++++++++++++++++++++------
>>>  modules/tests/Makefile |  2 +-
>>>  tests/tst-realloc.cc   | 94 ++++++++++++++++++++++++++++++++++++++++++
>>>  3 files changed, 163 insertions(+), 14 deletions(-)
>>>  create mode 100644 tests/tst-realloc.cc
>>>
>>> diff --git a/core/mempool.cc b/core/mempool.cc
>>> index 11fd1456..50f5e877 100644
>>> --- a/core/mempool.cc
>>> +++ b/core/mempool.cc
>>> @@ -546,7 +546,7 @@ public:
>>>      page_range_allocator() : _deferred_free(nullptr) { }
>>>
>>>      template<bool UseBitmap = true>
>>> -    page_range* alloc(size_t size);
>>> +    page_range* alloc(size_t size, bool contiguous = true);
>>>      page_range* alloc_aligned(size_t size, size_t offset, size_t alignment,
>>>                                bool fill = false);
>>>      void free(page_range* pr);
>>> @@ -678,7 +678,7 @@ void page_range_allocator::bitmap_allocator<T>::deallocate(T* p, size_t n)
>>>  }
>>>
>>>  template<bool UseBitmap>
>>> -page_range* page_range_allocator::alloc(size_t size)
>>> +page_range* page_range_allocator::alloc(size_t size, bool contiguous)
>>>  {
>>>      auto exact_order = ilog2_roundup(size / page_size);
>>>      if (exact_order > max_order) {
>>> @@ -692,13 +692,12 @@ page_range* page_range_allocator::alloc(size_t size)
>>>
>>>      page_range* range = nullptr;
>>>      if (!bitset) {
>>> -        if (!exact_order || _free[exact_order - 1].empty()) {
>>> +        if (!contiguous || !exact_order || _free[exact_order - 1].empty()) {
>>>              return nullptr;
>>>          }
>>> -        // TODO: This linear search makes worst case complexity of the allocator
>>> -        // O(n). It would be better to fall back to non-contiguous allocation
>>> -        // and make worst case complexity depend on the size of requested memory
>>> -        // block and the logarithm of the number of free huge page ranges.
>>> +        // This linear search makes the worst-case complexity of the allocator
>>> +        // O(n). Unfortunately, we have no choice for a contiguous allocation,
>>> +        // so let us hope there is a large enough range.
>>>          for (auto&& pr : _free[exact_order - 1]) {
>>>              if (pr.size >= size) {
>>>                  range = &pr;
>>> @@ -823,7 +822,23 @@ void page_range_allocator::for_each(unsigned min_order, Func f)
>>>      }
>>>  }
>>>
>>> -static void* malloc_large(size_t size, size_t alignment, bool block = true)
>>> +static void* mapped_malloc_large(size_t size, size_t offset)
>>> +{
>>> +    // TODO: For now pre-populate the memory; in the future consider lazy population
>>> +    void* obj = mmu::map_anon(nullptr, size, mmu::mmap_populate, mmu::perm_read | mmu::perm_write);
>>> +    size_t* ret_header = static_cast<size_t*>(obj);
>>> +    *ret_header = size;
>>> +    return obj + offset;
>>> +}
>>> +
>>> +static void mapped_free_large(void *object)
>>> +{
>>> +    object = align_down(object - 1, mmu::page_size);
>>> +    size_t* ret_header = static_cast<size_t*>(object);
>>> +    mmu::munmap(object, *ret_header);
>>> +}
>>> +
>>> +static void* malloc_large(size_t size, size_t alignment, bool block = true, bool contiguous = true)
>>>  {
>>>      auto requested_size = size;
>>>      size_t offset;
>>> @@ -835,6 +850,14 @@ static void* malloc_large(size_t size, size_t alignment, bool block = true)
>>>      size += offset;
>>>      size = align_up(size, page_size);
>>>
>>> +    // Use mmap if the requested memory is greater than the "huge page" size
>>> +    // and does not need to be contiguous
>>> +    if (size >= mmu::huge_page_size && !contiguous) {
>>> +        void* obj = mapped_malloc_large(size, offset);
>>> +        trace_memory_malloc_large(obj, requested_size, size, alignment);
>>> +        return obj;
>>> +    }
>>> +
>>>      while (true) {
>>>          WITH_LOCK(free_page_ranges_lock) {
>>>              reclaimer_thread.wait_for_minimum_memory();
>>> @@ -842,7 +865,7 @@ static void* malloc_large(size_t size, size_t alignment, bool block = true)
>>>              if (alignment > page_size) {
>>>                  ret_header = free_page_ranges.alloc_aligned(size, page_size, alignment);
>>>              } else {
>>> -                ret_header = free_page_ranges.alloc(size);
>>> +                ret_header = free_page_ranges.alloc(size, contiguous);
>>>              }
>>>              if (ret_header) {
>>>                  on_alloc(size);
>>> @@ -850,6 +873,11 @@ static void* malloc_large(size_t size, size_t alignment, bool block = true)
>>>                  obj += offset;
>>>                  trace_memory_malloc_large(obj, requested_size, size, alignment);
>>>                  return obj;
>>> +            } else if (!contiguous) {
>>> +                // If we failed to get a contiguous memory allocation and
>>> +                // the caller does not require one, let us use map-based
>>> +                // allocation, which we do after the loop below.
>>> +                break;
>>>              }
>>>              if (block)
>>>                  reclaimer_thread.wait_for_memory(size);
>>> @@ -857,6 +885,15 @@ static void* malloc_large(size_t size, size_t alignment, bool block = true)
>>>                  return nullptr;
>>>          }
>>>      }
>>> +
>>> +    // We deliberately execute this code here, because doing it in the
>>> +    // WITH_LOCK section above would likely lead to a deadlock:
>>> +    // map_anon() would eventually pull memory from free_page_ranges
>>> +    // to satisfy the request and, even worse, this method might get
>>> +    // called recursively.
>>> +    void* obj = mapped_malloc_large(size, offset);
>>> +    trace_memory_malloc_large(obj, requested_size, size, alignment);
>>> +    return obj;
>>>  }
>>>
>>>  void shrinker::deactivate_shrinker()
>>> @@ -1070,11 +1107,18 @@ static void free_large(void* obj)
>>>      free_page_range(static_cast<page_range*>(obj));
>>>  }
>>>
>>> -static unsigned large_object_size(void *obj)
>>> +static size_t large_object_offset(void *&obj)
>>>  {
>>> +    void *original_obj = obj;
>>>      obj = align_down(obj - 1, page_size);
>>> +    return reinterpret_cast<uint64_t>(original_obj) - reinterpret_cast<uint64_t>(obj);
>>> +}
>>> +
>>> +static size_t large_object_size(void *obj)
>>> +{
>>> +    size_t offset = large_object_offset(obj);
>>>      auto header = static_cast<page_range*>(obj);
>>> -    return header->size;
>>> +    return header->size - offset;
>>>  }
>>>
>>>  namespace page_pool {
>>> @@ -1692,7 +1736,7 @@ static inline void* std_malloc(size_t size, size_t alignment)
>>>                                         memory::alloc_page());
>>>          trace_memory_malloc_page(ret, size, mmu::page_size, alignment);
>>>      } else {
>>> -        ret = memory::malloc_large(size, alignment);
>>> +        ret = memory::malloc_large(size, alignment, true, false);
>>>      }
>>>      memory::tracker_remember(ret, size);
>>>      return ret;
>>> @@ -1714,6 +1758,12 @@ void* calloc(size_t nmemb, size_t size)
>>>
>>>  static size_t object_size(void *object)
>>>  {
>>> +    if (!mmu::is_linear_mapped(object, 0)) {
>>> +        size_t offset = memory::large_object_offset(object);
>>> +        size_t* ret_header = static_cast<size_t*>(object);
>>> +        return *ret_header - offset;
>>>
>>
>> Why does this code assume the start of the object has a size_t (by the
>> way, above you used uint64_t, better be consistent) and not
>> a page_range?
>>
>
> Because mapped_malloc_large() stores the allocated (mapped) size in the
> size_t field located at the very beginning of the allocated area. This is
> different from a non-mapped allocation, which stores its data, including
> the size, in the page_range struct. I think it would be confusing if we
> used page_range for both (which is what I did in my very first version of
> this patch).
>

Oh, I didn't notice that. I remembered (from the first version I reviewed,
I guess) that you used the same page_range struct.
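
So to restate: the mapped case keeps just a plain size_t header at the start
of the area. Here is a minimal stand-alone sketch of that scheme as I
understand it (paraphrasing the patch, not the literal code; aligned_alloc()
stands in for mmu::map_anon(), and 'size' is assumed page-aligned already, as
in malloc_large() after align_up()):

    #include <cstdint>
    #include <cstdlib>

    // A size_t header sits at the start of the page-aligned area,
    // not a page_range:
    static void* sketch_mapped_malloc(size_t size, size_t offset) {
        char* area = static_cast<char*>(aligned_alloc(4096, size));
        *reinterpret_cast<size_t*>(area) = size;
        return area + offset;
    }

    static size_t sketch_mapped_object_size(void* obj) {
        // the same walk-back as large_object_offset():
        // align_down(obj - 1, page_size)
        char* area = reinterpret_cast<char*>(
            (reinterpret_cast<uintptr_t>(obj) - 1) & ~uintptr_t(4095));
        size_t offset = static_cast<char*>(obj) - area;
        return *reinterpret_cast<size_t*>(area) - offset;
    }

For example, sketch_mapped_malloc(8192, 64) hands out area+64, and
sketch_mapped_object_size() on that pointer returns 8192 - 64, matching the
*ret_header - offset computation in object_size().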


> Also, uint64_t is used in large_object_offset() to calculate an offset -
> the distance between the start of the allocated memory and the object
> address - so I am not sure how this relates to the size_t field.
>

What I meant was that when you calculate the difference between pointers, or
add to them, it works in multiples of the pointed-to type's size. So if you
calculated the difference between two uint64_t* pointers, you shouldn't add
it to a size_t* pointer. Of course, in practice it will work (because
uint64_t and size_t are the same size) but it's strange.


>
> Or maybe my latest version of this patch made this part less readable -
> memory::large_object_offset() calculates the offset but also modifies the
> object argument - see, it accepts a reference to a pointer.
>

> I may have made it less readable here, but I am not sure I clearly see it.
>
>>
>>
>>> +    }
>>> +
>>>      switch (mmu::get_mem_area(object)) {
>>>      case mmu::mem_area::main:
>>>          return memory::large_object_size(object);
>>> @@ -1763,6 +1813,11 @@ void free(void* object)
>>>          return;
>>>      }
>>>      memory::tracker_forget(object);
>>> +
>>> +    if (!mmu::is_linear_mapped(object, 0)) {
>>> +        return memory::mapped_free_large(object);
>>> +    }
>>> +
>>>      switch (mmu::get_mem_area(object)) {
>>>      case mmu::mem_area::page:
>>>          object = mmu::translate_mem_area(mmu::mem_area::page,
>>> @@ -1972,7 +2027,7 @@ void* alloc_phys_contiguous_aligned(size_t size, size_t align, bool block)
>>>      assert(is_power_of_two(align));
>>>      // make use of the standard large allocator returning properly aligned
>>>      // physically contiguous memory:
>>> -    auto ret = malloc_large(size, align, block);
>>> +    auto ret = malloc_large(size, align, block, true);
>>>      assert (!(reinterpret_cast<uintptr_t>(ret) & (align - 1)));
>>>      return ret;
>>>  }
>>> diff --git a/modules/tests/Makefile b/modules/tests/Makefile
>>> index ce004339..10df022f 100644
>>> --- a/modules/tests/Makefile
>>> +++ b/modules/tests/Makefile
>>> @@ -129,7 +129,7 @@ tests := tst-pthread.so misc-ramdisk.so tst-vblk.so tst-bsd-evh.so \
>>>         tst-sigaltstack.so tst-fread.so tst-tcp-cork.so tst-tcp-v6.so \
>>>         tst-calloc.so tst-crypt.so tst-non-fpic.so tst-small-malloc.so \
>>>        tst-mmx-fpu.so tst-getopt.so tst-getopt-pie.so tst-non-pie.so tst-semaphore.so \
>>> -       tst-elf-init.so
>>> +       tst-elf-init.so tst-realloc.so
>>>  #      libstatic-thread-variable.so tst-static-thread-variable.so \
>>>
>>>  tests += testrunner.so
>>> diff --git a/tests/tst-realloc.cc b/tests/tst-realloc.cc
>>> new file mode 100644
>>> index 00000000..49edfb7a
>>> --- /dev/null
>>> +++ b/tests/tst-realloc.cc
>>> @@ -0,0 +1,94 @@
>>> +/*
>>> +* Copyright (C) 2020 Waldemar Kozaczuk
>>> +*
>>> +* This work is open source software, licensed under the terms of the
>>> +* BSD license as described in the LICENSE file in the top-level directory.
>>> +*/
>>> +
>>> +#include <stdlib.h>
>>> +#include <string.h>
>>> +#include <cassert>
>>> +#include <iostream>
>>> +
>>> +extern "C" size_t malloc_usable_size (void *ptr);
>>> +
>>> +static void test_realloc(size_t original_size, size_t new_size)
>>> +{
>>> +    char data[11] = "0123456789";
>>> +
>>> +    void *original_buf = malloc(original_size);
>>> +    assert(original_buf);
>>> +
>>> +    char *buf = static_cast<char*>(original_buf);
>>> +    for (size_t i = 0; i < original_size; i++) {
>>> +        buf[i] = data[i % 10];
>>> +    }
>>> +
>>> +    void *new_buf = realloc(original_buf, new_size);
>>> +    assert(new_buf);
>>> +
>>> +    auto expected_same_data_len = std::min(original_size, new_size);
>>> +    buf = static_cast<char*>(new_buf);
>>> +    for (size_t i = 0; i < expected_same_data_len; i++) {
>>> +        assert(buf[i] == data[i % 10]);
>>> +    }
>>> +
>>> +    free(new_buf);
>>> +
>>> +    std::cerr << "PASSED realloc() for original_size: " <<
>>> original_size << ", new_size: " << new_size << std::endl;
>>> +}
>>> +
>>> +static void test_usable_size(size_t size, size_t expected_usable_size)
>>> +{
>>> +    void* ptr = malloc(size);
>>> +    assert(expected_usable_size == malloc_usable_size(ptr));
>>> +    free(ptr);
>>> +
>>> +    std::cerr << "PASSED malloc_usable_size() for size: " << size <<
>>> std::endl;
>>> +}
>>> +
>>> +int main()
>>> +{
>>> +    test_realloc(1,2);
>>> +    test_realloc(2,1);
>>> +
>>> +    test_realloc(4,7);
>>> +    test_realloc(7,4);
>>> +
>>> +    test_realloc(63,128);
>>> +    test_realloc(128,63);
>>> +
>>> +    test_realloc(4000,5000);
>>> +    test_realloc(5000,4000);
>>> +
>>> +    test_realloc(4096,4096);
>>> +
>>> +    test_realloc(0x100000,0x100000);
>>> +    test_realloc(0x100000,0x100900);
>>> +    test_realloc(0x100900,0x100000);
>>> +
>>> +    test_realloc(0x200000,0x200000);
>>> +    test_realloc(0x200000,0x300900);
>>> +    test_realloc(0x300900,0x200000);
>>> +
>>> +    test_realloc(0x600900,0x600000);
>>> +    test_realloc(0x400000,0x600000);
>>> +    test_realloc(0x600000,0x400900);
>>> +
>>> +    void *buf = realloc(nullptr, 0);
>>> +    assert(buf);
>>> +    free(buf);
>>> +
>>> +    buf = malloc(16);
>>> +    assert(!realloc(buf, 0));
>>> +
>>> +    test_usable_size(1, 8);
>>> +    test_usable_size(8, 8);
>>> +    test_usable_size(67, 128);
>>> +    test_usable_size(0x4010, 0x4FC0);
>>> +    test_usable_size(0x100000, 0x100FC0);
>>> +    test_usable_size(0x200000, 0x200FC0);
>>> +
>>> +    std::cerr << "PASSED\n";
>>> +    return 0;
>>> +}
>>> \ No newline at end of file
>>> --
>>> 2.20.1
>>>
