On Thursday, March 31, 2022 at 8:54:24 AM UTC-4 Nadav Har'El wrote:

> On Thu, Mar 31, 2022 at 6:36 AM Commit Bot <[email protected]> 
> wrote:
>
>> From: Waldemar Kozaczuk <[email protected]>
>> Committer: Waldemar Kozaczuk <[email protected]>
>> Branch: master
>>
>> mmap: avoid collisions with linear map
>>
>> This patch enhances the virtual memory mapping code to track both
>> linear (linear_map) and non-linear (mmap()) virtual memory mappings.
>>
>> It does so by adding a simple struct - vma_range - and a vma_range_set
>> collection to store all mappings. It then modifies the mmu::find_hole()
>> implementation to use vma_range_set instead of vma_list to find the next
>> hole. That way it can avoid the collisions described in issue #1135.
>>
>> Fixes #1135
>>
>> Signed-off-by: Waldemar Kozaczuk <[email protected]>
>>
>> ---
>> diff --git a/core/mmu.cc b/core/mmu.cc
>> --- a/core/mmu.cc
>> +++ b/core/mmu.cc
>> @@ -47,6 +47,17 @@ extern const char text_start[], text_end[];
>>
>>  namespace mmu {
>>
>> +struct vma_range_compare {
>> +    bool operator()(const vma_range& a, const vma_range& b) {
>> +        return a.start() < b.start();
>> +    }
>> +};
>> +
>> +//Set of all vma ranges - both linear and non-linear ones
>> +__attribute__((init_priority((int)init_prio::vma_range_set)))
>> +std::set<vma_range, vma_range_compare> vma_range_set;
>> +rwlock_t vma_range_set_mutex;
>>
>
> Do we still need both vma_range_set and vma_list? Why?
>
I have actually thought hard about using a single set and a single data 
structure to represent both non-linear and linear VMAs, but that seemed a 
harder road. The information is quite different, and in most places (except 
for find_hole()) we need to access either the regular (non-linear) *vma_list* 
or the new *linear_vma_set* (which I added in one of the previous patches). 
find_hole() is the ONLY place where we need information about all ranges 
to avoid a collision. If we put everything in one set, then in most places 
where we iterate over or access it we would need to differentiate between 
the two types, and I reasoned it would only complicate things and possibly 
break existing code. So the solution was to add another set - vma_range_set 
(a superset) - where we register ALL ranges (only start address and size), 
which we then use in find_hole().
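
To illustrate the idea, here is a standalone sketch in plain C++ 
(std::set and std::shared_mutex stand in for the OSv types, and 
register_range() is a hypothetical helper, not code from the patch):

#include <cstdint>
#include <mutex>
#include <set>
#include <shared_mutex>

struct range {
    uintptr_t start;
    uintptr_t size;
};

struct range_compare {
    bool operator()(const range& a, const range& b) const {
        return a.start < b.start;
    }
};

// The superset: every mapping, linear or mmap()-ed, is registered here
std::set<range, range_compare> all_ranges;
std::shared_mutex all_ranges_mutex;

void register_range(uintptr_t start, uintptr_t size) {
    // Only start and size are recorded, so the hole search can see all
    // occupied ranges in one sorted set regardless of the vma type
    std::unique_lock<std::shared_mutex> guard(all_ranges_mutex);
    all_ranges.insert({start, size});
}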

>  
>
>> +
>>  struct linear_vma_compare {
>>      bool operator()(const linear_vma* a, const linear_vma* b) {
>>          return a->_virt_addr < b->_virt_addr;
>> @@ -66,6 +77,9 @@ class vma_compare {
>>      }
>>  };
>>
>> +constexpr uintptr_t lower_vma_limit = 0x0;
>> +constexpr uintptr_t upper_vma_limit = 0x800000000000;
>> +
>>  typedef boost::intrusive::set<vma,
>>                                bi::compare<vma_compare>,
>>                                bi::member_hook<vma,
>> @@ -78,9 +92,15 @@ struct vma_list_type : vma_list_base {
>>      vma_list_type() {
>>          // insert markers for the edges of allocatable area
>>          // simplifies searches 
>>
>> -        insert(*new anon_vma(addr_range(0, 0), 0, 0));
>> -        uintptr_t e = 0x800000000000;
>> -        insert(*new anon_vma(addr_range(e, e), 0, 0));
>> +        auto lower_edge = new anon_vma(addr_range(lower_vma_limit, 
>> lower_vma_limit), 0, 0);
>> +        insert(*lower_edge);
>> +        auto upper_edge = new anon_vma(addr_range(upper_vma_limit, 
>> upper_vma_limit), 0, 0);
>> +        insert(*upper_edge);
>> +
>> +        WITH_LOCK(vma_range_set_mutex.for_write()) {
>> +            vma_range_set.insert(vma_range(lower_edge));
>> +            vma_range_set.insert(vma_range(upper_edge));
>> +        }
>>      }
>>  };
>>
>> @@ -954,27 +974,41 @@ static error protect(const void *addr, size_t size, 
>> unsigned int perm)
>>      return no_error();
>>  }
>>
>> +class vma_range_addr_compare {
>> +public:
>> +    bool operator()(const vma_range& x, uintptr_t y) const { return 
>> x.start() < y; }
>> +    bool operator()(uintptr_t x, const vma_range& y) const { return x < 
>> y.start(); }
>> +};
>> +
>>  uintptr_t find_hole(uintptr_t start, uintptr_t size)
>>  {
>>      bool small = size < huge_page_size;
>>      uintptr_t good_enough = 0;
>>
>> -    // FIXME: use lower_bound or something
>> -    auto p = vma_list.begin();
>> +    SCOPE_LOCK(vma_range_set_mutex.for_read());
>>
>
> Unfortunately I'm very rusty with this area of the code, so I have a 
> question that might be stupid:
>
> How come we didn't need a mutex here before, but need one now?
> Could adding a mutex here be a problem - namely that maybe we need to call 
> some of this code from non-preemptable code and this will now break?
>

Yeah, it is a good point. The intention was to guard against concurrent 
calls to linear_map() (which inserts into both linear_vma_set and 
vma_range_set) and find_hole(). Maybe it is not necessary, as this code is 
called from allocate(), which is called with 
SCOPE_LOCK(vma_list_mutex.for_write()) from both map_anon() and map_file(). 
Could these be called from non-preemptable code?

Also, I have added this code in many places in core/mmu.cc (in another patch):

WITH_LOCK(vma_range_set_mutex.for_write()) {
    vma_range_set.erase(vma_range(&dead));
}
 
but I think none of them would ever be called from non-preemptable code, 
would they?
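
For the record, the lock nesting I am describing would look like this 
standalone sketch in plain C++ (std::shared_mutex stands in for OSv's 
rwlock_t; the function names merely mirror the call chain above):

#include <mutex>
#include <shared_mutex>

std::shared_mutex vma_list_lock;      // taken for write by map_anon()/map_file()
std::shared_mutex vma_range_set_lock; // the new inner lock used by find_hole()

void find_hole_sketch() {
    // Inner read lock - concurrent searches are fine, but writers such
    // as linear_map() are excluded while we walk the range set
    std::shared_lock<std::shared_mutex> inner(vma_range_set_lock);
    // ... look for a hole ...
}

void map_anon_sketch() {
    // Outer write lock, as in the existing code
    std::unique_lock<std::shared_mutex> outer(vma_list_lock);
    find_hole_sketch();  // allocate() -> find_hole() runs under both locks
}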

>> +    //Find first vma range which starts before the start parameter or is 
>> the 1st one
>> +    auto p = std::lower_bound(vma_range_set.begin(), 
>> vma_range_set.end(), start, vma_range_addr_compare());
>> +    if (p != vma_range_set.begin()) {
>> +        --p;
>> +    }
>>      auto n = std::next(p);
>> -    while (n != vma_list.end()) {
>> +    while (n->start() <= upper_vma_limit) { //we only go up to the upper 
>> mmap vma limit
>> +        //See if desired hole fits between p and n vmas
>>          if (start >= p->end() && start + size <= n->start()) {
>>              return start;
>>          }
>> +        //See if shifting start to the end of p makes desired hole fit 
>> between p and n
>>          if (p->end() >= start && n->start() - p->end() >= size) {
>>              good_enough = p->end();
>>              if (small) {
>>                  return good_enough;
>>              }
>> +            //See if huge hole fits between p and n
>>              if (n->start() - align_up(good_enough, huge_page_size) >= 
>> size) {
>>                  return align_up(good_enough, huge_page_size);
>>              }
>>          }
>> +        //If nothing worked move next in the list
>>          p = n;
>>          ++n;
>>      }
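
To make the search above easier to follow, here is the same logic as a 
standalone sketch in plain C++ (simplified: a sorted vector of [start, end) 
ranges, no huge-page handling, and the sentinel edge entries assumed to be 
present as in the patch):

#include <cstdint>
#include <algorithm>
#include <vector>

struct r { uintptr_t start, end; };

uintptr_t find_hole_sketch(const std::vector<r>& ranges,
                           uintptr_t start, uintptr_t size)
{
    // Find the first range starting at or after 'start', then step back
    // one so 'p' is the last range that begins before it (if any)
    auto p = std::lower_bound(ranges.begin(), ranges.end(), start,
        [](const r& a, uintptr_t v) { return a.start < v; });
    if (p != ranges.begin()) {
        --p;
    }
    for (auto n = std::next(p); n != ranges.end(); p = n++) {
        if (start >= p->end && start + size <= n->start) {
            return start;      // the requested hole fits as-is
        }
        if (p->end >= start && n->start - p->end >= size) {
            return p->end;     // a hole right after p fits
        }
    }
    return 0;                  // no hole found
}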
>> @@ -999,6 +1033,9 @@ ulong evacuate(uintptr_t start, uintptr_t end)
>>                  memory::stats::on_jvm_heap_free(size);
>>              }
>>              vma_list.erase(dead);
>> +            WITH_LOCK(vma_range_set_mutex.for_write()) {
>> +                vma_range_set.erase(vma_range(&dead));
>> +            }
>>              delete &dead;
>>          }
>>      }
>> @@ -1140,6 +1177,9 @@ uintptr_t allocate(vma *v, uintptr_t start, size_t 
>> size, bool search)
>>      v->set(start, start+size);
>>
>>      vma_list.insert(*v);
>> +    WITH_LOCK(vma_range_set_mutex.for_write()) {
>> +        vma_range_set.insert(vma_range(v));
>> +    }
>>
>>      return start;
>>  }
>> @@ -1493,6 +1533,9 @@ void anon_vma::split(uintptr_t edge)
>>      vma* n = new anon_vma(addr_range(edge, _range.end()), _perm, _flags);
>>      set(_range.start(), edge);
>>      vma_list.insert(*n);
>> +    WITH_LOCK(vma_range_set_mutex.for_write()) {
>> +        vma_range_set.insert(vma_range(n));
>> +    }
>>  }
>>
>>  error anon_vma::sync(uintptr_t start, uintptr_t end)
>> @@ -1600,6 +1643,9 @@ jvm_balloon_vma::~jvm_balloon_vma()
>>      // for a dangling mapping representing a balloon that was already 
>> moved
>>      // out.
>>      vma_list.erase(*this);
>> +    WITH_LOCK(vma_range_set_mutex.for_write()) {
>> +        vma_range_set.erase(vma_range(this));
>> +    }
>>      assert(!(_real_flags & mmap_jvm_balloon));
>>      mmu::map_anon(addr(), size(), _real_flags, _real_perm);
>>
>> @@ -1667,6 +1713,9 @@ ulong map_jvm(unsigned char* jvm_addr, size_t size, 
>> size_t align, balloon_ptr b)
>>                      // Since we will change its position in the tree, 
>> for the sake of future
>>                      // lookups we need to reinsert it.
>>                      vma_list.erase(*jvma);
>> +                    WITH_LOCK(vma_range_set_mutex.for_write()) {
>> +                        vma_range_set.erase(vma_range(jvma));
>> +                    }
>>                      if (jvma->start() < start) {
>>                          assert(jvma->partial() >= (jvma->end() - start));
>>                          jvma->set(jvma->start(), start);
>> @@ -1675,11 +1724,17 @@ ulong map_jvm(unsigned char* jvm_addr, size_t 
>> size, size_t align, balloon_ptr b)
>>                          jvma->set(end, jvma->end());
>>                      }
>>                      vma_list.insert(*jvma);
>> +                    WITH_LOCK(vma_range_set_mutex.for_write()) {
>> +                        vma_range_set.insert(vma_range(jvma));
>> +                    }
>>                  } else {
>>                      // Note how v and jvma are different. This is 
>> because this one,
>>                      // we will delete.
>>                      auto& v = *i--;
>>                      vma_list.erase(v);
>> +                    WITH_LOCK(vma_range_set_mutex.for_write()) {
>> +                        vma_range_set.erase(vma_range(&v));
>> +                    }
>>                      // Finish the move. In practice, it will temporarily 
>> remap an
>>                      // anon mapping here, but this should be rare. Let's 
>> not
>>                      // complicate the code to optimize it. There are no
>> @@ -1692,6 +1747,9 @@ ulong map_jvm(unsigned char* jvm_addr, size_t size, 
>> size_t align, balloon_ptr b)
>>
>>          evacuate(start, start + size);
>>          vma_list.insert(*vma);
>> +        WITH_LOCK(vma_range_set_mutex.for_write()) {
>> +            vma_range_set.insert(vma_range(vma));
>> +        }
>>          return vma->size();
>>      }
>>      return 0;
>> @@ -1753,6 +1811,9 @@ void file_vma::split(uintptr_t edge)
>>      vma *n = _file->mmap(addr_range(edge, _range.end()), _flags, _perm, 
>> off).release();
>>      set(_range.start(), edge);
>>      vma_list.insert(*n);
>> +    WITH_LOCK(vma_range_set_mutex.for_write()) {
>> +        vma_range_set.insert(vma_range(n));
>> +    }
>>  }
>>
>>  error file_vma::sync(uintptr_t start, uintptr_t end)
>> @@ -1903,6 +1964,9 @@ void linear_map(void* _virt, phys addr, size_t 
>> size, const char* name,
>>      WITH_LOCK(linear_vma_set_mutex.for_write()) {
>>         linear_vma_set.insert(_vma);
>>      }
>> +    WITH_LOCK(vma_range_set_mutex.for_write()) {
>> +       vma_range_set.insert(vma_range(_vma));
>> +    }
>>  }
>>
>>  void free_initial_memory_range(uintptr_t addr, size_t size)
>> diff --git a/include/osv/mmu.hh b/include/osv/mmu.hh
>> --- a/include/osv/mmu.hh
>> +++ b/include/osv/mmu.hh
>> @@ -90,6 +90,37 @@ public:
>>      boost::intrusive::set_member_hook<> _vma_list_hook;
>>  };
>>
>> +struct vma_range {
>> +    const void* _vma;
>> +    bool _is_linear;
>> +
>> +    vma_range(const linear_vma* v) {
>> +       _vma = v;
>> +       _is_linear = true;
>> +    }
>> +
>> +    vma_range(const vma* v) {
>> +       _vma = v;
>> +       _is_linear = false;
>> +    }
>> +
>> +    uintptr_t start() const {
>> +       if (_is_linear) {
>> +          return static_cast<const linear_vma*>(_vma)->v_start();
>> +       } else {
>> +          return static_cast<const vma*>(_vma)->start();
>> +       }
>> +    }
>> +
>> +    uintptr_t end() const {
>> +       if (_is_linear) {
>> +          return static_cast<const linear_vma*>(_vma)->v_end();
>> +       } else {
>> +          return static_cast<const vma*>(_vma)->end();
>> +       }
>> +    }
>> +};
>> +
>>  class anon_vma : public vma {
>>  public:
>>      anon_vma(addr_range range, unsigned perm, unsigned flags);
>> diff --git a/include/osv/prio.hh b/include/osv/prio.hh
>> --- a/include/osv/prio.hh
>> +++ b/include/osv/prio.hh
>> @@ -16,6 +16,7 @@ enum {
>>      cpus,
>>      fpranges,
>>      pt_root,
>> +    vma_range_set,
>>      linear_vma_set,
>>      mempool,
>>      routecache,
>>
