On Saturday, April 4, 2020 at 12:59:04 AM UTC-4, Waldek Kozaczuk wrote:
>
> So far the pagecache layer has been pretty tightly integrated with ZFS 
> and more specifically with its ARC cache layer. This patch refactors 
> the pagecache implementation to make it support other filesystems than 
> zfs. 
>
> In essence we modify all necessary places like retrieving and releasing 
> cached file pages to behave slightly differently depending on the 
> filesystem 
> (ZFS versus non-ZFS) given vnode belongs to. The changes only apply to 
> read cache where ZFS page caching requires tighter integration with ARC. 
>
> This patch adds new integration function - map_read_cached_page() - 
> intended to be used by non-ZFS filesystem implementations to register 
> cached file pages into page cache. 
>
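(To make the intended usage concrete for reviewers: a non-ZFS filesystem's
page-read path would call the new hook roughly like the sketch below. This is
just a hypothetical illustration, not the actual ROFS code -
fill_page_from_file() is a made-up placeholder for whatever the filesystem
uses to read one page of file data at the given offset.)

    static int my_fs_read_cached_page(vfs_file* fp, pagecache::hashkey* key, off_t offset)
    {
        void* page = memory::alloc_page();
        int error = fill_page_from_file(fp, page, offset); // hypothetical helper
        if (error) {
            memory::free_page(page);
            return error;
        }
        // register the page in the non-ZFS read cache so that subsequent
        // read faults can map it (copy-on-write for private mappings)
        pagecache::map_read_cached_page(key, page);
        return 0;
    }
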
> Signed-off-by: Waldemar Kozaczuk <[email protected]> 
> --- 
>  core/pagecache.cc        | 176 ++++++++++++++++++++++++++++----------- 
>  fs/vfs/vfs_fops.cc       |   2 +- 
>  include/osv/pagecache.hh |   1 + 
>  include/osv/vfs_file.hh  |   2 +- 
>  4 files changed, 131 insertions(+), 50 deletions(-) 
>
> diff --git a/core/pagecache.cc b/core/pagecache.cc 
> index d5d2ee86..426e2a1c 100644 
> --- a/core/pagecache.cc 
> +++ b/core/pagecache.cc 
> @@ -161,7 +161,7 @@ protected: 
>  public: 
>      cached_page(hashkey key, void* page) : _key(key), _page(page) { 
>      } 
> -    ~cached_page() { 
> +    virtual ~cached_page() { 
>      } 
>   
>      void map(mmu::hw_ptep<0> ptep) { 
> @@ -198,7 +198,7 @@ public: 
>          _vp = fp->f_dentry->d_vnode; 
>          vref(_vp); 
>      } 
> -    ~cached_page_write() { 
> +    virtual ~cached_page_write() { 
>          if (_page) { 
>              if (_dirty) { 
>                  writeback(); 
> @@ -238,7 +238,7 @@ public: 
>   
>  class cached_page_arc; 
>   
> -unsigned drop_read_cached_page(cached_page_arc* cp, bool flush = true); 
> +static unsigned drop_arc_read_cached_page(cached_page_arc* cp, bool flush 
> = true); 
>   
>  class cached_page_arc : public cached_page { 
>  public: 
> @@ -267,7 +267,7 @@ private: 
>   
>  public: 
>      cached_page_arc(hashkey key, void* page, arc_buf_t* ab) : 
> cached_page(key, page), _ab(ref(ab, this)) {} 
> -    ~cached_page_arc() { 
> +    virtual ~cached_page_arc() { 
>          if (!_removed && unref(_ab, this)) { 
>              arc_unshare_buf(_ab); 
>          } 
> @@ -282,7 +282,7 @@ public: 
>          std::for_each(it.first, it.second, [&count](arc_map::value_type& 
> p) { 
>                  auto cp = p.second; 
>                  cp->_removed = true; 
> -                count += drop_read_cached_page(cp, false); 
> +                count += drop_arc_read_cached_page(cp, false); 
>          }); 
>          arc_cache_map.erase(ab); 
>          if (count) { 
> @@ -296,10 +296,14 @@ static bool operator==(const 
> cached_page_arc::arc_map::value_type& l, const cach 
>  } 
>   
>  std::unordered_multimap<arc_buf_t*, cached_page_arc*> 
> cached_page_arc::arc_cache_map; 
> -static std::unordered_map<hashkey, cached_page_arc*> read_cache; 
> +//Map used to store read cache pages for ZFS filesystem interacting with 
> ARC 
> +static std::unordered_map<hashkey, cached_page_arc*> arc_read_cache; 
> +//Map used to store read cache pages for non-ZFS filesystems 
> +static std::unordered_map<hashkey, cached_page*> read_cache; 
>  static std::unordered_map<hashkey, cached_page_write*> write_cache; 
>  static std::deque<cached_page_write*> write_lru; 
> -static mutex arc_lock; // protects against parallel access to the read 
> cache 
> +static mutex arc_read_lock; // protects against parallel access to the 
> ARC read cache 
> +static mutex read_lock; // protects against parallel access to the read 
> cache 
>  static mutex write_lock; // protect against parallel access to the write 
> cache 
>   
>  template<typename T> 
> @@ -314,38 +318,57 @@ static T find_in_cache(std::unordered_map<hashkey, 
> T>& cache, hashkey& key) 
>      } 
>  } 
>   
> +static void add_read_mapping(cached_page *cp, mmu::hw_ptep<0> ptep) 
> +{ 
> +    cp->map(ptep); 
> +} 
> + 
>  TRACEPOINT(trace_add_read_mapping, "buf=%p, addr=%p, ptep=%p", void*, 
> void*, void*); 
> -void add_read_mapping(cached_page_arc *cp, mmu::hw_ptep<0> ptep) 
> +static void add_arc_read_mapping(cached_page_arc *cp, mmu::hw_ptep<0> 
> ptep) 
>  { 
>      trace_add_read_mapping(cp->arcbuf(), cp->addr(), ptep.release()); 
> -    cp->map(ptep); 
> +    add_read_mapping(cp, ptep); 
>  } 
>   
> -TRACEPOINT(trace_remove_mapping, "buf=%p, addr=%p, ptep=%p", void*, 
> void*, void*); 
> -void remove_read_mapping(cached_page_arc* cp, mmu::hw_ptep<0> ptep) 
> +template<typename T> 
> +static void remove_read_mapping(std::unordered_map<hashkey, T>& cache, 
> cached_page* cp, mmu::hw_ptep<0> ptep) 
>  { 
> -    trace_remove_mapping(cp->arcbuf(), cp->addr(), ptep.release()); 
>      if (cp->unmap(ptep) == 0) { 
> -        read_cache.erase(cp->key()); 
> +        cache.erase(cp->key()); 
>          delete cp; 
>      } 
>  } 
>   
> +TRACEPOINT(trace_remove_mapping, "buf=%p, addr=%p, ptep=%p", void*, 
> void*, void*); 
> +static void remove_arc_read_mapping(cached_page_arc* cp, mmu::hw_ptep<0> 
> ptep) 
> +{ 
> +    trace_remove_mapping(cp->arcbuf(), cp->addr(), ptep.release()); 
> +    remove_read_mapping(arc_read_cache, cp, ptep); 
> +} 
> + 
>  void remove_read_mapping(hashkey& key, mmu::hw_ptep<0> ptep) 
>  { 
> -    SCOPE_LOCK(arc_lock); 
> -    cached_page_arc* cp = find_in_cache(read_cache, key); 
> +    SCOPE_LOCK(read_lock); 
> +    cached_page* cp = find_in_cache(read_cache, key); 
>      if (cp) { 
> -        remove_read_mapping(cp, ptep); 
> +        remove_read_mapping(read_cache, cp, ptep); 
>

Add this here, right after the remove_read_mapping(read_cache, cp, ptep) call:
    mmu::flush_tlb_all();

I was about to push this series of patches a couple of days ago when I came
across crashes while stress-testing one of the apps with wrk. More
specifically, it was the graalvm-netty-plot app using isolates, as described
in this article -
https://medium.com/graalvm/isolates-and-compressed-references-more-flexible-and-efficient-memory-management-for-graalvm-a044cc50b67e.

I do not fully understand the nature of the crashes - it looks like an
internal app null-pointer/assert type of error that happens 60-70% of the
time; there is no error on the OSv side, the app simply exits, most of the
time with an error message. The same app works perfectly on OSv without these
patches, and on Linux.

It took me a while to figure out what is wrong, and honestly, I am not 100%
sure I am right. But here is my theory. I noticed that the crash happens only
at the beginning of the test, if it happens at all. If I execute a single
request with curl first and then stress-test with wrk, there are no crashes.
So my observation was that a crash would happen under stress when multiple
app threads create mappings and at the same time execute code causing page
faults that make ROFS load data from disk into its cache. Once all the code
was in the ROFS cache in memory, the crashes would not happen.

After putting all kinds of assertions in the right places to make sure that
read_cache is properly locked when accessed for read and write, and
validating many other things in the ROFS layer, I could not find any problems
with the code. Then I came up with a speculative scenario that might be the
source of the crashes. Imagine pagecache::get() is called during a page fault
to handle an attempt to write to a page that is mapped read-only (for COW):
the code first removes the read mapping for that virtual page from the page
table (see remove_read_mapping() with 2 parameters) and then updates the
mapping to point to a newly allocated page. If the application thread then
continues to execute on the same CPU 1, it should see the new mapping without
a TLB flush (which we do not do here), as the old translation would have been
discarded by the protection fault (is my thinking correct?). But what if the
same thread is migrated to another CPU 2 and accesses data using the same
virtual address? Quite likely the TLB on that CPU might still point to the
old read-only physical frame, and the thread would simply see stale data. I
do not think we are handling this case in any way.

After adding mmu::flush_tlb_all() to remove_read_mapping() (which is only
used with ROFS) the crashes went away. So is my theory right? And is the
solution correct? Flushing all TLBs is pretty expensive, but do we have
another choice?
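
For reference, what I tested is roughly the 2-parameter remove_read_mapping()
from the hunk above with the flush added inside the if block:

    void remove_read_mapping(hashkey& key, mmu::hw_ptep<0> ptep)
    {
        SCOPE_LOCK(read_lock);
        cached_page* cp = find_in_cache(read_cache, key);
        if (cp) {
            remove_read_mapping(read_cache, cp, ptep);
            // make sure no other CPU keeps a stale translation to the
            // old read-only frame
            mmu::flush_tlb_all();
        }
    }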

Also, there are holes in my theory. First, the crashes would happen with a
single CPU as well - no migration to a different CPU - so the same CPU should
always see the data consistently. So what might be going on in this case?
Could the vCPU be migrated to a different physical CPU on the host and cause
a similar effect?

Secondly, I could see these crashes with ZFS images as well - why?

What do you think about my theory and the solution - flush_tlb_all()?
 

>      } 
>  } 
>   
> -TRACEPOINT(trace_drop_read_cached_page, "buf=%p, addr=%p", void*, void*); 
> -unsigned drop_read_cached_page(cached_page_arc* cp, bool flush) 
> +void remove_arc_read_mapping(hashkey& key, mmu::hw_ptep<0> ptep) 
> +{ 
> +    SCOPE_LOCK(arc_read_lock); 
> +    cached_page_arc* cp = find_in_cache(arc_read_cache, key); 
> +    if (cp) { 
> +        remove_arc_read_mapping(cp, ptep); 
> +    } 
> +} 
> + 
> +template<typename T> 
> +static unsigned drop_read_cached_page(std::unordered_map<hashkey, T>& 
> cache, cached_page* cp, bool flush) 
>  { 
> -    trace_drop_read_cached_page(cp->arcbuf(), cp->addr()); 
>      int flushed = cp->flush(); 
> -    read_cache.erase(cp->key()); 
> +    cache.erase(cp->key()); 
>   
>      if (flush && flushed > 1) { // if there was only one pte it is the 
> one we are faulting on; no need to flush. 
>          mmu::flush_tlb_all(); 
> @@ -356,12 +379,28 @@ unsigned drop_read_cached_page(cached_page_arc* cp, 
> bool flush) 
>      return flushed; 
>  } 
>   
> -void drop_read_cached_page(hashkey& key) 
> +static unsigned drop_arc_read_cached_page(cached_page_arc* cp, bool 
> flush) 
> +{ 
> +    return drop_read_cached_page(arc_read_cache, cp, flush); 
> +} 
> + 
> +static void drop_read_cached_page(hashkey& key) 
>  { 
> -    SCOPE_LOCK(arc_lock); 
> -    cached_page_arc* cp = find_in_cache(read_cache, key); 
> +    SCOPE_LOCK(read_lock); 
> +    cached_page* cp = find_in_cache(read_cache, key); 
>      if (cp) { 
> -        drop_read_cached_page(cp, true); 
> +        drop_read_cached_page(read_cache, cp, true); 
> +    } 
> +} 
> + 
> +TRACEPOINT(trace_drop_read_cached_page, "buf=%p, addr=%p", void*, void*); 
> +static void drop_arc_read_cached_page(hashkey& key) 
> +{ 
> +    SCOPE_LOCK(arc_read_lock); 
> +    cached_page_arc* cp = find_in_cache(arc_read_cache, key); 
> +    if (cp) { 
> +        trace_drop_read_cached_page(cp->arcbuf(), cp->addr()); 
> +        drop_read_cached_page(arc_read_cache, cp, true); 
>      } 
>  } 
>   
> @@ -369,7 +408,7 @@ TRACEPOINT(trace_unmap_arc_buf, "buf=%p", void*); 
>  void unmap_arc_buf(arc_buf_t* ab) 
>  { 
>      trace_unmap_arc_buf(ab); 
> -    SCOPE_LOCK(arc_lock); 
> +    SCOPE_LOCK(arc_read_lock); 
>      cached_page_arc::unmap_arc_buf(ab); 
>  } 
>   
> @@ -377,15 +416,22 @@ TRACEPOINT(trace_map_arc_buf, "buf=%p page=%p", 
> void*, void*); 
>  void map_arc_buf(hashkey *key, arc_buf_t* ab, void *page) 
>  { 
>      trace_map_arc_buf(ab, page); 
> -    SCOPE_LOCK(arc_lock); 
> +    SCOPE_LOCK(arc_read_lock); 
>      cached_page_arc* pc = new cached_page_arc(*key, page, ab); 
> -    read_cache.emplace(*key, pc); 
> +    arc_read_cache.emplace(*key, pc); 
>      arc_share_buf(ab); 
>  } 
>   
> +void map_read_cached_page(hashkey *key, void *page) 
> +{ 
> +    SCOPE_LOCK(read_lock); 
> +    cached_page* pc = new cached_page(*key, page); 
> +    read_cache.emplace(*key, pc); 
> +} 
> + 
>  static int create_read_cached_page(vfs_file* fp, hashkey& key) 
>  { 
> -    return fp->get_arcbuf(&key, key.offset); 
> +    return fp->read_page_from_cache(&key, key.offset); 
>  } 
>   
>  static std::unique_ptr<cached_page_write> 
> create_write_cached_page(vfs_file* fp, hashkey& key) 
> @@ -422,6 +468,8 @@ static void insert(cached_page_write* cp) { 
>      } 
>  } 
>   
> +#define IS_ZFS(fsid) (fsid.__val[1] == ZFS_ID) 
> + 
>  bool get(vfs_file* fp, off_t offset, mmu::hw_ptep<0> ptep, 
> mmu::pt_element<0> pte, bool write, bool shared) 
>  { 
>      struct stat st; 
> @@ -430,6 +478,8 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep<0> 
> ptep, mmu::pt_element<0> pt 
>      SCOPE_LOCK(write_lock); 
>      cached_page_write* wcp = find_in_cache(write_cache, key); 
>   
> +    bool zfs = IS_ZFS(fp->f_dentry->d_vnode->v_mount->m_fsid); 
> + 
>      if (write) { 
>          if (!wcp) { 
>              auto newcp = create_write_cached_page(fp, key); 
> @@ -437,17 +487,25 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep<0> 
> ptep, mmu::pt_element<0> pt 
>                  // write fault into shared mapping, there page is not in 
> write cache yet, add it. 
>                  wcp = newcp.release(); 
>                  insert(wcp); 
> -                // page is moved from ARC to write cache 
> -                // drop ARC page if exists, removing all mappings 
> -                drop_read_cached_page(key); 
> +                // page is moved from read cache to write cache 
> +                // drop read page if exists, removing all mappings 
> +                if (zfs) { 
> +                    drop_arc_read_cached_page(key); 
> +                } else { 
> +                    drop_read_cached_page(key); 
> +                } 
>              } else { 
> -                // remove mapping to ARC page if exists 
> -                remove_read_mapping(key, ptep); 
> -                // cow of private page from ARC 
> +                // remove mapping to read cache page if exists 
> +                if (zfs) { 
> +                    remove_arc_read_mapping(key, ptep); 
> +                } else { 
> +                    remove_read_mapping(key, ptep); 
> +                } 
> +                // cow (copy-on-write) of private page from read cache 
>                  return mmu::write_pte(newcp->release(), ptep, pte); 
>              } 
>          } else if (!shared) { 
> -            // cow of private page from write cache 
> +            // cow (copy-on-write) of private page from write cache 
>              void* page = memory::alloc_page(); 
>              memcpy(page, wcp->addr(), mmu::page_size); 
>              return mmu::write_pte(page, ptep, pte); 
> @@ -456,11 +514,22 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep<0> 
> ptep, mmu::pt_element<0> pt 
>          int ret; 
>          // read fault and page is not in write cache yet, return one from 
> ARC, mark it cow 
>          do { 
> -            WITH_LOCK(arc_lock) { 
> -                cached_page_arc* cp = find_in_cache(read_cache, key); 
> -                if (cp) { 
> -                    add_read_mapping(cp, ptep); 
> -                    return mmu::write_pte(cp->addr(), ptep, 
> mmu::pte_mark_cow(pte, true)); 
> +            if (zfs) { 
> +                WITH_LOCK(arc_read_lock) { 
> +                    cached_page_arc* cp = find_in_cache(arc_read_cache, 
> key); 
> +                    if (cp) { 
> +                        add_arc_read_mapping(cp, ptep); 
> +                        return mmu::write_pte(cp->addr(), ptep, 
> mmu::pte_mark_cow(pte, true)); 
> +                    } 
> +                } 
> +            } 
> +            else { 
> +                WITH_LOCK(read_lock) { 
> +                    cached_page* cp = find_in_cache(read_cache, key); 
> +                    if (cp) { 
> +                        add_read_mapping(cp, ptep); 
> +                        return mmu::write_pte(cp->addr(), ptep, 
> mmu::pte_mark_cow(pte, true)); 
> +                    } 
>                  } 
>              } 
>   
> @@ -513,12 +582,23 @@ bool release(vfs_file* fp, void *addr, off_t offset, 
> mmu::hw_ptep<0> ptep) 
>          } 
>      } 
>   
> -    WITH_LOCK(arc_lock) { 
> -        cached_page_arc* rcp = find_in_cache(read_cache, key); 
> -        if (rcp && mmu::virt_to_phys(rcp->addr()) == old.addr()) { 
> -            // page is in ARC 
> -            remove_read_mapping(rcp, ptep); 
> -            return false; 
> +    if (IS_ZFS(fp->f_dentry->d_vnode->v_mount->m_fsid)) { 
> +        WITH_LOCK(arc_read_lock) { 
> +            cached_page_arc* rcp = find_in_cache(arc_read_cache, key); 
> +            if (rcp && mmu::virt_to_phys(rcp->addr()) == old.addr()) { 
> +                // page is in ARC read cache 
> +                remove_arc_read_mapping(rcp, ptep); 
> +                return false; 
> +            } 
> +        } 
> +    } else { 
> +        WITH_LOCK(read_lock) { 
> +            cached_page* rcp = find_in_cache(read_cache, key); 
> +            if (rcp && mmu::virt_to_phys(rcp->addr()) == old.addr()) { 
> +                // page is in regular read cache 
> +                remove_read_mapping(read_cache, rcp, ptep); 
> +                return false; 
> +            } 
>          } 
>      } 
>   
> @@ -595,7 +675,7 @@ private: 
>              auto start = sched::thread::current()->thread_clock(); 
>              auto deadline = 
> std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::nanoseconds(static_cast<unsigned
>  
> long>(work/_freq))) + start; 
>   
> -            WITH_LOCK(arc_lock) { 
> +            WITH_LOCK(arc_read_lock) { 
>                  while (sched::thread::current()->thread_clock() < 
> deadline && buckets_scanned < bucket_count) { 
>                      if (current_bucket >= 
> cached_page_arc::arc_cache_map.bucket_count()) { 
>                          current_bucket = 0; 
> @@ -617,7 +697,7 @@ private: 
>   
>                      // mark ARC buffers as accessed when we have 1024 of 
> them 
>                      if (!(cleared % 1024)) { 
> -                        DROP_LOCK(arc_lock) { 
> +                        DROP_LOCK(arc_read_lock) { 
>                              flush = mark_accessed(accessed); 
>                          } 
>                      } 
> diff --git a/fs/vfs/vfs_fops.cc b/fs/vfs/vfs_fops.cc 
> index 3a8f98b4..0b73cecd 100644 
> --- a/fs/vfs/vfs_fops.cc 
> +++ b/fs/vfs/vfs_fops.cc 
> @@ -157,7 +157,7 @@ void vfs_file::sync(off_t start, off_t end) 
>  // eviction that will hold the mmu-side lock that protects the mappings 
>  // Always follow that order. We however can't just get rid of the 
> mmu-side lock, 
>  // because not all invalidations will be synchronous. 
> -int vfs_file::get_arcbuf(void* key, off_t offset) 
> +int vfs_file::read_page_from_cache(void* key, off_t offset) 
>  { 
>      struct vnode *vp = f_dentry->d_vnode; 
>   
> diff --git a/include/osv/pagecache.hh b/include/osv/pagecache.hh 
> index f3881b19..efb7b32d 100644 
> --- a/include/osv/pagecache.hh 
> +++ b/include/osv/pagecache.hh 
> @@ -35,4 +35,5 @@ bool release(vfs_file* fp, void *addr, off_t offset, 
> mmu::hw_ptep<0> ptep); 
>  void sync(vfs_file* fp, off_t start, off_t end); 
>  void unmap_arc_buf(arc_buf_t* ab); 
>  void map_arc_buf(hashkey* key, arc_buf_t* ab, void* page); 
> +void map_read_cached_page(hashkey *key, void *page); 
>  } 
> diff --git a/include/osv/vfs_file.hh b/include/osv/vfs_file.hh 
> index ffda5a4c..bb02555a 100644 
> --- a/include/osv/vfs_file.hh 
> +++ b/include/osv/vfs_file.hh 
> @@ -26,7 +26,7 @@ public: 
>      virtual bool put_page(void *addr, uintptr_t offset, mmu::hw_ptep<0> 
> ptep); 
>      virtual void sync(off_t start, off_t end); 
>   
> -    int get_arcbuf(void *key, off_t offset); 
> +    int read_page_from_cache(void *key, off_t offset); 
>  }; 
>   
>  #endif /* VFS_FILE_HH_ */ 
> -- 
> 2.20.1 
>
>
