So far the pagecache layer has been tightly integrated with ZFS, and
more specifically with its ARC cache layer. This patch refactors
the pagecache implementation to support filesystems other than ZFS.

In essence, we modify all the necessary places - such as retrieving and
releasing cached file pages - to behave slightly differently depending on
the filesystem (ZFS versus non-ZFS) a given vnode belongs to. The changes
only apply to the read cache, where ZFS page caching requires tighter
integration with the ARC.

This patch also adds a new integration function - map_read_cached_page() -
intended to be used by non-ZFS filesystem implementations to register
cached file pages with the page cache.
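
For example, a non-ZFS filesystem's read path might use it roughly like
this (a sketch only: my_fs_fill_page() and the surrounding error handling
are hypothetical, and only pagecache::map_read_cached_page() comes from
this patch):

    int my_fs_read_page_from_cache(struct vnode* vp, pagecache::hashkey* key,
                                   off_t offset)
    {
        void* page = memory::alloc_page();
        // fill the page with file data at the given offset (fs-specific)
        int error = my_fs_fill_page(vp, page, offset);
        if (error) {
            memory::free_page(page);
            return error;
        }
        // register the page so subsequent read faults can map it directly
        pagecache::map_read_cached_page(key, page);
        return 0;
    }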

Signed-off-by: Waldemar Kozaczuk <jwkozac...@gmail.com>
---
 core/pagecache.cc        | 176 ++++++++++++++++++++++++++++-----------
 fs/vfs/vfs_fops.cc       |   2 +-
 include/osv/pagecache.hh |   1 +
 include/osv/vfs_file.hh  |   2 +-
 4 files changed, 131 insertions(+), 50 deletions(-)

diff --git a/core/pagecache.cc b/core/pagecache.cc
index d5d2ee86..426e2a1c 100644
--- a/core/pagecache.cc
+++ b/core/pagecache.cc
@@ -161,7 +161,7 @@ protected:
 public:
     cached_page(hashkey key, void* page) : _key(key), _page(page) {
     }
-    ~cached_page() {
+    virtual ~cached_page() {
     }
 
     void map(mmu::hw_ptep<0> ptep) {
@@ -198,7 +198,7 @@ public:
         _vp = fp->f_dentry->d_vnode;
         vref(_vp);
     }
-    ~cached_page_write() {
+    virtual ~cached_page_write() {
         if (_page) {
             if (_dirty) {
                 writeback();
@@ -238,7 +238,7 @@ public:
 
 class cached_page_arc;
 
-unsigned drop_read_cached_page(cached_page_arc* cp, bool flush = true);
+static unsigned drop_arc_read_cached_page(cached_page_arc* cp, bool flush = true);
 
 class cached_page_arc : public cached_page {
 public:
@@ -267,7 +267,7 @@ private:
 
 public:
     cached_page_arc(hashkey key, void* page, arc_buf_t* ab) : cached_page(key, page), _ab(ref(ab, this)) {}
-    ~cached_page_arc() {
+    virtual ~cached_page_arc() {
         if (!_removed && unref(_ab, this)) {
             arc_unshare_buf(_ab);
         }
@@ -282,7 +282,7 @@ public:
         std::for_each(it.first, it.second, [&count](arc_map::value_type& p) {
                 auto cp = p.second;
                 cp->_removed = true;
-                count += drop_read_cached_page(cp, false);
+                count += drop_arc_read_cached_page(cp, false);
         });
         arc_cache_map.erase(ab);
         if (count) {
@@ -296,10 +296,14 @@ static bool operator==(const cached_page_arc::arc_map::value_type& l, const cach
 }
 
 std::unordered_multimap<arc_buf_t*, cached_page_arc*> cached_page_arc::arc_cache_map;
-static std::unordered_map<hashkey, cached_page_arc*> read_cache;
+// Map used to store read cache pages for the ZFS filesystem, which interacts with the ARC
+static std::unordered_map<hashkey, cached_page_arc*> arc_read_cache;
+// Map used to store read cache pages for non-ZFS filesystems
+static std::unordered_map<hashkey, cached_page*> read_cache;
 static std::unordered_map<hashkey, cached_page_write*> write_cache;
 static std::deque<cached_page_write*> write_lru;
-static mutex arc_lock; // protects against parallel access to the read cache
+static mutex arc_read_lock; // protects against parallel access to the ARC read cache
+static mutex read_lock; // protects against parallel access to the read cache
 static mutex write_lock; // protect against parallel access to the write cache
 
 template<typename T>
@@ -314,38 +318,57 @@ static T find_in_cache(std::unordered_map<hashkey, T>& 
cache, hashkey& key)
     }
 }
 
+static void add_read_mapping(cached_page *cp, mmu::hw_ptep<0> ptep)
+{
+    cp->map(ptep);
+}
+
 TRACEPOINT(trace_add_read_mapping, "buf=%p, addr=%p, ptep=%p", void*, void*, void*);
-void add_read_mapping(cached_page_arc *cp, mmu::hw_ptep<0> ptep)
+static void add_arc_read_mapping(cached_page_arc *cp, mmu::hw_ptep<0> ptep)
 {
     trace_add_read_mapping(cp->arcbuf(), cp->addr(), ptep.release());
-    cp->map(ptep);
+    add_read_mapping(cp, ptep);
 }
 
-TRACEPOINT(trace_remove_mapping, "buf=%p, addr=%p, ptep=%p", void*, void*, void*);
-void remove_read_mapping(cached_page_arc* cp, mmu::hw_ptep<0> ptep)
+template<typename T>
+static void remove_read_mapping(std::unordered_map<hashkey, T>& cache, cached_page* cp, mmu::hw_ptep<0> ptep)
 {
-    trace_remove_mapping(cp->arcbuf(), cp->addr(), ptep.release());
     if (cp->unmap(ptep) == 0) {
-        read_cache.erase(cp->key());
+        cache.erase(cp->key());
         delete cp;
     }
 }
 
+TRACEPOINT(trace_remove_mapping, "buf=%p, addr=%p, ptep=%p", void*, void*, void*);
+static void remove_arc_read_mapping(cached_page_arc* cp, mmu::hw_ptep<0> ptep)
+{
+    trace_remove_mapping(cp->arcbuf(), cp->addr(), ptep.release());
+    remove_read_mapping(arc_read_cache, cp, ptep);
+}
+
 void remove_read_mapping(hashkey& key, mmu::hw_ptep<0> ptep)
 {
-    SCOPE_LOCK(arc_lock);
-    cached_page_arc* cp = find_in_cache(read_cache, key);
+    SCOPE_LOCK(read_lock);
+    cached_page* cp = find_in_cache(read_cache, key);
     if (cp) {
-        remove_read_mapping(cp, ptep);
+        remove_read_mapping(read_cache, cp, ptep);
     }
 }
 
-TRACEPOINT(trace_drop_read_cached_page, "buf=%p, addr=%p", void*, void*);
-unsigned drop_read_cached_page(cached_page_arc* cp, bool flush)
+void remove_arc_read_mapping(hashkey& key, mmu::hw_ptep<0> ptep)
+{
+    SCOPE_LOCK(arc_read_lock);
+    cached_page_arc* cp = find_in_cache(arc_read_cache, key);
+    if (cp) {
+        remove_arc_read_mapping(cp, ptep);
+    }
+}
+
+template<typename T>
+static unsigned drop_read_cached_page(std::unordered_map<hashkey, T>& cache, cached_page* cp, bool flush)
 {
-    trace_drop_read_cached_page(cp->arcbuf(), cp->addr());
     int flushed = cp->flush();
-    read_cache.erase(cp->key());
+    cache.erase(cp->key());
 
     if (flush && flushed > 1) { // if there was only one pte it is the one we are faulting on; no need to flush.
         mmu::flush_tlb_all();
@@ -356,12 +379,28 @@ unsigned drop_read_cached_page(cached_page_arc* cp, bool flush)
     return flushed;
 }
 
-void drop_read_cached_page(hashkey& key)
+static unsigned drop_arc_read_cached_page(cached_page_arc* cp, bool flush)
+{
+    return drop_read_cached_page(arc_read_cache, cp, flush);
+}
+
+static void drop_read_cached_page(hashkey& key)
 {
-    SCOPE_LOCK(arc_lock);
-    cached_page_arc* cp = find_in_cache(read_cache, key);
+    SCOPE_LOCK(read_lock);
+    cached_page* cp = find_in_cache(read_cache, key);
     if (cp) {
-        drop_read_cached_page(cp, true);
+        drop_read_cached_page(read_cache, cp, true);
+    }
+}
+
+TRACEPOINT(trace_drop_read_cached_page, "buf=%p, addr=%p", void*, void*);
+static void drop_arc_read_cached_page(hashkey& key)
+{
+    SCOPE_LOCK(arc_read_lock);
+    cached_page_arc* cp = find_in_cache(arc_read_cache, key);
+    if (cp) {
+        trace_drop_read_cached_page(cp->arcbuf(), cp->addr());
+        drop_read_cached_page(arc_read_cache, cp, true);
     }
 }
 
@@ -369,7 +408,7 @@ TRACEPOINT(trace_unmap_arc_buf, "buf=%p", void*);
 void unmap_arc_buf(arc_buf_t* ab)
 {
     trace_unmap_arc_buf(ab);
-    SCOPE_LOCK(arc_lock);
+    SCOPE_LOCK(arc_read_lock);
     cached_page_arc::unmap_arc_buf(ab);
 }
 
@@ -377,15 +416,22 @@ TRACEPOINT(trace_map_arc_buf, "buf=%p page=%p", void*, void*);
 void map_arc_buf(hashkey *key, arc_buf_t* ab, void *page)
 {
     trace_map_arc_buf(ab, page);
-    SCOPE_LOCK(arc_lock);
+    SCOPE_LOCK(arc_read_lock);
     cached_page_arc* pc = new cached_page_arc(*key, page, ab);
-    read_cache.emplace(*key, pc);
+    arc_read_cache.emplace(*key, pc);
     arc_share_buf(ab);
 }
 
+void map_read_cached_page(hashkey *key, void *page)
+{
+    SCOPE_LOCK(read_lock);
+    cached_page* pc = new cached_page(*key, page);
+    read_cache.emplace(*key, pc);
+}
+
 static int create_read_cached_page(vfs_file* fp, hashkey& key)
 {
-    return fp->get_arcbuf(&key, key.offset);
+    return fp->read_page_from_cache(&key, key.offset);
 }
 
 static std::unique_ptr<cached_page_write> create_write_cached_page(vfs_file* fp, hashkey& key)
@@ -422,6 +468,8 @@ static void insert(cached_page_write* cp) {
     }
 }
 
+#define IS_ZFS(fsid) (fsid.__val[1] == ZFS_ID)
+
 bool get(vfs_file* fp, off_t offset, mmu::hw_ptep<0> ptep, mmu::pt_element<0> pte, bool write, bool shared)
 {
     struct stat st;
@@ -430,6 +478,8 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep<0> ptep, mmu::pt_element<0> pt
     SCOPE_LOCK(write_lock);
     cached_page_write* wcp = find_in_cache(write_cache, key);
 
+    bool zfs = IS_ZFS(fp->f_dentry->d_vnode->v_mount->m_fsid);
+
     if (write) {
         if (!wcp) {
             auto newcp = create_write_cached_page(fp, key);
@@ -437,17 +487,25 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep<0> ptep, mmu::pt_element<0> pt
                 // write fault into shared mapping, there page is not in write cache yet, add it.
                 wcp = newcp.release();
                 insert(wcp);
-                // page is moved from ARC to write cache
-                // drop ARC page if exists, removing all mappings
-                drop_read_cached_page(key);
+                // page is moved from the read cache to the write cache
+                // drop the read cache page if it exists, removing all mappings
+                if (zfs) {
+                    drop_arc_read_cached_page(key);
+                } else {
+                    drop_read_cached_page(key);
+                }
             } else {
-                // remove mapping to ARC page if exists
-                remove_read_mapping(key, ptep);
-                // cow of private page from ARC
+                // remove the mapping to the read cache page if it exists
+                if (zfs) {
+                    remove_arc_read_mapping(key, ptep);
+                } else {
+                    remove_read_mapping(key, ptep);
+                }
+                // cow (copy-on-write) of private page from read cache
                 return mmu::write_pte(newcp->release(), ptep, pte);
             }
         } else if (!shared) {
-            // cow of private page from write cache
+            // cow (copy-on-write) of private page from write cache
             void* page = memory::alloc_page();
             memcpy(page, wcp->addr(), mmu::page_size);
             return mmu::write_pte(page, ptep, pte);
@@ -456,11 +514,22 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep<0> ptep, mmu::pt_element<0> pt
         int ret;
         // read fault and page is not in write cache yet, return one from ARC, mark it cow
         do {
-            WITH_LOCK(arc_lock) {
-                cached_page_arc* cp = find_in_cache(read_cache, key);
-                if (cp) {
-                    add_read_mapping(cp, ptep);
-                    return mmu::write_pte(cp->addr(), ptep, mmu::pte_mark_cow(pte, true));
+            if (zfs) {
+                WITH_LOCK(arc_read_lock) {
+                    cached_page_arc* cp = find_in_cache(arc_read_cache, key);
+                    if (cp) {
+                        add_arc_read_mapping(cp, ptep);
+                        return mmu::write_pte(cp->addr(), ptep, mmu::pte_mark_cow(pte, true));
+                    }
+                }
+            }
+            else {
+                WITH_LOCK(read_lock) {
+                    cached_page* cp = find_in_cache(read_cache, key);
+                    if (cp) {
+                        add_read_mapping(cp, ptep);
+                        return mmu::write_pte(cp->addr(), ptep, mmu::pte_mark_cow(pte, true));
+                    }
                 }
             }
 
@@ -513,12 +582,23 @@ bool release(vfs_file* fp, void *addr, off_t offset, mmu::hw_ptep<0> ptep)
         }
     }
 
-    WITH_LOCK(arc_lock) {
-        cached_page_arc* rcp = find_in_cache(read_cache, key);
-        if (rcp && mmu::virt_to_phys(rcp->addr()) == old.addr()) {
-            // page is in ARC
-            remove_read_mapping(rcp, ptep);
-            return false;
+    if (IS_ZFS(fp->f_dentry->d_vnode->v_mount->m_fsid)) {
+        WITH_LOCK(arc_read_lock) {
+            cached_page_arc* rcp = find_in_cache(arc_read_cache, key);
+            if (rcp && mmu::virt_to_phys(rcp->addr()) == old.addr()) {
+                // page is in ARC read cache
+                remove_arc_read_mapping(rcp, ptep);
+                return false;
+            }
+        }
+    } else {
+        WITH_LOCK(read_lock) {
+            cached_page* rcp = find_in_cache(read_cache, key);
+            if (rcp && mmu::virt_to_phys(rcp->addr()) == old.addr()) {
+                // page is in regular read cache
+                remove_read_mapping(read_cache, rcp, ptep);
+                return false;
+            }
         }
     }
 
@@ -595,7 +675,7 @@ private:
             auto start = sched::thread::current()->thread_clock();
             auto deadline = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::nanoseconds(static_cast<unsigned long>(work/_freq))) + start;
 
-            WITH_LOCK(arc_lock) {
+            WITH_LOCK(arc_read_lock) {
                 while (sched::thread::current()->thread_clock() < deadline && buckets_scanned < bucket_count) {
                     if (current_bucket >= cached_page_arc::arc_cache_map.bucket_count()) {
                         current_bucket = 0;
@@ -617,7 +697,7 @@ private:
 
                     // mark ARC buffers as accessed when we have 1024 of them
                     if (!(cleared % 1024)) {
-                        DROP_LOCK(arc_lock) {
+                        DROP_LOCK(arc_read_lock) {
                             flush = mark_accessed(accessed);
                         }
                     }
diff --git a/fs/vfs/vfs_fops.cc b/fs/vfs/vfs_fops.cc
index 3a8f98b4..0b73cecd 100644
--- a/fs/vfs/vfs_fops.cc
+++ b/fs/vfs/vfs_fops.cc
@@ -157,7 +157,7 @@ void vfs_file::sync(off_t start, off_t end)
 // eviction that will hold the mmu-side lock that protects the mappings
 // Always follow that order. We however can't just get rid of the mmu-side lock,
 // because not all invalidations will be synchronous.
-int vfs_file::get_arcbuf(void* key, off_t offset)
+int vfs_file::read_page_from_cache(void* key, off_t offset)
 {
     struct vnode *vp = f_dentry->d_vnode;
 
diff --git a/include/osv/pagecache.hh b/include/osv/pagecache.hh
index f3881b19..efb7b32d 100644
--- a/include/osv/pagecache.hh
+++ b/include/osv/pagecache.hh
@@ -35,4 +35,5 @@ bool release(vfs_file* fp, void *addr, off_t offset, mmu::hw_ptep<0> ptep);
 void sync(vfs_file* fp, off_t start, off_t end);
 void unmap_arc_buf(arc_buf_t* ab);
 void map_arc_buf(hashkey* key, arc_buf_t* ab, void* page);
+void map_read_cached_page(hashkey *key, void *page);
 }
diff --git a/include/osv/vfs_file.hh b/include/osv/vfs_file.hh
index ffda5a4c..bb02555a 100644
--- a/include/osv/vfs_file.hh
+++ b/include/osv/vfs_file.hh
@@ -26,7 +26,7 @@ public:
     virtual bool put_page(void *addr, uintptr_t offset, mmu::hw_ptep<0> ptep);
     virtual void sync(off_t start, off_t end);
 
-    int get_arcbuf(void *key, off_t offset);
+    int read_page_from_cache(void *key, off_t offset);
 };
 
 #endif /* VFS_FILE_HH_ */
-- 
2.20.1
