From: Fotis Xenakis <[email protected]>
Committer: Waldemar Kozaczuk <[email protected]>
Branch: master

virtio-fs: use multiple dax mappings in filesystem

This integrates the dax window manager with the virtio-fs file system
operations:

- Removes the lock from virtio::fs::dax_window, since this is now
  handled by the window manager.
- A single manager instance is maintained for each virtio-fs device
  having at least a singe mount. Multiple, concurrent mounts of the same
  device share a common dax_manager.
- The FUSE_READ fallback as well as the logic of using it when the DAX
  window is not available or a read using it fails are untouched.

Signed-off-by: Fotis Xenakis <[email protected]>
Message-Id: 
<am0pr03mb629260f9f8cc3272e9cf435aa6...@am0pr03mb6292.eurprd03.prod.outlook.com>

---
diff --git a/drivers/virtio-fs.hh b/drivers/virtio-fs.hh
--- a/drivers/virtio-fs.hh
+++ b/drivers/virtio-fs.hh
@@ -8,8 +8,11 @@
 #ifndef VIRTIO_FS_DRIVER_H
 #define VIRTIO_FS_DRIVER_H
 
+#include <functional>
+
 #include <osv/mutex.h>
 #include <osv/waitqueue.hh>
+
 #include "drivers/virtio.hh"
 #include "drivers/virtio-device.hh"
 #include "fs/virtiofs/fuse_kernel.h"
@@ -49,7 +52,13 @@ public:
     struct dax_window {
         mmioaddr_t addr;
         u64 len;
-        mutex lock;
+    };
+
+    // Helper enabling the use of fs* as key type in an unordered_* container.
+    struct hasher {
+        size_t operator()(fs* const _fs) const noexcept {
+            return std::hash<int>{}(_fs->_id);
+        }
     };
 
     explicit fs(virtio_device& dev);
diff --git a/fs/virtiofs/virtiofs.hh b/fs/virtiofs/virtiofs.hh
--- a/fs/virtiofs/virtiofs.hh
+++ b/fs/virtiofs/virtiofs.hh
@@ -8,11 +8,13 @@
 #ifndef __INCLUDE_VIRTIOFS_H__
 #define __INCLUDE_VIRTIOFS_H__
 
-#include <osv/vnode.h>
+#include <memory>
+
+#include <osv/debug.h>
 #include <osv/mount.h>
-#include <osv/dentry.h>
-#include <osv/prex.h>
-#include <osv/buf.h>
+#include <osv/vnode.h>
+
+#include "drivers/virtio-fs.hh"
 #include "fuse_kernel.h"
 
 #define VIRTIOFS_DEBUG_ENABLED 1
@@ -23,6 +25,17 @@
 #define virtiofs_debug(...)
 #endif
 
+// Necessary pre-declaration because virtiofs::dax depends on virtiofs_inode,
+// declared below.
+namespace virtiofs {
+class dax_manager;
+}
+
+struct virtiofs_mount_data {
+    virtio::fs* drv;
+    std::shared_ptr<virtiofs::dax_manager> dax_mgr;
+};
+
 struct virtiofs_inode {
     uint64_t nodeid;
     struct fuse_attr attr;
diff --git a/fs/virtiofs/virtiofs_i.hh b/fs/virtiofs/virtiofs_i.hh
--- a/fs/virtiofs/virtiofs_i.hh
+++ b/fs/virtiofs/virtiofs_i.hh
@@ -8,9 +8,6 @@
 #ifndef VIRTIOFS_IO_H
 #define VIRTIOFS_IO_H
 
-#include "fuse_kernel.h"
-#include <osv/mutex.h>
-#include <osv/waitqueue.hh>
 #include "drivers/virtio-fs.hh"
 
 int fuse_req_send_and_receive_reply(virtio::fs* drv, uint32_t opcode,
diff --git a/fs/virtiofs/virtiofs_vfsops.cc b/fs/virtiofs/virtiofs_vfsops.cc
--- a/fs/virtiofs/virtiofs_vfsops.cc
+++ b/fs/virtiofs/virtiofs_vfsops.cc
@@ -5,19 +5,32 @@
  * BSD license as described in the LICENSE file in the top-level directory.
  */
 
+#include <atomic>
+#include <memory>
+#include <mutex>
+#include <new>
 #include <sys/types.h>
-#include <osv/device.h>
+
+#include <api/assert.h>
 #include <osv/debug.h>
-#include <iomanip>
-#include <iostream>
+#include <osv/device.h>
+#include <osv/mutex.h>
+
+#include "drivers/virtio-fs.hh"
 #include "virtiofs.hh"
+#include "virtiofs_dax.hh"
 #include "virtiofs_i.hh"
-#include "drivers/virtio-fs.hh"
 
 using fuse_request = virtio::fs::fuse_request;
 
 static std::atomic<uint64_t> fuse_unique_id(1);
 
+static struct {
+    std::unordered_map<virtio::fs*, std::shared_ptr<virtiofs::dax_manager>,
+        virtio::fs::hasher> mgrs;
+    mutex lock;
+} dax_managers;
+
 int fuse_req_send_and_receive_reply(virtio::fs* drv, uint32_t opcode,
     uint64_t nodeid, void* input_args_data, size_t input_args_size,
     void* output_args_data, size_t output_args_size)
@@ -115,7 +128,28 @@ static int virtiofs_mount(struct mount* mp, const char* 
dev, int flags,
 
     virtiofs_set_vnode(mp->m_root->d_vnode, root_node);
 
-    mp->m_data = drv;
+    auto* m_data = new (std::nothrow) virtiofs_mount_data;
+    if (!m_data) {
+        return ENOMEM;
+    }
+    m_data->drv = drv;
+    if (drv->get_dax()) {
+        // The device supports the DAX window
+        std::lock_guard<mutex> guard {dax_managers.lock};
+        auto found = dax_managers.mgrs.find(drv);
+        if (found != dax_managers.mgrs.end()) {
+            // There is a dax_manager already associated with this device (the
+            // device is already mounted)
+            m_data->dax_mgr = found->second;
+        } else {
+            m_data->dax_mgr = std::make_shared<virtiofs::dax_manager>(*drv);
+            if (!m_data->dax_mgr) {
+                return ENOMEM;
+            }
+        }
+    }
+
+    mp->m_data = m_data;
     mp->m_dev = device;
 
     return 0;
@@ -141,6 +175,15 @@ static int virtiofs_statfs(struct mount* mp, struct 
statfs* statp)
 
 static int virtiofs_unmount(struct mount* mp, int flags)
 {
+    auto* m_data = static_cast<virtiofs_mount_data*>(mp->m_data);
+    std::lock_guard<mutex> guard {dax_managers.lock};
+    if (m_data->dax_mgr && m_data->dax_mgr.use_count() == 2) {
+        // This was the last mount of this device. It's safe to delete the
+        // window manager.
+        dax_managers.mgrs.erase(m_data->drv);
+    }
+    delete m_data;
+
     struct device* dev = mp->m_dev;
     return device_close(dev);
 }
diff --git a/fs/virtiofs/virtiofs_vnops.cc b/fs/virtiofs/virtiofs_vnops.cc
--- a/fs/virtiofs/virtiofs_vnops.cc
+++ b/fs/virtiofs/virtiofs_vnops.cc
@@ -5,28 +5,27 @@
  * BSD license as described in the LICENSE file in the top-level directory.
  */
 
-#include <sys/stat.h>
+#include <cstdlib>
+#include <cstring>
 #include <dirent.h>
-#include <sys/param.h>
-
 #include <errno.h>
-#include <string.h>
-#include <stdlib.h>
 #include <fcntl.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/types.h>
 
-#include <osv/prex.h>
-#include <osv/vnode.h>
-#include <osv/file.h>
-#include <osv/mount.h>
+#include <osv/contiguous_alloc.hh>
 #include <osv/debug.h>
-
-#include <sys/types.h>
 #include <osv/device.h>
-#include <osv/sched.hh>
+#include <osv/file.h>
 #include <osv/mmio.hh>
-#include <osv/contiguous_alloc.hh>
+#include <osv/mount.h>
+#include <osv/prex.h>
+#include <osv/sched.hh>
+#include <osv/vnode.h>
 
 #include "virtiofs.hh"
+#include "virtiofs_dax.hh"
 #include "virtiofs_i.hh"
 
 static constexpr uint32_t OPEN_FLAGS = O_RDONLY;
@@ -59,7 +58,8 @@ static int virtiofs_lookup(struct vnode* vnode, char* name, 
struct vnode** vpp)
     }
     strcpy(in_args.get(), name);
 
-    auto* drv = static_cast<virtio::fs*>(vnode->v_mount->m_data);
+    auto* m_data = static_cast<virtiofs_mount_data*>(vnode->v_mount->m_data);
+    auto* drv = m_data->drv;
     auto error = fuse_req_send_and_receive_reply(drv, FUSE_LOOKUP,
         inode->nodeid, in_args.get(), in_args_len, out_args.get(),
         sizeof(*out_args));
@@ -110,7 +110,8 @@ static int virtiofs_open(struct file* fp)
     }
     in_args->flags = OPEN_FLAGS;
 
-    auto* drv = static_cast<virtio::fs*>(vnode->v_mount->m_data);
+    auto* m_data = static_cast<virtiofs_mount_data*>(vnode->v_mount->m_data);
+    auto* drv = m_data->drv;
     auto error = fuse_req_send_and_receive_reply(drv, FUSE_OPEN,
         inode->nodeid, in_args.get(), sizeof(*in_args), out_args.get(),
         sizeof(*out_args));
@@ -145,7 +146,8 @@ static int virtiofs_close(struct vnode* vnode, struct file* 
fp)
     in_args->fh = f_data->file_handle;
     in_args->flags = OPEN_FLAGS; // need to be same as in FUSE_OPEN
 
-    auto* drv = static_cast<virtio::fs*>(vnode->v_mount->m_data);
+    auto* m_data = static_cast<virtiofs_mount_data*>(vnode->v_mount->m_data);
+    auto* drv = m_data->drv;
     auto error = fuse_req_send_and_receive_reply(drv, FUSE_RELEASE,
         inode->nodeid, in_args.get(), sizeof(*in_args), nullptr, 0);
     if (error) {
@@ -172,7 +174,8 @@ static int virtiofs_readlink(struct vnode* vnode, struct 
uio* uio)
         return ENOMEM;
     }
 
-    auto* drv = static_cast<virtio::fs*>(vnode->v_mount->m_data);
+    auto* m_data = static_cast<virtiofs_mount_data*>(vnode->v_mount->m_data);
+    auto* drv = m_data->drv;
     auto error = fuse_req_send_and_receive_reply(drv, FUSE_READLINK,
         inode->nodeid, nullptr, 0, link_path.get(), PATH_MAX);
     if (error) {
@@ -185,107 +188,6 @@ static int virtiofs_readlink(struct vnode* vnode, struct 
uio* uio)
     return uiomove(link_path.get(), strlen(link_path.get()), uio);
 }
 
-// Read @read_amt bytes from @inode, using the DAX window.
-static int virtiofs_read_direct(virtiofs_inode& inode, u64 file_handle,
-    u64 read_amt, virtio::fs& drv, struct uio& uio)
-{
-    auto* dax = drv.get_dax();
-    // Enter the critical path: setup mapping -> read -> remove mapping
-    std::lock_guard<mutex> guard {dax->lock};
-
-    // Setup mapping
-    // NOTE: There are restrictions on the arguments to FUSE_SETUPMAPPING, from
-    // the spec: "Alignment constraints for FUSE_SETUPMAPPING and
-    // FUSE_REMOVEMAPPING requests are communicated during FUSE_INIT
-    // negotiation"):
-    // - foffset: multiple of map_alignment from FUSE_INIT
-    // - len: not larger than remaining file?
-    // - moffset: multiple of map_alignment from FUSE_INIT
-    // In practice, map_alignment is the host's page size, because foffset and
-    // moffset are passed to mmap() on the host.
-    std::unique_ptr<fuse_setupmapping_in> in_args {
-        new (std::nothrow) fuse_setupmapping_in()};
-    if (!in_args) {
-        return ENOMEM;
-    }
-    in_args->fh = file_handle;
-    in_args->flags = 0;
-    uint64_t moffset = 0;
-    in_args->moffset = moffset;
-
-    auto map_align = drv.get_map_alignment();
-    if (map_align < 0) {
-        kprintf("[virtiofs] inode %lld, map alignment not set\n", 
inode.nodeid);
-        return ENOTSUP;
-    }
-    uint64_t alignment = 1ul << map_align;
-    auto foffset = align_down(static_cast<uint64_t>(uio.uio_offset), 
alignment);
-    in_args->foffset = foffset;
-
-    // The possible excess part of the file mapped due to alignment constraints
-    // NOTE: map_excess <= alignemnt
-    auto map_excess = uio.uio_offset - foffset;
-    if (moffset + map_excess >= dax->len) {
-        // No usable room in DAX window due to map_excess
-        return ENOBUFS;
-    }
-    // Actual read amount is read_amt, or what fits in the DAX window
-    auto read_amt_act = std::min<uint64_t>(read_amt,
-        dax->len - moffset - map_excess);
-    in_args->len = read_amt_act + map_excess;
-
-    virtiofs_debug("inode %lld, setting up mapping (foffset=%lld, len=%lld, "
-                   "moffset=%lld)\n", inode.nodeid, in_args->foffset,
-                   in_args->len, in_args->moffset);
-    auto error = fuse_req_send_and_receive_reply(&drv, FUSE_SETUPMAPPING,
-        inode.nodeid, in_args.get(), sizeof(*in_args), nullptr, 0);
-    if (error) {
-        kprintf("[virtiofs] inode %lld, mapping setup failed\n", inode.nodeid);
-        return error;
-    }
-
-    // Read from the DAX window
-    // NOTE: It shouldn't be necessary to use the mmio* interface (i.e. 
volatile
-    // accesses). From the spec: "Drivers map this shared memory region with
-    // writeback caching as if it were regular RAM."
-    // The location of the requested data in the DAX window
-    auto req_data = dax->addr + moffset + map_excess;
-    error = uiomove(const_cast<void*>(req_data), read_amt_act, &uio);
-    if (error) {
-        kprintf("[virtiofs] inode %lld, uiomove failed\n", inode.nodeid);
-        return error;
-    }
-
-    // Remove mapping
-    // NOTE: This is only necessary when FUSE_SETUPMAPPING fails. From the 
spec:
-    // "If the device runs out of resources the FUSE_SETUPMAPPING request fails
-    // until resources are available again following FUSE_REMOVEMAPPING."
-    auto r_in_args_size = sizeof(fuse_removemapping_in) +
-        sizeof(fuse_removemapping_one);
-    std::unique_ptr<u8> r_in_args {new (std::nothrow) u8[r_in_args_size]};
-    if (!r_in_args) {
-        return ENOMEM;
-    }
-    auto r_in = new (r_in_args.get()) fuse_removemapping_in();
-    auto r_one = new (r_in_args.get() + sizeof(fuse_removemapping_in))
-        fuse_removemapping_one();
-    r_in->count = 1;
-    r_one->moffset = in_args->moffset;
-    r_one->len = in_args->len;
-
-    virtiofs_debug("inode %lld, removing mapping (moffset=%lld, len=%lld)\n",
-        inode.nodeid, r_one->moffset, r_one->len);
-    error = fuse_req_send_and_receive_reply(&drv, FUSE_REMOVEMAPPING,
-        inode.nodeid, r_in_args.get(), r_in_args_size, nullptr, 0);
-    if (error) {
-        kprintf("[virtiofs] inode %lld, mapping removal failed\n",
-            inode.nodeid);
-        return error;
-    }
-
-    return 0;
-}
-
 // Read @read_amt bytes from @inode, using the fallback FUSE_READ mechanism.
 static int virtiofs_read_fallback(virtiofs_inode& inode, u64 file_handle,
     u32 read_amt, u32 flags, virtio::fs& drv, struct uio& uio)
@@ -314,9 +216,6 @@ static int virtiofs_read_fallback(virtiofs_inode& inode, 
u64 file_handle,
     return uiomove(buf.get(), read_amt, &uio);
 }
 
-// TODO: Optimize it to reduce number of exits to host (each
-// fuse_req_send_and_receive_reply()) by reading eagerly "ahead/around" just
-// like ROFS does and caching it
 static int virtiofs_read(struct vnode* vnode, struct file* fp, struct uio* uio,
     int ioflag)
 {
@@ -343,17 +242,17 @@ static int virtiofs_read(struct vnode* vnode, struct 
file* fp, struct uio* uio,
 
     auto* inode = static_cast<virtiofs_inode*>(vnode->v_data);
     auto* file_data = static_cast<virtiofs_file_data*>(fp->f_data);
-    auto* drv = static_cast<virtio::fs*>(vnode->v_mount->m_data);
+    auto* m_data = static_cast<virtiofs_mount_data*>(vnode->v_mount->m_data);
+    auto* drv = m_data->drv;
+    auto dax_mgr = m_data->dax_mgr;
 
     // Total read amount is what they requested, or what is left
     auto read_amt = std::min<uint64_t>(uio->uio_resid,
         inode->attr.size - uio->uio_offset);
 
-    if (drv->get_dax()) {
+    if (dax_mgr) {
         // Try to read from DAX
-        if (!virtiofs_read_direct(*inode, file_data->file_handle, read_amt,
-            *drv, *uio)) {
-
+        if (!dax_mgr->read(*inode, file_data->file_handle, read_amt, *uio)) {
             return 0;
         }
     }

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/osv-dev/0000000000008772bb05a9040214%40google.com.

Reply via email to