When the DAX window is available from the device, the filesystem prefers
to use it instead of the regular FUSE_READ request. If that fails,
FUSE_READ is used as a fallback.

To use the DAX window, a part of the file is mapped to it with
FUSE_SETUPMAPPING, the contents are copied from it to the user buffers
and the mapping is cleaned up with FUSE_REMOVEMAPPING. In this naive
implementation, the window is used for a single mapping at a time, with
no caching or readahead.

Signed-off-by: Fotis Xenakis <fo...@windowslive.com>
---
 fs/virtiofs/virtiofs_vnops.cc | 167 +++++++++++++++++++++++++++++-----
 1 file changed, 144 insertions(+), 23 deletions(-)

diff --git a/fs/virtiofs/virtiofs_vnops.cc b/fs/virtiofs/virtiofs_vnops.cc
index 7fbb2cd2..9551ff07 100644
--- a/fs/virtiofs/virtiofs_vnops.cc
+++ b/fs/virtiofs/virtiofs_vnops.cc
@@ -23,9 +23,11 @@
 #include <sys/types.h>
 #include <osv/device.h>
 #include <osv/sched.hh>
+#include <osv/mmio.hh>
 
 #include "virtiofs.hh"
 #include "virtiofs_i.hh"
+#include "drivers/virtio-fs.hh"
 
 static constexpr uint32_t OPEN_FLAGS = O_RDONLY;
 
@@ -183,14 +185,139 @@ static int virtiofs_readlink(struct vnode* vnode, struct uio* uio)
     return uiomove(link_path.get(), strlen(link_path.get()), uio);
 }
 
+// Read @read_amt bytes from @inode, using the DAX window.
+static int virtiofs_read_direct(virtiofs_inode& inode, u64 file_handle,
+    u64 read_amt, fuse_strategy& strategy, struct uio& uio)
+{
+    auto* drv = static_cast<virtio::fs*>(strategy.drv);
+    auto* dax = drv->get_dax();
+    // Enter the critical path: setup mapping -> read -> remove mapping
+    std::lock_guard<mutex> guard {dax->lock};
+
+    // Setup mapping
+    // NOTE: There are restrictions on the arguments to FUSE_SETUPMAPPING (in
+    // the future will be negotiated with FUSE_INIT, from the spec: "Alignment
+    // constraints for FUSE_SETUPMAPPING and FUSE_REMOVEMAPPING requests are
+    // communicated during FUSE_INIT negotiation"):
+    // - foffset: multiple of host's page size (passed to host mmap())
+    // - len: not larger than remaining file?
+    // - moffset: multiple of host's page size (passed to host mmap())
+    std::unique_ptr<fuse_setupmapping_in> in_args {
+        new (std::nothrow) fuse_setupmapping_in()};
+    if (!in_args) {
+        return ENOMEM;
+    }
+    in_args->fh = file_handle;
+    in_args->flags = 0;
+    uint64_t moffset = 0;
+    in_args->moffset = moffset;
+
+    // TODO: When implemented in virtiofsd, get alignment from FUSE_INIT
+    uint64_t alignment = 1 << 12;
+    auto foffset = align_down(static_cast<uint64_t>(uio.uio_offset), alignment);
+    in_args->foffset = foffset;
+
+    // The possible excess part of the file mapped due to alignment constraints
+    // NOTE: map_excess < alignment
+    auto map_excess = uio.uio_offset - foffset;
+    if (moffset + map_excess >= dax->len) {
+        // No usable room in DAX window due to map_excess
+        return ENOBUFS;
+    }
+    // Actual read amount is read_amt, or what fits in the DAX window
+    auto read_amt_act = std::min<uint64_t>(read_amt,
+        dax->len - moffset - map_excess);
+    in_args->len = read_amt_act + map_excess;
+
+    // NOTE: This is not used, and seems like it will go away in the future (it
+    // is absent in the development branches of virtiofsd).
+    std::unique_ptr<fuse_setupmapping_out> out_args {
+        new (std::nothrow) fuse_setupmapping_out};
+    if (!out_args) {
+        return ENOMEM;
+    }
+
+    virtiofs_debug("inode %lld, setting up mapping (foffset=%lld, len=%lld, "
+                   "moffset=%lld)\n", inode.nodeid, in_args->foffset,
+                   in_args->len, in_args->moffset);
+    auto error = fuse_req_send_and_receive_reply(&strategy, FUSE_SETUPMAPPING,
+        inode.nodeid, in_args.get(), sizeof(*in_args), out_args.get(),
+        sizeof(*out_args));
+    if (error) {
+        kprintf("[virtiofs] inode %lld, mapping setup failed\n", inode.nodeid);
+        return error;
+    }
+
+    // Read from the DAX window
+    // NOTE: It shouldn't be necessary to use the mmio* interface (i.e. volatile
+    // accesses). From the spec: "Drivers map this shared memory region with
+    // writeback caching as if it were regular RAM."
+    // The location of the requested data in the DAX window
+    auto req_data = dax->addr + moffset + map_excess;
+    error = uiomove(const_cast<void*>(req_data), read_amt_act, &uio);
+    if (error) {
+        kprintf("[virtiofs] inode %lld, uiomove failed\n", inode.nodeid);
+        return error;
+    }
+
+    // Remove mapping
+    // NOTE: This is only necessary when FUSE_SETUPMAPPING fails. From the spec:
+    // "If the device runs out of resources the FUSE_SETUPMAPPING request fails
+    // until resources are available again following FUSE_REMOVEMAPPING."
+    std::unique_ptr<fuse_removemapping_in> iargs {
+        new (std::nothrow) fuse_removemapping_in()};
+    if (!iargs) {
+        return ENOMEM;
+    }
+    iargs->fh = in_args->fh;
+    iargs->moffset = in_args->moffset;
+    iargs->len = in_args->len;
+
+    virtiofs_debug("inode %lld, removing mapping (moffset=%lld, len=%lld)\n",
+        inode.nodeid, iargs->moffset, iargs->len);
+    error = fuse_req_send_and_receive_reply(&strategy, FUSE_REMOVEMAPPING,
+        inode.nodeid, iargs.get(), sizeof(*iargs), nullptr, 0);
+    if (error) {
+        kprintf("[virtiofs] inode %lld, mapping removal failed\n",
+            inode.nodeid);
+        return error;
+    }
+
+    return 0;
+}
+
+// Read @read_amt bytes from @inode, using the fallback FUSE_READ mechanism.
+static int virtiofs_read_fallback(virtiofs_inode& inode, u64 file_handle,
+    u32 read_amt, u32 flags, fuse_strategy& strategy, struct uio& uio)
+{
+    std::unique_ptr<fuse_read_in> in_args {new (std::nothrow) fuse_read_in()};
+    std::unique_ptr<u8[]> buf {new (std::nothrow) u8[read_amt]};
+    if (!in_args | !buf) {
+        return ENOMEM;
+    }
+    in_args->fh = file_handle;
+    in_args->offset = uio.uio_offset;
+    in_args->size = read_amt;
+    in_args->flags = flags;
+
+    virtiofs_debug("inode %lld, reading %lld bytes at offset %lld\n",
+        inode.nodeid, read_amt, uio.uio_offset);
+    auto error = fuse_req_send_and_receive_reply(&strategy, FUSE_READ,
+        inode.nodeid, in_args.get(), sizeof(*in_args), buf.get(), read_amt);
+    if (error) {
+        kprintf("[virtiofs] inode %lld, read failed\n", inode.nodeid);
+        return error;
+    }
+
+    return uiomove(buf.get(), read_amt, &uio);
+}
+
 // TODO: Optimize it to reduce number of exits to host (each
 // fuse_req_send_and_receive_reply()) by reading eagerly "ahead/around" just
 // like ROFS does and caching it
 static int virtiofs_read(struct vnode* vnode, struct file* fp, struct uio* uio,
     int ioflag)
 {
-    auto* inode = static_cast<virtiofs_inode*>(vnode->v_data);
-
     // Can't read directories
     if (vnode->v_type == VDIR) {
         return EISDIR;
@@ -212,32 +339,26 @@ static int virtiofs_read(struct vnode* vnode, struct file* fp, struct uio* uio,
         return 0;
     }
 
+    auto* inode = static_cast<virtiofs_inode*>(vnode->v_data);
+    auto* file_data = static_cast<virtiofs_file_data*>(fp->f_data);
+    auto* strategy = static_cast<fuse_strategy*>(vnode->v_mount->m_data);
+
     // Total read amount is what they requested, or what is left
     auto read_amt = std::min<uint64_t>(uio->uio_resid,
         inode->attr.size - uio->uio_offset);
-    std::unique_ptr<u8[]> buf {new (std::nothrow) u8[read_amt]};
-    std::unique_ptr<fuse_read_in> in_args {new (std::nothrow) fuse_read_in()};
-    if (!buf || !in_args) {
-        return ENOMEM;
-    }
-    auto* f_data = static_cast<virtiofs_file_data*>(file_data(fp));
-    in_args->fh = f_data->file_handle;
-    in_args->offset = uio->uio_offset;
-    in_args->size = read_amt;
-    in_args->flags = ioflag;
 
-    virtiofs_debug("inode %lld, reading %lld bytes at offset %lld\n",
-        inode->nodeid, read_amt, uio->uio_offset);
+    auto* drv = static_cast<virtio::fs*>(strategy->drv);
+    if (drv->get_dax()) {
+        // Try to read from DAX
+        if (!virtiofs_read_direct(*inode, file_data->file_handle, read_amt,
+            *strategy, *uio)) {
 
-    auto* strategy = static_cast<fuse_strategy*>(vnode->v_mount->m_data);
-    auto error = fuse_req_send_and_receive_reply(strategy, FUSE_READ,
-        inode->nodeid, in_args.get(), sizeof(*in_args), buf.get(), read_amt);
-    if (error) {
-        kprintf("[virtiofs] inode %lld, read failed\n", inode->nodeid);
-        return error;
+            return 0;
+        }
     }
-
-    return uiomove(buf.get(), read_amt, uio);
+    // DAX unavailable or failed, use fallback
+    return virtiofs_read_fallback(*inode, file_data->file_handle, read_amt,
+        ioflag, *strategy, *uio);
 }
 
 static int virtiofs_readdir(struct vnode* vnode, struct file* fp,
@@ -307,7 +428,7 @@ struct vnops virtiofs_vnops = {
     virtiofs_truncate,  /* truncate - returns error when called */
     virtiofs_link,      /* link - returns error when called */
     virtiofs_arc,       /* arc */ //TODO: Implement to allow memory re-use when
-                        // mapping files, investigate using virtio-fs DAX
+                        // mapping files
     virtiofs_fallocate, /* fallocate - returns error when called */
     virtiofs_readlink,  /* read link */
     virtiofs_symlink    /* symbolic link - returns error when called */
-- 
2.26.1

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to osv-dev+unsubscr...@googlegroups.com.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/osv-dev/VI1PR03MB43838CCFD70E77BF6D143083A6D40%40VI1PR03MB4383.eurprd03.prod.outlook.com.

Reply via email to