Hi,

Very impressive work!

I cannot really say I fully appreciate all details nor know this part of
the spec but your code looks good and makes sense.

How did you actually test it especially the edge conditions - window
getting full, etc?

Waldek


On Sun, Jun 21, 2020 at 5:09 AM Fotis Xenakis <[email protected]> wrote:

> This integrates the dax window manager with the virtio-fs file system
> operations:
>
> - Removes the lock from virtio::fs::dax_window, since this is now
>   handled by the window manager.
> - A single manager instance is maintained for each virtio-fs device
>   having at least a singe mount. Multiple, concurrent mounts of the same
>   device share a common dax_manager.
> - The FUSE_READ fallback as well as the logic of using it when the DAX
>   window is not available or a read using it fails are untouched.
>
> Signed-off-by: Fotis Xenakis <[email protected]>
> ---
>  drivers/virtio-fs.hh           |  11 ++-
>  fs/virtiofs/virtiofs.hh        |  21 ++++-
>  fs/virtiofs/virtiofs_i.hh      |   3 -
>  fs/virtiofs/virtiofs_vfsops.cc |  53 ++++++++++--
>  fs/virtiofs/virtiofs_vnops.cc  | 151 ++++++---------------------------
>  5 files changed, 100 insertions(+), 139 deletions(-)
>
> diff --git a/drivers/virtio-fs.hh b/drivers/virtio-fs.hh
> index d78d7962..88a16939 100644
> --- a/drivers/virtio-fs.hh
> +++ b/drivers/virtio-fs.hh
> @@ -8,8 +8,11 @@
>  #ifndef VIRTIO_FS_DRIVER_H
>  #define VIRTIO_FS_DRIVER_H
>
> +#include <functional>
> +
>  #include <osv/mutex.h>
>  #include <osv/waitqueue.hh>
> +
>  #include "drivers/virtio.hh"
>  #include "drivers/virtio-device.hh"
>  #include "fs/virtiofs/fuse_kernel.h"
> @@ -49,7 +52,13 @@ public:
>      struct dax_window {
>          mmioaddr_t addr;
>          u64 len;
> -        mutex lock;
> +    };
> +
> +    // Helper enabling the use of fs* as key type in an unordered_*
> container.
> +    struct hasher {
> +        size_t operator()(fs* const _fs) const noexcept {
> +            return std::hash<int>{}(_fs->_id);
> +        }
>      };
>
>      explicit fs(virtio_device& dev);
> diff --git a/fs/virtiofs/virtiofs.hh b/fs/virtiofs/virtiofs.hh
> index 475d5eba..8a3b06f2 100644
> --- a/fs/virtiofs/virtiofs.hh
> +++ b/fs/virtiofs/virtiofs.hh
> @@ -8,11 +8,13 @@
>  #ifndef __INCLUDE_VIRTIOFS_H__
>  #define __INCLUDE_VIRTIOFS_H__
>
> -#include <osv/vnode.h>
> +#include <memory>
> +
> +#include <osv/debug.h>
>  #include <osv/mount.h>
> -#include <osv/dentry.h>
> -#include <osv/prex.h>
> -#include <osv/buf.h>
> +#include <osv/vnode.h>
> +
> +#include "drivers/virtio-fs.hh"
>  #include "fuse_kernel.h"
>
>  #define VIRTIOFS_DEBUG_ENABLED 1
> @@ -23,6 +25,17 @@
>  #define virtiofs_debug(...)
>  #endif
>
> +// Necessary pre-declaration because virtiofs::dax depends on
> virtiofs_inode,
> +// declared below.
> +namespace virtiofs {
> +class dax_manager;
> +}
> +
> +struct virtiofs_mount_data {
> +    virtio::fs* drv;
> +    std::shared_ptr<virtiofs::dax_manager> dax_mgr;
> +};
> +
>  struct virtiofs_inode {
>      uint64_t nodeid;
>      struct fuse_attr attr;
> diff --git a/fs/virtiofs/virtiofs_i.hh b/fs/virtiofs/virtiofs_i.hh
> index 76533d74..e879546d 100644
> --- a/fs/virtiofs/virtiofs_i.hh
> +++ b/fs/virtiofs/virtiofs_i.hh
> @@ -8,9 +8,6 @@
>  #ifndef VIRTIOFS_IO_H
>  #define VIRTIOFS_IO_H
>
> -#include "fuse_kernel.h"
> -#include <osv/mutex.h>
> -#include <osv/waitqueue.hh>
>  #include "drivers/virtio-fs.hh"
>
>  int fuse_req_send_and_receive_reply(virtio::fs* drv, uint32_t opcode,
> diff --git a/fs/virtiofs/virtiofs_vfsops.cc
> b/fs/virtiofs/virtiofs_vfsops.cc
> index 30daa42b..b60a766b 100644
> --- a/fs/virtiofs/virtiofs_vfsops.cc
> +++ b/fs/virtiofs/virtiofs_vfsops.cc
> @@ -5,19 +5,32 @@
>   * BSD license as described in the LICENSE file in the top-level
> directory.
>   */
>
> +#include <atomic>
> +#include <memory>
> +#include <mutex>
> +#include <new>
>  #include <sys/types.h>
> -#include <osv/device.h>
> +
> +#include <api/assert.h>
>  #include <osv/debug.h>
> -#include <iomanip>
> -#include <iostream>
> +#include <osv/device.h>
> +#include <osv/mutex.h>
> +
> +#include "drivers/virtio-fs.hh"
>  #include "virtiofs.hh"
> +#include "virtiofs_dax.hh"
>  #include "virtiofs_i.hh"
> -#include "drivers/virtio-fs.hh"
>
>  using fuse_request = virtio::fs::fuse_request;
>
>  static std::atomic<uint64_t> fuse_unique_id(1);
>
> +static struct {
> +    std::unordered_map<virtio::fs*,
> std::shared_ptr<virtiofs::dax_manager>,
> +        virtio::fs::hasher> mgrs;
> +    mutex lock;
> +} dax_managers;
> +
>  int fuse_req_send_and_receive_reply(virtio::fs* drv, uint32_t opcode,
>      uint64_t nodeid, void* input_args_data, size_t input_args_size,
>      void* output_args_data, size_t output_args_size)
> @@ -115,7 +128,28 @@ static int virtiofs_mount(struct mount* mp, const
> char* dev, int flags,
>
>      virtiofs_set_vnode(mp->m_root->d_vnode, root_node);
>
> -    mp->m_data = drv;
> +    auto* m_data = new (std::nothrow) virtiofs_mount_data;
> +    if (!m_data) {
> +        return ENOMEM;
> +    }
> +    m_data->drv = drv;
> +    if (drv->get_dax()) {
> +        // The device supports the DAX window
> +        std::lock_guard<mutex> guard {dax_managers.lock};
> +        auto found = dax_managers.mgrs.find(drv);
> +        if (found != dax_managers.mgrs.end()) {
> +            // There is a dax_manager already associated with this device
> (the
> +            // device is already mounted)
> +            m_data->dax_mgr = found->second;
> +        } else {
> +            m_data->dax_mgr =
> std::make_shared<virtiofs::dax_manager>(*drv);
> +            if (!m_data->dax_mgr) {
> +                return ENOMEM;
> +            }
> +        }
> +    }
> +
> +    mp->m_data = m_data;
>      mp->m_dev = device;
>
>      return 0;
> @@ -141,6 +175,15 @@ static int virtiofs_statfs(struct mount* mp, struct
> statfs* statp)
>
>  static int virtiofs_unmount(struct mount* mp, int flags)
>  {
> +    auto* m_data = static_cast<virtiofs_mount_data*>(mp->m_data);
> +    std::lock_guard<mutex> guard {dax_managers.lock};
> +    if (m_data->dax_mgr && m_data->dax_mgr.use_count() == 2) {
> +        // This was the last mount of this device. It's safe to delete the
> +        // window manager.
> +        dax_managers.mgrs.erase(m_data->drv);
> +    }
> +    delete m_data;
> +
>      struct device* dev = mp->m_dev;
>      return device_close(dev);
>  }
> diff --git a/fs/virtiofs/virtiofs_vnops.cc b/fs/virtiofs/virtiofs_vnops.cc
> index 38c000ec..51be3ee4 100644
> --- a/fs/virtiofs/virtiofs_vnops.cc
> +++ b/fs/virtiofs/virtiofs_vnops.cc
> @@ -5,28 +5,27 @@
>   * BSD license as described in the LICENSE file in the top-level
> directory.
>   */
>
> -#include <sys/stat.h>
> +#include <cstdlib>
> +#include <cstring>
>  #include <dirent.h>
> -#include <sys/param.h>
> -
>  #include <errno.h>
> -#include <string.h>
> -#include <stdlib.h>
>  #include <fcntl.h>
> +#include <sys/param.h>
> +#include <sys/stat.h>
> +#include <sys/types.h>
>
> -#include <osv/prex.h>
> -#include <osv/vnode.h>
> -#include <osv/file.h>
> -#include <osv/mount.h>
> +#include <osv/contiguous_alloc.hh>
>  #include <osv/debug.h>
> -
> -#include <sys/types.h>
>  #include <osv/device.h>
> -#include <osv/sched.hh>
> +#include <osv/file.h>
>  #include <osv/mmio.hh>
> -#include <osv/contiguous_alloc.hh>
> +#include <osv/mount.h>
> +#include <osv/prex.h>
> +#include <osv/sched.hh>
> +#include <osv/vnode.h>
>
>  #include "virtiofs.hh"
> +#include "virtiofs_dax.hh"
>  #include "virtiofs_i.hh"
>
>  static constexpr uint32_t OPEN_FLAGS = O_RDONLY;
> @@ -59,7 +58,8 @@ static int virtiofs_lookup(struct vnode* vnode, char*
> name, struct vnode** vpp)
>      }
>      strcpy(in_args.get(), name);
>
> -    auto* drv = static_cast<virtio::fs*>(vnode->v_mount->m_data);
> +    auto* m_data =
> static_cast<virtiofs_mount_data*>(vnode->v_mount->m_data);
> +    auto* drv = m_data->drv;
>      auto error = fuse_req_send_and_receive_reply(drv, FUSE_LOOKUP,
>          inode->nodeid, in_args.get(), in_args_len, out_args.get(),
>          sizeof(*out_args));
> @@ -110,7 +110,8 @@ static int virtiofs_open(struct file* fp)
>      }
>      in_args->flags = OPEN_FLAGS;
>
> -    auto* drv = static_cast<virtio::fs*>(vnode->v_mount->m_data);
> +    auto* m_data =
> static_cast<virtiofs_mount_data*>(vnode->v_mount->m_data);
> +    auto* drv = m_data->drv;
>      auto error = fuse_req_send_and_receive_reply(drv, FUSE_OPEN,
>          inode->nodeid, in_args.get(), sizeof(*in_args), out_args.get(),
>          sizeof(*out_args));
> @@ -145,7 +146,8 @@ static int virtiofs_close(struct vnode* vnode, struct
> file* fp)
>      in_args->fh = f_data->file_handle;
>      in_args->flags = OPEN_FLAGS; // need to be same as in FUSE_OPEN
>
> -    auto* drv = static_cast<virtio::fs*>(vnode->v_mount->m_data);
> +    auto* m_data =
> static_cast<virtiofs_mount_data*>(vnode->v_mount->m_data);
> +    auto* drv = m_data->drv;
>      auto error = fuse_req_send_and_receive_reply(drv, FUSE_RELEASE,
>          inode->nodeid, in_args.get(), sizeof(*in_args), nullptr, 0);
>      if (error) {
> @@ -172,7 +174,8 @@ static int virtiofs_readlink(struct vnode* vnode,
> struct uio* uio)
>          return ENOMEM;
>      }
>
> -    auto* drv = static_cast<virtio::fs*>(vnode->v_mount->m_data);
> +    auto* m_data =
> static_cast<virtiofs_mount_data*>(vnode->v_mount->m_data);
> +    auto* drv = m_data->drv;
>      auto error = fuse_req_send_and_receive_reply(drv, FUSE_READLINK,
>          inode->nodeid, nullptr, 0, link_path.get(), PATH_MAX);
>      if (error) {
> @@ -185,107 +188,6 @@ static int virtiofs_readlink(struct vnode* vnode,
> struct uio* uio)
>      return uiomove(link_path.get(), strlen(link_path.get()), uio);
>  }
>
> -// Read @read_amt bytes from @inode, using the DAX window.
> -static int virtiofs_read_direct(virtiofs_inode& inode, u64 file_handle,
> -    u64 read_amt, virtio::fs& drv, struct uio& uio)
> -{
> -    auto* dax = drv.get_dax();
> -    // Enter the critical path: setup mapping -> read -> remove mapping
> -    std::lock_guard<mutex> guard {dax->lock};
> -
> -    // Setup mapping
> -    // NOTE: There are restrictions on the arguments to
> FUSE_SETUPMAPPING, from
> -    // the spec: "Alignment constraints for FUSE_SETUPMAPPING and
> -    // FUSE_REMOVEMAPPING requests are communicated during FUSE_INIT
> -    // negotiation"):
> -    // - foffset: multiple of map_alignment from FUSE_INIT
> -    // - len: not larger than remaining file?
> -    // - moffset: multiple of map_alignment from FUSE_INIT
> -    // In practice, map_alignment is the host's page size, because
> foffset and
> -    // moffset are passed to mmap() on the host.
> -    std::unique_ptr<fuse_setupmapping_in> in_args {
> -        new (std::nothrow) fuse_setupmapping_in()};
> -    if (!in_args) {
> -        return ENOMEM;
> -    }
> -    in_args->fh = file_handle;
> -    in_args->flags = 0;
> -    uint64_t moffset = 0;
> -    in_args->moffset = moffset;
> -
> -    auto map_align = drv.get_map_alignment();
> -    if (map_align < 0) {
> -        kprintf("[virtiofs] inode %lld, map alignment not set\n",
> inode.nodeid);
> -        return ENOTSUP;
> -    }
> -    uint64_t alignment = 1ul << map_align;
> -    auto foffset = align_down(static_cast<uint64_t>(uio.uio_offset),
> alignment);
> -    in_args->foffset = foffset;
> -
> -    // The possible excess part of the file mapped due to alignment
> constraints
> -    // NOTE: map_excess <= alignemnt
> -    auto map_excess = uio.uio_offset - foffset;
> -    if (moffset + map_excess >= dax->len) {
> -        // No usable room in DAX window due to map_excess
> -        return ENOBUFS;
> -    }
> -    // Actual read amount is read_amt, or what fits in the DAX window
> -    auto read_amt_act = std::min<uint64_t>(read_amt,
> -        dax->len - moffset - map_excess);
> -    in_args->len = read_amt_act + map_excess;
> -
> -    virtiofs_debug("inode %lld, setting up mapping (foffset=%lld,
> len=%lld, "
> -                   "moffset=%lld)\n", inode.nodeid, in_args->foffset,
> -                   in_args->len, in_args->moffset);
> -    auto error = fuse_req_send_and_receive_reply(&drv, FUSE_SETUPMAPPING,
> -        inode.nodeid, in_args.get(), sizeof(*in_args), nullptr, 0);
> -    if (error) {
> -        kprintf("[virtiofs] inode %lld, mapping setup failed\n",
> inode.nodeid);
> -        return error;
> -    }
> -
> -    // Read from the DAX window
> -    // NOTE: It shouldn't be necessary to use the mmio* interface (i.e.
> volatile
> -    // accesses). From the spec: "Drivers map this shared memory region
> with
> -    // writeback caching as if it were regular RAM."
> -    // The location of the requested data in the DAX window
> -    auto req_data = dax->addr + moffset + map_excess;
> -    error = uiomove(const_cast<void*>(req_data), read_amt_act, &uio);
> -    if (error) {
> -        kprintf("[virtiofs] inode %lld, uiomove failed\n", inode.nodeid);
> -        return error;
> -    }
> -
> -    // Remove mapping
> -    // NOTE: This is only necessary when FUSE_SETUPMAPPING fails. From
> the spec:
> -    // "If the device runs out of resources the FUSE_SETUPMAPPING request
> fails
> -    // until resources are available again following FUSE_REMOVEMAPPING."
> -    auto r_in_args_size = sizeof(fuse_removemapping_in) +
> -        sizeof(fuse_removemapping_one);
> -    std::unique_ptr<u8> r_in_args {new (std::nothrow) u8[r_in_args_size]};
> -    if (!r_in_args) {
> -        return ENOMEM;
> -    }
> -    auto r_in = new (r_in_args.get()) fuse_removemapping_in();
> -    auto r_one = new (r_in_args.get() + sizeof(fuse_removemapping_in))
> -        fuse_removemapping_one();
> -    r_in->count = 1;
> -    r_one->moffset = in_args->moffset;
> -    r_one->len = in_args->len;
> -
> -    virtiofs_debug("inode %lld, removing mapping (moffset=%lld,
> len=%lld)\n",
> -        inode.nodeid, r_one->moffset, r_one->len);
> -    error = fuse_req_send_and_receive_reply(&drv, FUSE_REMOVEMAPPING,
> -        inode.nodeid, r_in_args.get(), r_in_args_size, nullptr, 0);
> -    if (error) {
> -        kprintf("[virtiofs] inode %lld, mapping removal failed\n",
> -            inode.nodeid);
> -        return error;
> -    }
> -
> -    return 0;
> -}
> -
>  // Read @read_amt bytes from @inode, using the fallback FUSE_READ
> mechanism.
>  static int virtiofs_read_fallback(virtiofs_inode& inode, u64 file_handle,
>      u32 read_amt, u32 flags, virtio::fs& drv, struct uio& uio)
> @@ -314,9 +216,6 @@ static int virtiofs_read_fallback(virtiofs_inode&
> inode, u64 file_handle,
>      return uiomove(buf.get(), read_amt, &uio);
>  }
>
> -// TODO: Optimize it to reduce number of exits to host (each
> -// fuse_req_send_and_receive_reply()) by reading eagerly "ahead/around"
> just
> -// like ROFS does and caching it
>  static int virtiofs_read(struct vnode* vnode, struct file* fp, struct
> uio* uio,
>      int ioflag)
>  {
> @@ -343,17 +242,17 @@ static int virtiofs_read(struct vnode* vnode, struct
> file* fp, struct uio* uio,
>
>      auto* inode = static_cast<virtiofs_inode*>(vnode->v_data);
>      auto* file_data = static_cast<virtiofs_file_data*>(fp->f_data);
> -    auto* drv = static_cast<virtio::fs*>(vnode->v_mount->m_data);
> +    auto* m_data =
> static_cast<virtiofs_mount_data*>(vnode->v_mount->m_data);
> +    auto* drv = m_data->drv;
> +    auto dax_mgr = m_data->dax_mgr;
>
>      // Total read amount is what they requested, or what is left
>      auto read_amt = std::min<uint64_t>(uio->uio_resid,
>          inode->attr.size - uio->uio_offset);
>
> -    if (drv->get_dax()) {
> +    if (dax_mgr) {
>          // Try to read from DAX
> -        if (!virtiofs_read_direct(*inode, file_data->file_handle,
> read_amt,
> -            *drv, *uio)) {
> -
> +        if (!dax_mgr->read(*inode, file_data->file_handle, read_amt,
> *uio)) {
>              return 0;
>          }
>      }
> --
> 2.27.0
>
> --
> You received this message because you are subscribed to the Google Groups
> "OSv Development" group.
> To unsubscribe from this group and stop receiving emails from it, send an
> email to [email protected].
> To view this discussion on the web visit
> https://groups.google.com/d/msgid/osv-dev/AM0PR03MB629260F9F8CC3272E9CF435AA6960%40AM0PR03MB6292.eurprd03.prod.outlook.com
> .
>

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/osv-dev/CAL9cFfO%3DcrNf_Rg72Zru8vviDzZG%3DRaVc-BsBfKP9v-Rz6mDUQ%40mail.gmail.com.

Reply via email to