For details on the manager's policy, please see
fs/virtiofs/virtiofs_dax.hh.

Signed-off-by: Fotis Xenakis <[email protected]>
---
 Makefile                    |  39 +++---
 fs/virtiofs/virtiofs_dax.cc | 268 ++++++++++++++++++++++++++++++++++++
 fs/virtiofs/virtiofs_dax.hh | 109 +++++++++++++++
 3 files changed, 397 insertions(+), 19 deletions(-)
 create mode 100644 fs/virtiofs/virtiofs_dax.cc
 create mode 100644 fs/virtiofs/virtiofs_dax.hh

diff --git a/Makefile b/Makefile
index 20ddf3b1..12366794 100644
--- a/Makefile
+++ b/Makefile
@@ -536,23 +536,23 @@ bsd += bsd/porting/mmu.o
 bsd += bsd/porting/pcpu.o
 bsd += bsd/porting/bus_dma.o
 bsd += bsd/porting/kobj.o
-bsd += bsd/sys/netinet/if_ether.o
-bsd += bsd/sys/compat/linux/linux_socket.o
-bsd += bsd/sys/compat/linux/linux_ioctl.o
-bsd += bsd/sys/net/if_ethersubr.o
-bsd += bsd/sys/net/if_llatbl.o
-bsd += bsd/sys/net/radix.o
-bsd += bsd/sys/net/route.o
-bsd += bsd/sys/net/raw_cb.o
-bsd += bsd/sys/net/raw_usrreq.o
-bsd += bsd/sys/net/rtsock.o
-bsd += bsd/sys/net/netisr.o
-bsd += bsd/sys/net/netisr1.o
-bsd += bsd/sys/net/if_dead.o
-bsd += bsd/sys/net/if_clone.o
-bsd += bsd/sys/net/if_loop.o
-bsd += bsd/sys/net/if.o
-bsd += bsd/sys/net/pfil.o
+bsd += bsd/sys/netinet/if_ether.o
+bsd += bsd/sys/compat/linux/linux_socket.o
+bsd += bsd/sys/compat/linux/linux_ioctl.o
+bsd += bsd/sys/net/if_ethersubr.o
+bsd += bsd/sys/net/if_llatbl.o
+bsd += bsd/sys/net/radix.o
+bsd += bsd/sys/net/route.o
+bsd += bsd/sys/net/raw_cb.o
+bsd += bsd/sys/net/raw_usrreq.o
+bsd += bsd/sys/net/rtsock.o
+bsd += bsd/sys/net/netisr.o
+bsd += bsd/sys/net/netisr1.o
+bsd += bsd/sys/net/if_dead.o
+bsd += bsd/sys/net/if_clone.o
+bsd += bsd/sys/net/if_loop.o
+bsd += bsd/sys/net/if.o
+bsd += bsd/sys/net/pfil.o
 bsd += bsd/sys/net/routecache.o
 bsd += bsd/sys/netinet/in.o
 bsd += bsd/sys/netinet/in_pcb.o
@@ -1769,7 +1769,8 @@ fs_objs += rofs/rofs_vfsops.o \
        rofs/rofs_common.o

 fs_objs += virtiofs/virtiofs_vfsops.o \
-       virtiofs/virtiofs_vnops.o
+       virtiofs/virtiofs_vnops.o \
+       virtiofs/virtiofs_dax.o

 fs_objs += pseudofs/pseudofs.o
 fs_objs += procfs/procfs_vnops.o
@@ -1976,7 +1977,7 @@ libuutil-objects = $(foreach file, $(libuutil-file-list), $(out)/bsd/cddl/contri

 define libuutil-includes
   bsd/cddl/contrib/opensolaris/lib/libuutil/common
-  bsd/cddl/compat/opensolaris/include
+  bsd/cddl/compat/opensolaris/include
   bsd/sys/cddl/contrib/opensolaris/uts/common
   bsd/sys/cddl/compat/opensolaris
   bsd/cddl/contrib/opensolaris/head
diff --git a/fs/virtiofs/virtiofs_dax.cc b/fs/virtiofs/virtiofs_dax.cc
new file mode 100644
index 00000000..8e612eb5
--- /dev/null
+++ b/fs/virtiofs/virtiofs_dax.cc
@@ -0,0 +1,268 @@
+/*
+ * Copyright (C) 2020 Fotis Xenakis
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#include <algorithm>
+#include <mutex>
+
+#include <osv/debug.h>
+#include <osv/uio.h>
+
+#include "fuse_kernel.h"
+#include "virtiofs.hh"
+#include "virtiofs_dax.hh"
+#include "virtiofs_i.hh"
+
+namespace virtiofs {
+
+int dax_manager::read(virtiofs_inode& inode, uint64_t file_handle, u64 read_amt,
+    struct uio& uio, bool aggressive)
+{
+    std::lock_guard<mutex> guard {_lock};
+
+    // Split the file offset into a chunk index and an offset within that chunk
+    const chunk fstart = uio.uio_offset / _chunk_size;
+    const off_t coffset = uio.uio_offset % _chunk_size;
+
+    mapping_part mp;
+    const bool hit = find(inode.nodeid, fstart, mp);
+    if (!hit) {
+        // Nothing is mapped at fstart yet. Mappings start on a chunk boundary,
+        // so the coffset bytes preceding the requested data are mapped as well.
+        size_t to_map = coffset; // bytes to map
+        if (aggressive) {
+            // Map the rest of the file
+            // NOTE(review): assumes uio.uio_offset <= inode.attr.size, otherwise
+            // to_map ends up bogus -- confirm that callers guarantee this.
+            to_map += inode.attr.size - uio.uio_offset;
+        } else {
+            // Map just enough chunks to satisfy read_amt
+            to_map += read_amt;
+        }
+        chunk nchunks = to_map / _chunk_size;
+        if (to_map % _chunk_size > 0) {
+            nchunks++; // a partially used chunk still has to be mapped
+        }
+        // NOTE: This relies on the fact that requesting a mapping longer than
+        // the remaining file works (see mmap() on the host). If that didn't
+        // work, we would have to request exact mappings (byte-granularity,
+        // rather than chunk-granularity).
+        int error = map(inode.nodeid, file_handle, nchunks, fstart, mp, true);
+        if (error) {
+            return error;
+        }
+    }
+
+    // mp now describes the mapped chunks starting at fstart, which may be fewer
+    // than requested: serve as much of the read as they cover (short read).
+    auto req_data = _window->addr + (mp.mstart * _chunk_size) + coffset;
+    auto read_amt_act = std::min<size_t>(read_amt,
+        (mp.nchunks * _chunk_size) - coffset);
+    if (hit) {
+        virtiofs_debug("inode %lld, found in DAX (foffset=%lld, len=%lld, "
+            "moffset=%lld)\n", inode.nodeid, uio.uio_offset, read_amt_act,
+            (mp.mstart * _chunk_size) + coffset);
+    }
+    // NOTE: It shouldn't be necessary to use the mmio* interface (i.e. volatile
+    // accesses). From the spec: "Drivers map this shared memory region with
+    // writeback caching as if it were regular RAM."
+    int error = uiomove(const_cast<void*>(req_data), read_amt_act, &uio);
+    if (error) {
+        kprintf("[virtiofs] inode %lld, uiomove failed\n", inode.nodeid);
+    }
+    return error;
+}
+
+int dax_manager::map(uint64_t nodeid, uint64_t file_handle, chunk nchunks,
+    chunk fstart, mapping_part& mapped, bool evict)
+{
+    // At most the whole window can be mapped. Clamp the request so that the
+    // eviction below never asks unmap() for more chunks than are occupied: on
+    // an empty window that fails with ENODATA instead of mapping what fits.
+    const auto wanted = std::min<chunk>(nchunks, _window_chunks);
+    // If necessary, unmap just enough chunks (bookkeeping only, deep is false)
+    auto empty = _window_chunks - first_empty();
+    if (evict && empty < wanted) {
+        mapping_part mp;
+        auto error = unmap(wanted - empty, mp, false);
+        if (error) {
+            return error;
+        }
+        empty += mp.nchunks;
+    }
+    auto to_map = std::min<chunk>(wanted, empty);
+    auto mstart = _window_chunks - empty; // first empty chunk in the window
+    if (to_map == 0) {
+        // Nothing to map: nchunks is 0, or no chunk is free and evict is false
+        mapped.mstart = mstart;
+        mapped.nchunks = 0;
+        return (nchunks == 0) ? 0 : ENOBUFS;
+    }
+
+    // Map new chunks
+    auto error = map_ll(nodeid, file_handle, to_map, fstart, mstart);
+    if (error) {
+        return error;
+    }
+    if (!_mappings.empty() && _mappings.back().nodeid == nodeid &&
+        _mappings.back().fstart + _mappings.back().nchunks == fstart) {
+        // Directly follows the last mapping of the same file: extend it
+        _mappings.back().nchunks += to_map;
+    } else {
+        _mappings.emplace_back(nodeid, to_map, fstart, mstart);
+    }
+    mapped.mstart = mstart;
+    mapped.nchunks = to_map;
+    return 0;
+}
+
+int dax_manager::unmap(chunk nchunks, mapping_part& unmapped, bool deep)
+{
+    // Plan first, from the newest mapping backwards; nothing is modified yet
+    chunk total = 0; // chunks selected for unmapping
+    chunk trim = 0; // chunks to cut off the newest mapping that is kept
+    auto first_dropped = _mappings.size(); // index of oldest dropped mapping
+    for (auto idx = _mappings.size(); idx > 0 && total < nchunks; --idx) {
+        const auto& m = _mappings[idx - 1];
+        const chunk missing = nchunks - total;
+        if (m.nchunks > missing) {
+            // Only the tail of m is needed: shrink it later
+            trim = missing;
+            total = nchunks;
+        } else {
+            // All of m goes away
+            first_dropped = idx - 1;
+            total += m.nchunks;
+        }
+    }
+    if (total == 0) {
+        // The window is empty, or nchunks is 0
+        unmapped.mstart = first_empty();
+        unmapped.nchunks = 0;
+        return (nchunks == 0) ? 0 : ENODATA;
+    }
+
+    if (deep) {
+        // Also remove the topmost `total` chunks of the window on the device
+        auto error = unmap_ll(total, first_empty() - total);
+        if (error) {
+            return error;
+        }
+    }
+    // Apply the plan to the bookkeeping
+    _mappings.erase(_mappings.cbegin() + first_dropped, _mappings.cend());
+    if (trim > 0) {
+        _mappings.back().nchunks -= trim;
+    }
+
+    unmapped.mstart = first_empty();
+    unmapped.nchunks = total;
+    return 0;
+}
+
+int dax_manager::map_ll(uint64_t nodeid, uint64_t file_handle, chunk nchunks,
+    chunk fstart, chunk mstart)
+{
+    // The chunks to be mapped must lie entirely inside the window
+    assert(mstart + nchunks <= _window_chunks);
+
+    // NOTE: From the spec: "Alignment constraints for FUSE_SETUPMAPPING and
+    // FUSE_REMOVEMAPPING requests are communicated during FUSE_INIT
+    // negotiation". The request arguments are restricted as follows:
+    // - foffset: multiple of map_alignment from FUSE_INIT
+    // - len: not larger than remaining file?
+    // - moffset: multiple of map_alignment from FUSE_INIT
+    // In practice, map_alignment is the host's page size, because foffset and
+    // moffset are passed to mmap() on the host. These are satisfied by
+    // _chunk_size being a multiple of map_alignment.
+
+    std::unique_ptr<fuse_setupmapping_in> req {
+        new (std::nothrow) fuse_setupmapping_in()};
+    if (!req) {
+        return ENOMEM;
+    }
+    // Chunk indices and counts are scaled to bytes for the request
+    req->fh = file_handle;
+    req->flags = 0; // Read-only
+    req->foffset = fstart * _chunk_size;
+    req->moffset = mstart * _chunk_size;
+    req->len = nchunks * _chunk_size;
+
+    virtiofs_debug("inode %lld, setting up mapping (foffset=%lld, len=%lld, "
+                   "moffset=%lld)\n", nodeid, req->foffset, req->len,
+                   req->moffset);
+    auto error = fuse_req_send_and_receive_reply(&_drv, FUSE_SETUPMAPPING,
+        nodeid, req.get(), sizeof(*req), nullptr, 0);
+    if (error) {
+        kprintf("[virtiofs] inode %lld, mapping setup failed\n", nodeid);
+    }
+    return error;
+}
+
+int dax_manager::unmap_ll(chunk nchunks, chunk mstart)
+{
+    assert(mstart + nchunks <= _window_chunks);
+
+    // NOTE: FUSE_REMOVEMAPPING accepts a fuse_removemapping_in followed by
+    // fuse_removemapping_in.count fuse_removemapping_one arguments in general.
+    // Both live in one byte buffer here. It is held as unique_ptr<u8[]> so that
+    // it is released with delete[], matching the new[] that allocates it.
+    auto in_args_size = sizeof(fuse_removemapping_in) +
+        sizeof(fuse_removemapping_one);
+    std::unique_ptr<u8[]> in_args {new (std::nothrow) u8[in_args_size]};
+    if (!in_args) {
+        return ENOMEM;
+    }
+    auto r_in = new (in_args.get()) fuse_removemapping_in();
+    auto r_one = new (in_args.get() + sizeof(fuse_removemapping_in))
+        fuse_removemapping_one();
+    r_in->count = 1;
+    r_one->moffset = mstart * _chunk_size;
+    r_one->len = nchunks * _chunk_size;
+
+    // The nodeid is irrelevant for the current implementation of
+    // FUSE_REMOVEMAPPING. If it needed to be set, would we need to make a
+    // request per inode?
+    uint64_t nodeid = 0;
+
+    virtiofs_debug("inode %lld, removing mapping (moffset=%lld, len=%lld)\n",
+        nodeid, r_one->moffset, r_one->len);
+    auto error = fuse_req_send_and_receive_reply(&_drv, FUSE_REMOVEMAPPING,
+        nodeid, in_args.get(), in_args_size, nullptr, 0);
+    if (error) {
+        kprintf("[virtiofs] inode %lld, mapping removal failed\n", nodeid);
+    }
+    return error;
+}
+
+bool dax_manager::find(uint64_t nodeid, chunk fstart, mapping_part& found) const
+{
+    // Find the first mapping of this file whose chunk range contains fstart
+    for (auto& m : _mappings) {
+        if (m.nodeid != nodeid || fstart < m.fstart ||
+            fstart >= m.fstart + m.nchunks) {
+            continue;
+        }
+        // Report only the part of m from fstart onwards
+        auto skipped = fstart - m.fstart; // chunks of m preceding fstart
+        found.nchunks = m.nchunks - skipped;
+        found.mstart = m.mstart + skipped;
+        return true;
+    }
+    return false;
+}
+
+dax_manager::chunk dax_manager::first_empty() const
+{
+    // The window is occupied up to the end of the most recent mapping, if any
+    if (!_mappings.empty()) {
+        return _mappings.back().mstart + _mappings.back().nchunks;
+    }
+    return 0;
+}
+
+}
diff --git a/fs/virtiofs/virtiofs_dax.hh b/fs/virtiofs/virtiofs_dax.hh
new file mode 100644
index 00000000..2b9fa341
--- /dev/null
+++ b/fs/virtiofs/virtiofs_dax.hh
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2020 Fotis Xenakis
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#include <vector>
+
+#include <api/assert.h>
+#include <osv/mutex.h>
+#include <osv/uio.h>
+
+#include "drivers/virtio-fs.hh"
+#include "virtiofs.hh"
+
+namespace virtiofs {
+
+// A manager for the DAX window of a virtio-fs device. This implements a
+// straight-forward scheme for file mappings:
+// - The window is split into equally-sized chunks. Each mapping occupies an
+//   integer amount of consecutive chunks.
+// - New mappings are placed on the lowest available chunks in the window.
+// - When there are not enough chunks available for a new mapping, the highest
+//   (i.e. most recently mapped) chunks occupied are evicted. Thus, chunks are
+//   mapped in a LIFO manner (the window resembles a stack).
+class dax_manager {
+public:
+    static constexpr size_t DEFAULT_CHUNK_SIZE = 1 << 21; // 2MiB
+
+    // Construct a new manager for the DAX window associated with @drv (as
+    // returned by drv.get_dax()). The alignment constraint of the device (as
+    // reported by drv.get_map_alignment()) should be compatible with
+    // @chunk_size. Explicit: a device must not silently convert to a manager.
+    explicit dax_manager(virtio::fs& drv,
+        size_t chunk_size = DEFAULT_CHUNK_SIZE)
+        : _drv {drv},
+          _window {drv.get_dax()},
+          _chunk_size {chunk_size},
+          _window_chunks {_window->len / _chunk_size} {
+        assert(_chunk_size % (1ull << _drv.get_map_alignment()) == 0);
+        // NOTE: If _window->len % _chunk_size > 0, that remainder
+        // (< _chunk_size) is effectively ignored.
+    }
+
+    // Read up to @read_amt bytes from @inode (opened as @file_handle) into
+    // @uio, using the DAX window. If @aggressive, try to prefetch as much of
+    // the rest of the file as possible. Returns non-zero on failure.
+    int read(virtiofs_inode& inode, uint64_t file_handle, u64 read_amt,
+        struct uio& uio, bool aggressive = false);
+
+private:
+    // Helper type to better distinguish referring to chunks vs bytes
+    using chunk = size_t;
+
+    struct mapping {
+        mapping(uint64_t _nodeid, chunk _nchunks, chunk _fstart, chunk _mstart)
+            : nodeid {_nodeid},
+              nchunks {_nchunks},
+              fstart {_fstart},
+              mstart {_mstart} {}
+        uint64_t nodeid; // file the chunks belong to
+        chunk nchunks;   // number of chunks mapped
+        chunk fstart;    // first chunk, in the file
+        chunk mstart;    // first chunk, in the window
+    };
+
+    struct mapping_part {
+        chunk nchunks; // number of chunks
+        chunk mstart;  // first chunk, in the window
+    };
+
+    // Map up to @nchunks chunks of the file with @nodeid, starting at chunk
+    // @fstart of the file, after all other mappings. If @evict, evict other
+    // chunks if necessary. Returns in @mapped the new mapping and non-zero on
+    // failure. Called with _lock held (for writing).
+    int map(uint64_t nodeid, uint64_t file_handle, chunk nchunks, chunk fstart,
+        mapping_part& mapped, bool evict = false);
+    // Unmap @nchunks last chunks, also doing an actual unmapping on the device
+    // if @deep. Returns in @unmapped what was unmapped and non-zero on failure.
+    // Called with _lock held (for writing).
+    int unmap(chunk nchunks, mapping_part& unmapped, bool deep = false);
+    // Map @nchunks chunks of the file with @nodeid (opened as @fh), starting at
+    // chunk @fstart of the file and chunk @mstart of the window. Returns
+    // non-zero on failure. Called with _lock held (for writing).
+    int map_ll(uint64_t nodeid, uint64_t fh, chunk nchunks, chunk fstart,
+        chunk mstart);
+    // Unmap @nchunks chunks, starting at chunk @mstart of the window. Returns
+    // non-zero on failure. Called with _lock held (for writing).
+    int unmap_ll(chunk nchunks, chunk mstart);
+
+    // Return in @found the largest contiguous existing mapping for @nodeid
+    // starting at @fstart. If none found, returns false. Called with _lock held
+    // (for reading).
+    bool find(uint64_t nodeid, chunk fstart, mapping_part& found) const;
+    // Returns the first empty chunk in the window, or one-past-the-last if the
+    // window is full. Called with _lock held (for reading).
+    chunk first_empty() const;
+
+    virtio::fs& _drv;
+    const virtio::fs::dax_window* const _window;
+    const size_t _chunk_size;   // in bytes
+    const chunk _window_chunks; // whole chunks fitting in the window
+    // TODO OPT: Switch to rwlock
+    mutex _lock;
+    std::vector<mapping> _mappings; // in order of increasing mstart
+};
+
+}
--
2.27.0

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/osv-dev/AM0PR03MB62928739B45E668435D74798A6980%40AM0PR03MB6292.eurprd03.prod.outlook.com.

Reply via email to