For details on the manager's policy, please see
fs/virtiofs/virtiofs_dax.hh.

Signed-off-by: Fotis Xenakis <fo...@windowslive.com>
---
 Makefile                    |  39 +++---
 fs/virtiofs/virtiofs_dax.cc | 268 ++++++++++++++++++++++++++++++++++++
 fs/virtiofs/virtiofs_dax.hh | 109 +++++++++++++++
 3 files changed, 397 insertions(+), 19 deletions(-)
 create mode 100644 fs/virtiofs/virtiofs_dax.cc
 create mode 100644 fs/virtiofs/virtiofs_dax.hh

diff --git a/Makefile b/Makefile
index 20ddf3b1..12366794 100644
--- a/Makefile
+++ b/Makefile
@@ -536,23 +536,23 @@ bsd += bsd/porting/mmu.o
 bsd += bsd/porting/pcpu.o
 bsd += bsd/porting/bus_dma.o
 bsd += bsd/porting/kobj.o
-bsd += bsd/sys/netinet/if_ether.o
-bsd += bsd/sys/compat/linux/linux_socket.o
-bsd += bsd/sys/compat/linux/linux_ioctl.o
-bsd += bsd/sys/net/if_ethersubr.o
-bsd += bsd/sys/net/if_llatbl.o
-bsd += bsd/sys/net/radix.o
-bsd += bsd/sys/net/route.o
-bsd += bsd/sys/net/raw_cb.o
-bsd += bsd/sys/net/raw_usrreq.o
-bsd += bsd/sys/net/rtsock.o
-bsd += bsd/sys/net/netisr.o
-bsd += bsd/sys/net/netisr1.o
-bsd += bsd/sys/net/if_dead.o
-bsd += bsd/sys/net/if_clone.o
-bsd += bsd/sys/net/if_loop.o
-bsd += bsd/sys/net/if.o
-bsd += bsd/sys/net/pfil.o
+bsd += bsd/sys/netinet/if_ether.o
+bsd += bsd/sys/compat/linux/linux_socket.o
+bsd += bsd/sys/compat/linux/linux_ioctl.o
+bsd += bsd/sys/net/if_ethersubr.o
+bsd += bsd/sys/net/if_llatbl.o
+bsd += bsd/sys/net/radix.o
+bsd += bsd/sys/net/route.o
+bsd += bsd/sys/net/raw_cb.o
+bsd += bsd/sys/net/raw_usrreq.o
+bsd += bsd/sys/net/rtsock.o
+bsd += bsd/sys/net/netisr.o
+bsd += bsd/sys/net/netisr1.o
+bsd += bsd/sys/net/if_dead.o
+bsd += bsd/sys/net/if_clone.o
+bsd += bsd/sys/net/if_loop.o
+bsd += bsd/sys/net/if.o
+bsd += bsd/sys/net/pfil.o
 bsd += bsd/sys/net/routecache.o
 bsd += bsd/sys/netinet/in.o
 bsd += bsd/sys/netinet/in_pcb.o
@@ -1769,7 +1769,8 @@ fs_objs += rofs/rofs_vfsops.o \
        rofs/rofs_common.o

 fs_objs += virtiofs/virtiofs_vfsops.o \
-       virtiofs/virtiofs_vnops.o
+       virtiofs/virtiofs_vnops.o \
+       virtiofs/virtiofs_dax.o

 fs_objs += pseudofs/pseudofs.o
 fs_objs += procfs/procfs_vnops.o
@@ -1976,7 +1977,7 @@ libuutil-objects = $(foreach file, $(libuutil-file-list), $(out)/bsd/cddl/contri

 define libuutil-includes
   bsd/cddl/contrib/opensolaris/lib/libuutil/common
-  bsd/cddl/compat/opensolaris/include
+  bsd/cddl/compat/opensolaris/include
   bsd/sys/cddl/contrib/opensolaris/uts/common
   bsd/sys/cddl/compat/opensolaris
   bsd/cddl/contrib/opensolaris/head
diff --git a/fs/virtiofs/virtiofs_dax.cc b/fs/virtiofs/virtiofs_dax.cc
new file mode 100644
index 00000000..8e612eb5
--- /dev/null
+++ b/fs/virtiofs/virtiofs_dax.cc
@@ -0,0 +1,268 @@
+/*
+ * Copyright (C) 2020 Fotis Xenakis
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#include <algorithm>
+#include <mutex>
+
+#include <osv/debug.h>
+#include <osv/uio.h>
+
+#include "fuse_kernel.h"
+#include "virtiofs.hh"
+#include "virtiofs_dax.hh"
+#include "virtiofs_i.hh"
+
+namespace virtiofs {
+
+// Read up to @read_amt bytes of @inode (open as @file_handle) into @uio,
+// starting at file offset uio.uio_offset, by copying from the DAX window.
+// If the chunk containing that offset is not mapped yet, it is mapped first
+// (evicting other mappings if needed): the rest of the file if @aggressive,
+// otherwise just enough chunks to cover @read_amt. The amount handed to
+// uiomove() is capped by the contiguous mapping available at the offset, so it
+// may be less than @read_amt. Returns 0 on success, or the error from mapping
+// or from uiomove().
+int dax_manager::read(virtiofs_inode& inode, uint64_t file_handle, u64 read_amt,
+    struct uio& uio, bool aggressive)
+{
+    std::lock_guard<mutex> guard {_lock};
+
+    const chunk fstart = uio.uio_offset / _chunk_size;
+    const off_t coffset = uio.uio_offset % _chunk_size; // offset within chunk
+
+    mapping_part mp;
+    const bool cached = find(inode.nodeid, fstart, mp);
+    if (!cached) {
+        // Bytes to map, counted from the beginning of chunk fstart
+        size_t to_map = coffset;
+        if (aggressive) {
+            // Map the rest of the file. Clamp at zero so that an offset past
+            // the end of the file cannot wrap the unsigned subtraction around
+            // and turn into a request for a huge number of chunks.
+            const u64 fsize = inode.attr.size;
+            const u64 foffset = uio.uio_offset;
+            to_map += (fsize > foffset) ? fsize - foffset : 0;
+        } else {
+            // Map just enough chunks to satisfy read_amt
+            to_map += read_amt;
+        }
+        // Round up to whole chunks
+        chunk nchunks = to_map / _chunk_size;
+        if (to_map % _chunk_size > 0) {
+            nchunks++;
+        }
+        // NOTE: This relies on the fact that requesting a mapping longer than
+        // the remaining file works (see mmap() on the host). If that didn't
+        // work, we would have to request exact mappings (byte-granularity,
+        // rather than chunk-granularity).
+        const int error = map(inode.nodeid, file_handle, nchunks, fstart, mp,
+            true);
+        if (error) {
+            return error;
+        }
+    }
+
+    // Never read past the end of the contiguous mapping found or created
+    const auto read_amt_act = std::min<size_t>(read_amt,
+        (mp.nchunks * _chunk_size) - coffset);
+    if (cached) {
+        // Requested data (at least some initial) was already mapped
+        virtiofs_debug("inode %lld, found in DAX (foffset=%lld, len=%lld, "
+            "moffset=%lld)\n", inode.nodeid, uio.uio_offset, read_amt_act,
+            (mp.mstart * _chunk_size) + coffset);
+    }
+
+    auto req_data = _window->addr + (mp.mstart * _chunk_size) + coffset;
+    // NOTE: It shouldn't be necessary to use the mmio* interface (i.e.
+    // volatile accesses). From the spec: "Drivers map this shared memory region
+    // with writeback caching as if it were regular RAM."
+    const int error = uiomove(const_cast<void*>(req_data), read_amt_act, &uio);
+    if (error) {
+        kprintf("[virtiofs] inode %lld, uiomove failed\n", inode.nodeid);
+    }
+    return error;
+}
+
+// Map up to @nchunks chunks of the file @nodeid (open as @file_handle),
+// starting at file chunk @fstart, right after the existing mappings in the
+// window. When @evict is set and free space is insufficient, the most recently
+// mapped chunks are dropped from the bookkeeping (no device unmap is issued;
+// the new mapping is set up at the same window offsets) to make room.
+// On success returns 0 and describes what was actually mapped in @mapped,
+// which may be fewer chunks than requested. Returns ENOBUFS if chunks were
+// requested but none could be mapped, or the error from eviction or from the
+// device request.
+int dax_manager::map(uint64_t nodeid, uint64_t file_handle, chunk nchunks,
+    chunk fstart, mapping_part& mapped, bool evict)
+{
+    // If necessary, unmap just enough chunks. This is skipped when nothing is
+    // mapped: there is nothing to evict then, and unmap() would report
+    // ENODATA, failing a request that merely exceeds the window size instead
+    // of truncating it to the window.
+    auto empty = _window_chunks - first_empty();
+    if (evict && empty < nchunks && !_mappings.empty()) {
+        mapping_part evicted;
+        auto error = unmap(nchunks - empty, evicted, false);
+        if (error) {
+            return error;
+        }
+        empty += evicted.nchunks;
+    }
+
+    // New chunks always go right after the last occupied chunk
+    const auto mstart = _window_chunks - empty;
+    const auto to_map = std::min<chunk>(nchunks, empty);
+    if (to_map == 0) {
+        // No free chunks (and none could be evicted), or nchunks is 0
+        mapped.mstart = mstart;
+        mapped.nchunks = 0;
+        return (nchunks == 0) ? 0 : ENOBUFS;
+    }
+
+    // Map new chunks
+    auto error = map_ll(nodeid, file_handle, to_map, fstart, mstart);
+    if (error) {
+        return error;
+    }
+
+    // Record the new chunks: they are adjacent to the last mapping in the
+    // window by construction, so if they also continue it in the file, extend
+    // that mapping instead of adding a new one.
+    if (!_mappings.empty() &&
+        _mappings.back().nodeid == nodeid &&
+        _mappings.back().fstart + _mappings.back().nchunks == fstart) {
+
+        _mappings.back().nchunks += to_map;
+    } else {
+        _mappings.emplace_back(nodeid, to_map, fstart, mstart);
+    }
+    mapped.mstart = mstart;
+    mapped.nchunks = to_map;
+    return 0;
+}
+
+// Remove up to @nchunks chunks from the end of the window (the most recently
+// mapped ones). Whole mappings are dropped from the back; the last affected
+// mapping is shortened if it is only partially covered. If @deep, the chunks
+// are also unmapped on the device before the bookkeeping is updated. Returns 0
+// on success with @unmapped holding the number of chunks removed and the new
+// first empty chunk, ENODATA if chunks were requested but nothing is mapped,
+// or the error from the device request.
+int dax_manager::unmap(chunk nchunks, mapping_part& unmapped, bool deep)
+{
+    // Determine necessary changes, walking from the newest mapping backwards
+    auto keep = _mappings.size(); // mappings [0, keep) are not removed
+    chunk released = 0;           // chunks selected for unmapping so far
+    chunk trimmed = 0;            // chunks to cut off the last kept mapping
+    while (released < nchunks && keep > 0) {
+        const auto& m = _mappings[keep - 1];
+        const auto wanted = nchunks - released;
+        if (m.nchunks <= wanted) {
+            // m is removed entirely
+            released += m.nchunks;
+            keep--;
+        } else {
+            // m only loses its last chunks, which completes the request
+            trimmed = wanted;
+            released = nchunks;
+        }
+    }
+    if (released == 0) {
+        // The window is empty, or nchunks is 0
+        unmapped.mstart = first_empty();
+        unmapped.nchunks = 0;
+        return (nchunks == 0) ? 0 : ENODATA;
+    }
+
+    // Apply changes
+    if (deep) {
+        // The released chunks are the last ones currently occupied
+        auto error = unmap_ll(released, first_empty() - released);
+        if (error) {
+            return error;
+        }
+    }
+    _mappings.erase(_mappings.cbegin() + keep, _mappings.cend());
+    if (trimmed > 0) {
+        _mappings.back().nchunks -= trimmed;
+    }
+
+    unmapped.mstart = first_empty();
+    unmapped.nchunks = released;
+    return 0;
+}
+
+// Ask the device to map @nchunks chunks of the file @nodeid (open as
+// @file_handle), starting at file chunk @fstart, at chunk @mstart of the
+// window, by sending a FUSE_SETUPMAPPING request with flags 0. Returns 0 on
+// success, ENOMEM if the request arguments cannot be allocated, or the error
+// from the request.
+int dax_manager::map_ll(uint64_t nodeid, uint64_t file_handle, chunk nchunks,
+    chunk fstart, chunk mstart)
+{
+    assert(mstart + nchunks <= _window_chunks);
+
+    // NOTE: There are restrictions on the arguments to FUSE_SETUPMAPPING, from
+    // the spec: "Alignment constraints for FUSE_SETUPMAPPING and
+    // FUSE_REMOVEMAPPING requests are communicated during FUSE_INIT
+    // negotiation"):
+    // - foffset: multiple of map_alignment from FUSE_INIT
+    // - len: not larger than remaining file?
+    // - moffset: multiple of map_alignment from FUSE_INIT
+    // In practice, map_alignment is the host's page size, because foffset and
+    // moffset are passed to mmap() on the host. These are satisfied by
+    // _chunk_size being a multiple of map_alignment.
+
+    std::unique_ptr<fuse_setupmapping_in> req {
+        new (std::nothrow) fuse_setupmapping_in()};
+    if (!req) {
+        return ENOMEM;
+    }
+    // All offsets and lengths in the request are in bytes
+    req->fh = file_handle;
+    req->flags = 0; // Read-only
+    req->foffset = fstart * _chunk_size;
+    req->moffset = mstart * _chunk_size;
+    req->len = nchunks * _chunk_size;
+
+    virtiofs_debug("inode %lld, setting up mapping (foffset=%lld, len=%lld, "
+                   "moffset=%lld)\n", nodeid, req->foffset, req->len,
+                   req->moffset);
+    const auto error = fuse_req_send_and_receive_reply(&_drv,
+        FUSE_SETUPMAPPING, nodeid, req.get(), sizeof(*req), nullptr, 0);
+    if (error) {
+        kprintf("[virtiofs] inode %lld, mapping setup failed\n", nodeid);
+    }
+    return error;
+}
+
+// Ask the device to remove the mappings covering @nchunks chunks starting at
+// chunk @mstart of the window, by sending one FUSE_REMOVEMAPPING request that
+// carries a single range. Returns 0 on success, ENOMEM if the request
+// arguments cannot be allocated, or the error from the request.
+int dax_manager::unmap_ll(chunk nchunks, chunk mstart)
+{
+    assert(mstart + nchunks <= _window_chunks);
+
+    // NOTE: FUSE_REMOVEMAPPING accepts a fuse_removemapping_in followed by
+    // fuse_removemapping_in.count fuse_removemapping_one arguments in general.
+    auto in_args_size = sizeof(fuse_removemapping_in) +
+        sizeof(fuse_removemapping_one);
+    // The buffer comes from new[], so it must be owned by the array form of
+    // unique_ptr to be released with delete[].
+    std::unique_ptr<u8[]> in_args {new (std::nothrow) u8[in_args_size]};
+    if (!in_args) {
+        return ENOMEM;
+    }
+    // Construct the header and the single range back to back in the buffer
+    auto r_in = new (in_args.get()) fuse_removemapping_in();
+    auto r_one = new (in_args.get() + sizeof(fuse_removemapping_in))
+        fuse_removemapping_one();
+    r_in->count = 1;
+    r_one->moffset = mstart * _chunk_size;
+    r_one->len = nchunks * _chunk_size;
+
+    // The nodeid is irrelevant for the current implementation of
+    // FUSE_REMOVEMAPPING. If it needed to be set, would we need to make a
+    // request per inode?
+    uint64_t nodeid = 0;
+
+    virtiofs_debug("inode %lld, removing mapping (moffset=%lld, len=%lld)\n",
+        nodeid, r_one->moffset, r_one->len);
+    auto error = fuse_req_send_and_receive_reply(&_drv, FUSE_REMOVEMAPPING,
+        nodeid, in_args.get(), in_args_size, nullptr, 0);
+    if (error) {
+        kprintf("[virtiofs] inode %lld, mapping removal failed\n", nodeid);
+        return error;
+    }
+
+    return 0;
+}
+
+// Look for a mapping of the file @nodeid that contains file chunk @fstart.
+// The mappings are searched in window order and the first match is used. On a
+// match, returns true and stores in @found the part of that mapping from
+// @fstart up to its end; otherwise returns false and leaves @found untouched.
+bool dax_manager::find(uint64_t nodeid, chunk fstart, mapping_part& found) const
+{
+    const auto hit = std::find_if(_mappings.cbegin(), _mappings.cend(),
+        [nodeid, fstart](const mapping& m) {
+            // Same file, and fstart within [m.fstart, m.fstart + m.nchunks)
+            return m.nodeid == nodeid &&
+                   m.fstart <= fstart &&
+                   m.fstart + m.nchunks > fstart;
+        });
+    if (hit == _mappings.cend()) {
+        return false;
+    }
+
+    // Skip the chunks of the mapping that precede fstart
+    const auto excess = fstart - hit->fstart;
+    found.nchunks = hit->nchunks - excess;
+    found.mstart = hit->mstart + excess;
+    return true;
+}
+
+// Return the window chunk right after the last mapping, or 0 when nothing is
+// mapped. Mappings are kept in window order, so the last element of _mappings
+// is the one occupying the highest chunks.
+dax_manager::chunk dax_manager::first_empty() const
+{
+    return _mappings.empty()
+        ? 0
+        : _mappings.back().mstart + _mappings.back().nchunks;
+}
+
+}
diff --git a/fs/virtiofs/virtiofs_dax.hh b/fs/virtiofs/virtiofs_dax.hh
new file mode 100644
index 00000000..2b9fa341
--- /dev/null
+++ b/fs/virtiofs/virtiofs_dax.hh
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2020 Fotis Xenakis
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#include <vector>
+
+#include <api/assert.h>
+#include <osv/mutex.h>
+#include <osv/uio.h>
+
+#include "drivers/virtio-fs.hh"
+#include "virtiofs.hh"
+
+namespace virtiofs {
+
+// A manager for the DAX window of a virtio-fs device. This implements a
+// straight-forward scheme for file mappings:
+// - The window is split into equally-sized chunks. Each mapping occupies an
+//   integer amount of consecutive chunks.
+// - New mappings are placed on the lowest available chunks in the window.
+// - When there are not enough chunks available for a new mapping, the highest
+//   (i.e. most recently mapped) chunks occupied are evicted. Thus, chunks are
+//   mapped in a LIFO manner (the window resembles a stack).
+class dax_manager {
+public:
+    static constexpr size_t DEFAULT_CHUNK_SIZE = 1 << 21; // 2MiB
+
+    // Construct a new manager for the DAX window associated with @drv (as
+    // returned by drv.get_dax()). The alignment constraint of the device (as
+    // reported by drv.get_map_alignment()) should be compatible with
+    // @chunk_size: this is asserted below.
+    dax_manager(virtio::fs& drv, size_t chunk_size = DEFAULT_CHUNK_SIZE)
+        : _drv {drv},
+          _window {drv.get_dax()},
+          _chunk_size {chunk_size},
+          _window_chunks {_window->len / _chunk_size} {
+
+        // get_map_alignment() is used as a power-of-two exponent here
+        assert(_chunk_size % (1ull << _drv.get_map_alignment()) == 0);
+
+        // NOTE: If _window->len % _chunk_size > 0, that remainder
+        // (< _chunk_size) is effectively ignored.
+    }
+
+    // Read up to @read_amt bytes from @inode (open as @file_handle) into @uio,
+    // using the DAX window. If @aggressive, try to prefetch as much of the
+    // rest of the file as possible. Returns non-zero on failure.
+    int read(virtiofs_inode& inode, uint64_t file_handle, u64 read_amt,
+        struct uio& uio, bool aggressive = false);
+
+private:
+    // Helper type to better distinguish referring to chunks vs bytes
+    using chunk = size_t;
+
+    // A run of consecutive window chunks mapping consecutive chunks of a file
+    struct mapping {
+        mapping(uint64_t _nodeid, chunk _nchunks, chunk _fstart, chunk _mstart)
+            : nodeid {_nodeid},
+              nchunks {_nchunks},
+              fstart {_fstart},
+              mstart {_mstart} {}
+        uint64_t nodeid; // file the chunks belong to
+        chunk nchunks;   // length of the mapping, in chunks
+        chunk fstart;    // first chunk within the file
+        chunk mstart;    // first chunk within the window
+    };
+
+    // A range of window chunks, used to report results of the methods below
+    struct mapping_part {
+        chunk nchunks; // length of the range, in chunks
+        chunk mstart;  // first chunk within the window
+    };
+
+    // Map up to @nchunks chunks of the file with @nodeid, starting at chunk
+    // @fstart of the file, after all other mappings. If @evict, evict other
+    // chunks if necessary. Returns in @mapped the new mapping and non-zero on
+    // failure. Called with _lock held (for writing).
+    int map(uint64_t nodeid, uint64_t file_handle, chunk nchunks, chunk fstart,
+        mapping_part& mapped, bool evict = false);
+    // Unmap @nchunks last chunks, also doing an actual unmapping on the device
+    // if @deep. Returns in @unmapped what was unmapped and non-zero on
+    // failure. Called with _lock held (for writing).
+    int unmap(chunk nchunks, mapping_part& unmapped, bool deep = false);
+    // Map @nchunks chunks of the file with @nodeid (opened as @fh), starting
+    // at chunk @fstart of the file and chunk @mstart of the window. Returns
+    // non-zero on failure. Called with _lock held (for writing).
+    int map_ll(uint64_t nodeid, uint64_t fh, chunk nchunks, chunk fstart,
+        chunk mstart);
+    // Unmap @nchunks chunks, starting at chunk @mstart of the window. Returns
+    // non-zero on failure. Called with _lock held (for writing).
+    int unmap_ll(chunk nchunks, chunk mstart);
+
+    // Return in @found the part, from @fstart to its end, of the first
+    // existing mapping for @nodeid that contains chunk @fstart of the file.
+    // If none found, returns false. Called with _lock held (for reading).
+    bool find(uint64_t nodeid, chunk fstart, mapping_part& found) const;
+    // Returns the first empty chunk in the window, or one-past-the-last if the
+    // window is full. Called with _lock held (for reading).
+    chunk first_empty() const;
+
+    virtio::fs& _drv;
+    const virtio::fs::dax_window* const _window;
+    const size_t _chunk_size;
+    // Number of whole chunks that fit in the window
+    const chunk _window_chunks;
+    // Guards _mappings
+    // TODO OPT: Switch to rwlock
+    mutex _lock;
+    // Current mappings, ordered by their position in the window
+    std::vector<mapping> _mappings;
+};
+
+}
--
2.27.0

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to osv-dev+unsubscr...@googlegroups.com.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/osv-dev/AM0PR03MB62928739B45E668435D74798A6980%40AM0PR03MB6292.eurprd03.prod.outlook.com.

Reply via email to