For details on the manager's policy, please see fs/virtiofs/virtiofs_dax.hh.
Signed-off-by: Fotis Xenakis <[email protected]> --- Makefile | 39 +++--- fs/virtiofs/virtiofs_dax.cc | 268 ++++++++++++++++++++++++++++++++++++ fs/virtiofs/virtiofs_dax.hh | 109 +++++++++++++++ 3 files changed, 397 insertions(+), 19 deletions(-) create mode 100644 fs/virtiofs/virtiofs_dax.cc create mode 100644 fs/virtiofs/virtiofs_dax.hh diff --git a/Makefile b/Makefile index 20ddf3b1..12366794 100644 --- a/Makefile +++ b/Makefile @@ -536,23 +536,23 @@ bsd += bsd/porting/mmu.o bsd += bsd/porting/pcpu.o bsd += bsd/porting/bus_dma.o bsd += bsd/porting/kobj.o -bsd += bsd/sys/netinet/if_ether.o -bsd += bsd/sys/compat/linux/linux_socket.o -bsd += bsd/sys/compat/linux/linux_ioctl.o -bsd += bsd/sys/net/if_ethersubr.o -bsd += bsd/sys/net/if_llatbl.o -bsd += bsd/sys/net/radix.o -bsd += bsd/sys/net/route.o -bsd += bsd/sys/net/raw_cb.o -bsd += bsd/sys/net/raw_usrreq.o -bsd += bsd/sys/net/rtsock.o -bsd += bsd/sys/net/netisr.o -bsd += bsd/sys/net/netisr1.o -bsd += bsd/sys/net/if_dead.o -bsd += bsd/sys/net/if_clone.o -bsd += bsd/sys/net/if_loop.o -bsd += bsd/sys/net/if.o -bsd += bsd/sys/net/pfil.o +bsd += bsd/sys/netinet/if_ether.o +bsd += bsd/sys/compat/linux/linux_socket.o +bsd += bsd/sys/compat/linux/linux_ioctl.o +bsd += bsd/sys/net/if_ethersubr.o +bsd += bsd/sys/net/if_llatbl.o +bsd += bsd/sys/net/radix.o +bsd += bsd/sys/net/route.o +bsd += bsd/sys/net/raw_cb.o +bsd += bsd/sys/net/raw_usrreq.o +bsd += bsd/sys/net/rtsock.o +bsd += bsd/sys/net/netisr.o +bsd += bsd/sys/net/netisr1.o +bsd += bsd/sys/net/if_dead.o +bsd += bsd/sys/net/if_clone.o +bsd += bsd/sys/net/if_loop.o +bsd += bsd/sys/net/if.o +bsd += bsd/sys/net/pfil.o bsd += bsd/sys/net/routecache.o bsd += bsd/sys/netinet/in.o bsd += bsd/sys/netinet/in_pcb.o @@ -1769,7 +1769,8 @@ fs_objs += rofs/rofs_vfsops.o \ rofs/rofs_common.o fs_objs += virtiofs/virtiofs_vfsops.o \ - virtiofs/virtiofs_vnops.o + virtiofs/virtiofs_vnops.o \ + virtiofs/virtiofs_dax.o fs_objs += pseudofs/pseudofs.o fs_objs += procfs/procfs_vnops.o @@ 
-1976,7 +1977,7 @@ libuutil-objects = $(foreach file, $(libuutil-file-list), $(out)/bsd/cddl/contri define libuutil-includes bsd/cddl/contrib/opensolaris/lib/libuutil/common - bsd/cddl/compat/opensolaris/include + bsd/cddl/compat/opensolaris/include bsd/sys/cddl/contrib/opensolaris/uts/common bsd/sys/cddl/compat/opensolaris bsd/cddl/contrib/opensolaris/head diff --git a/fs/virtiofs/virtiofs_dax.cc b/fs/virtiofs/virtiofs_dax.cc new file mode 100644 index 00000000..8e612eb5 --- /dev/null +++ b/fs/virtiofs/virtiofs_dax.cc @@ -0,0 +1,268 @@ +/* + * Copyright (C) 2020 Fotis Xenakis + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +#include <algorithm> +#include <mutex> + +#include <osv/debug.h> +#include <osv/uio.h> + +#include "fuse_kernel.h" +#include "virtiofs.hh" +#include "virtiofs_dax.hh" +#include "virtiofs_i.hh" + +namespace virtiofs { + +int dax_manager::read(virtiofs_inode& inode, uint64_t file_handle, u64 read_amt, + struct uio& uio, bool aggressive) +{ + std::lock_guard<mutex> guard {_lock}; + + // Necessary pre-declarations due to goto below + size_t to_map; + chunk nchunks; + int error; + mapping_part mp; + chunk fstart = uio.uio_offset / _chunk_size; + off_t coffset = uio.uio_offset % _chunk_size; // offset within chunk + if (find(inode.nodeid, fstart, mp)) { + // Requested data (at least some initial) is already mapped + auto read_amt_act = std::min<size_t>(read_amt, + (mp.nchunks * _chunk_size) - coffset); + virtiofs_debug("inode %lld, found in DAX (foffset=%lld, len=%lld, " + "moffset=%lld)\n", inode.nodeid, uio.uio_offset, read_amt_act, + (mp.mstart * _chunk_size) + coffset); + goto out; + } + + // Map file + to_map = coffset; // bytes to map + if (aggressive) { + // Map the rest of the file + to_map += inode.attr.size - uio.uio_offset; + } else { + // Map just enough chunks to satisfy read_amt + to_map += read_amt; + } + nchunks = to_map / 
_chunk_size; + if (to_map % _chunk_size > 0) { + nchunks++; + } + // NOTE: This relies on the fact that requesting a mapping longer than the + // remaining file works (see mmap() on the host). If that didn't work, we + // would have to request exact mappings (byte-granularity, rather than + // chunk-granularity). + error = map(inode.nodeid, file_handle, nchunks, fstart, mp, true); + if (error) { + return error; + } + +out: + auto req_data = _window->addr + (mp.mstart * _chunk_size) + coffset; + auto read_amt_act = std::min<size_t>(read_amt, + (mp.nchunks * _chunk_size) - coffset); + // NOTE: It shouldn't be necessary to use the mmio* interface (i.e. volatile + // accesses). From the spec: "Drivers map this shared memory region with + // writeback caching as if it were regular RAM." + error = uiomove(const_cast<void*>(req_data), read_amt_act, &uio); + if (error) { + kprintf("[virtiofs] inode %lld, uiomove failed\n", inode.nodeid); + } + return error; +} + +int dax_manager::map(uint64_t nodeid, uint64_t file_handle, chunk nchunks, + chunk fstart, mapping_part& mapped, bool evict) +{ + // Evict just enough chunks, if needed and if there is anything to evict + auto empty = _window_chunks - first_empty(); + if (evict && empty < nchunks && !_mappings.empty()) { + mapping_part mp; + auto error = unmap(nchunks - empty, mp, false); + if (error) { + return error; + } + empty += mp.nchunks; + } + auto to_map = std::min<chunk>(nchunks, empty); + if (to_map == 0) { + // The window is full and evict is false, or nchunks is 0 + mapped.mstart = _window_chunks - empty; + mapped.nchunks = 0; + return (nchunks == 0) ? 
0 : ENOBUFS; + } + + // Map new chunks + auto mstart = _window_chunks - empty; + auto error = map_ll(nodeid, file_handle, to_map, fstart, mstart); + if (error) { + return error; + } + if (!_mappings.empty()) { + auto& m {_mappings.back()}; + if (m.nodeid == nodeid && m.fstart + m.nchunks == fstart) { + // Extend previous mapping + m.nchunks += to_map; + mapped.mstart = mstart; + mapped.nchunks = to_map; + return 0; + } + } + _mappings.emplace_back(nodeid, to_map, fstart, mstart); + mapped.mstart = mstart; + mapped.nchunks = to_map; + return 0; +} + +int dax_manager::unmap(chunk nchunks, mapping_part& unmapped, bool deep) +{ + // Determine necessary changes + chunk to_unmap = 0; + auto erase_first {_mappings.cend()}; + chunk to_unmap_from_last = 0; + for (auto it {_mappings.crbegin()}; + to_unmap < nchunks && it != _mappings.crend(); it++) { + + if (it->nchunks <= nchunks - to_unmap) { + // Remove *it + erase_first = it.base() - 1; + to_unmap += it->nchunks; + } else { + // Modify *it + to_unmap_from_last = nchunks - to_unmap; + to_unmap = nchunks; + } + } + if (to_unmap == 0) { + // The window is empty, or nchunks is 0 + unmapped.mstart = first_empty(); + unmapped.nchunks = 0; + return (nchunks == 0) ? 
0 : ENODATA; + } + + // Apply changes + if (deep) { + auto mstart = first_empty() - to_unmap; + auto error = unmap_ll(to_unmap, mstart); + if (error) { + return error; + } + } + _mappings.erase(erase_first, _mappings.cend()); + if (to_unmap_from_last > 0) { + _mappings.back().nchunks -= to_unmap_from_last; + } + + unmapped.mstart = first_empty(); + unmapped.nchunks = to_unmap; + return 0; +} + +int dax_manager::map_ll(uint64_t nodeid, uint64_t file_handle, chunk nchunks, + chunk fstart, chunk mstart) +{ + assert(mstart + nchunks <= _window_chunks); + + // NOTE: There are restrictions on the arguments to FUSE_SETUPMAPPING, from + // the spec: "Alignment constraints for FUSE_SETUPMAPPING and + // FUSE_REMOVEMAPPING requests are communicated during FUSE_INIT + // negotiation"): + // - foffset: multiple of map_alignment from FUSE_INIT + // - len: not larger than remaining file? + // - moffset: multiple of map_alignment from FUSE_INIT + // In practice, map_alignment is the host's page size, because foffset and + // moffset are passed to mmap() on the host. These are satisfied by + // _chunk_size being a multiple of map_alignment. 
+ + std::unique_ptr<fuse_setupmapping_in> in_args { + new (std::nothrow) fuse_setupmapping_in()}; + if (!in_args) { + return ENOMEM; + } + in_args->fh = file_handle; + in_args->foffset = fstart * _chunk_size; + in_args->len = nchunks * _chunk_size; + in_args->flags = 0; // Read-only + in_args->moffset = mstart * _chunk_size; + + virtiofs_debug("inode %lld, setting up mapping (foffset=%lld, len=%lld, " + "moffset=%lld)\n", nodeid, in_args->foffset, in_args->len, + in_args->moffset); + auto error = fuse_req_send_and_receive_reply(&_drv, FUSE_SETUPMAPPING, + nodeid, in_args.get(), sizeof(*in_args), nullptr, 0); + if (error) { + kprintf("[virtiofs] inode %lld, mapping setup failed\n", nodeid); + return error; + } + + return 0; +} + +int dax_manager::unmap_ll(chunk nchunks, chunk mstart) +{ + assert(mstart + nchunks <= _window_chunks); + + // NOTE: FUSE_REMOVEMAPPING accepts a fuse_removemapping_in followed by + // fuse_removemapping_in.count fuse_removemapping_one arguments in general. + auto in_args_size = sizeof(fuse_removemapping_in) + + sizeof(fuse_removemapping_one); + std::unique_ptr<u8[]> in_args {new (std::nothrow) u8[in_args_size]}; + if (!in_args) { + return ENOMEM; + } + auto r_in = new (in_args.get()) fuse_removemapping_in(); + auto r_one = new (in_args.get() + sizeof(fuse_removemapping_in)) + fuse_removemapping_one(); + r_in->count = 1; + r_one->moffset = mstart * _chunk_size; + r_one->len = nchunks * _chunk_size; + + // The nodeid is irrelevant for the current implementation of + // FUSE_REMOVEMAPPING. If it needed to be set, would we need to make a + // request per inode? 
+ uint64_t nodeid = 0; + + virtiofs_debug("inode %lld, removing mapping (moffset=%lld, len=%lld)\n", + nodeid, r_one->moffset, r_one->len); + auto error = fuse_req_send_and_receive_reply(&_drv, FUSE_REMOVEMAPPING, + nodeid, in_args.get(), in_args_size, nullptr, 0); + if (error) { + kprintf("[virtiofs] inode %lld, mapping removal failed\n", nodeid); + return error; + } + + return 0; +} + +bool dax_manager::find(uint64_t nodeid, chunk fstart, mapping_part& found) const +{ + for (auto& m : _mappings) { + if (m.nodeid == nodeid && + m.fstart <= fstart && + m.fstart + m.nchunks > fstart) { + + // m contains fstart + auto excess = fstart - m.fstart; // excess contained in m + found.nchunks = m.nchunks - excess; + found.mstart = m.mstart + excess; + return true; + } + } + return false; +} + +dax_manager::chunk dax_manager::first_empty() const +{ + if (_mappings.empty()) { + return 0; + } + auto& m {_mappings.back()}; + return m.mstart + m.nchunks; +} + +} diff --git a/fs/virtiofs/virtiofs_dax.hh b/fs/virtiofs/virtiofs_dax.hh new file mode 100644 index 00000000..2b9fa341 --- /dev/null +++ b/fs/virtiofs/virtiofs_dax.hh @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2020 Fotis Xenakis + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +#include <vector> + +#include <api/assert.h> +#include <osv/mutex.h> +#include <osv/uio.h> + +#include "drivers/virtio-fs.hh" +#include "virtiofs.hh" + +namespace virtiofs { + +// A manager for the DAX window of a virtio-fs device. This implements a +// straight-forward scheme for file mappings: +// - The window is split into equally-sized chunks. Each mapping occupies an +// integer amount of consecutive chunks. +// - New mappings are placed on the lowest available chunks in the window. +// - When there are not enough chunks available for a new mapping, the highest +// (i.e. most recently mapped) chunks occupied are evicted. 
Thus, chunks are +// mapped in a LIFO manner (the window resembles a stack). +class dax_manager { +public: + static constexpr size_t DEFAULT_CHUNK_SIZE = 1 << 21; // 2MiB + + // Construct a new manager for the DAX window associated with @drv (as + // returned by drv.get_dax()). The alignment constraint of the device (as + // reported by drv.get_map_alignment()) should be compatible with + // @chunk_size. + dax_manager(virtio::fs& drv, size_t chunk_size = DEFAULT_CHUNK_SIZE) + : _drv {drv}, + _window {drv.get_dax()}, + _chunk_size {chunk_size}, + _window_chunks {_window->len / _chunk_size} { + + assert(_chunk_size % (1ull << _drv.get_map_alignment()) == 0); + + // NOTE: If _window->len % CHUNK_SIZE > 0, that remainder (< CHUNK_SIZE) + // is effectively ignored. + } + + // Read @read_amt bytes from @inode, using the DAX window. If @aggressive, + // try to prefetch as much of the rest of the file as possible. + int read(virtiofs_inode& inode, uint64_t file_handle, u64 read_amt, + struct uio& uio, bool aggressive = false); + +private: + // Helper type to better distinguish referring to chunks vs bytes + using chunk = size_t; + + struct mapping { + mapping(uint64_t _nodeid, chunk _nchunks, chunk _fstart, chunk _mstart) + : nodeid {_nodeid}, + nchunks {_nchunks}, + fstart {_fstart}, + mstart {_mstart} {} + uint64_t nodeid; + chunk nchunks; + chunk fstart; + chunk mstart; + }; + + struct mapping_part { + chunk nchunks; + chunk mstart; + }; + + // Map up to @nchunks chunks of the file with @nodeid, starting at chunk + // @fstart of the file, after all other mappings. If @evict, evict other + // chunks if necessary. Returns in @mapped the new mapping and non-zero on + // failure. Called with _lock held (for writing). + int map(uint64_t nodeid, uint64_t file_handle, chunk nchunks, chunk fstart, + mapping_part& mapped, bool evict = false); + // Unmap @nchunks last chunks, also doing an actual unmapping on the device + // if @deep. 
Returns in @unmapped what was unmapped and non-zero on failure. + // Called with _lock held (for writing). + int unmap(chunk nchunks, mapping_part& unmapped, bool deep = false); + // Map @nchunks chunks of the file with @nodeid (opened as @fh), starting at + // chunk @fstart of the file and chunk @mstart of the window. Returns + // non-zero on failure. Called with _lock held (for writing). + int map_ll(uint64_t nodeid, uint64_t fh, chunk nchunks, chunk fstart, + chunk mstart); + // Unmap @nchunks chunks, starting at chunk @mstart of the window. Returns + // non-zero on failure. Called with _lock held (for writing). + int unmap_ll(chunk nchunks, chunk mstart); + + // Return in @found the largest contiguous existing mapping for @nodeid + // starting at @fstart. If none found, returns false. Called with _lock held + // (for reading). + bool find(uint64_t nodeid, chunk fstart, mapping_part& found) const; + // Returns the first empty chunk in the window, or one-past-the-last if the + // window is full. Called with _lock held (for reading). + chunk first_empty() const; + + virtio::fs& _drv; + const virtio::fs::dax_window* const _window; + const size_t _chunk_size; + const chunk _window_chunks; + // TODO OPT: Switch to rwlock + mutex _lock; + std::vector<mapping> _mappings; +}; + +} -- 2.27.0 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/AM0PR03MB62924F7C3278ED342493D529A6960%40AM0PR03MB6292.eurprd03.prod.outlook.com.
