On Sunday, June 21, 2020 at 5:06:28 AM UTC-4 Fotis Xenakis wrote:
> For details on the manager's policy, please see > fs/virtiofs/virtiofs_dax.hh. > > Signed-off-by: Fotis Xenakis <[email protected]> > --- > Makefile | 39 +++--- > fs/virtiofs/virtiofs_dax.cc | 268 ++++++++++++++++++++++++++++++++++++ > fs/virtiofs/virtiofs_dax.hh | 109 +++++++++++++++ > 3 files changed, 397 insertions(+), 19 deletions(-) > create mode 100644 fs/virtiofs/virtiofs_dax.cc > create mode 100644 fs/virtiofs/virtiofs_dax.hh > > diff --git a/Makefile b/Makefile > index 20ddf3b1..12366794 100644 > --- a/Makefile > +++ b/Makefile > @@ -536,23 +536,23 @@ bsd += bsd/porting/mmu.o > bsd += bsd/porting/pcpu.o > bsd += bsd/porting/bus_dma.o > bsd += bsd/porting/kobj.o > -bsd += bsd/sys/netinet/if_ether.o > -bsd += bsd/sys/compat/linux/linux_socket.o > -bsd += bsd/sys/compat/linux/linux_ioctl.o > -bsd += bsd/sys/net/if_ethersubr.o > -bsd += bsd/sys/net/if_llatbl.o > -bsd += bsd/sys/net/radix.o > -bsd += bsd/sys/net/route.o > -bsd += bsd/sys/net/raw_cb.o > -bsd += bsd/sys/net/raw_usrreq.o > -bsd += bsd/sys/net/rtsock.o > -bsd += bsd/sys/net/netisr.o > -bsd += bsd/sys/net/netisr1.o > -bsd += bsd/sys/net/if_dead.o > -bsd += bsd/sys/net/if_clone.o > -bsd += bsd/sys/net/if_loop.o > -bsd += bsd/sys/net/if.o > -bsd += bsd/sys/net/pfil.o > +bsd += bsd/sys/netinet/if_ether.o > +bsd += bsd/sys/compat/linux/linux_socket.o > +bsd += bsd/sys/compat/linux/linux_ioctl.o > +bsd += bsd/sys/net/if_ethersubr.o > +bsd += bsd/sys/net/if_llatbl.o > +bsd += bsd/sys/net/radix.o > +bsd += bsd/sys/net/route.o > +bsd += bsd/sys/net/raw_cb.o > +bsd += bsd/sys/net/raw_usrreq.o > +bsd += bsd/sys/net/rtsock.o > +bsd += bsd/sys/net/netisr.o > +bsd += bsd/sys/net/netisr1.o > +bsd += bsd/sys/net/if_dead.o > +bsd += bsd/sys/net/if_clone.o > +bsd += bsd/sys/net/if_loop.o > +bsd += bsd/sys/net/if.o > +bsd += bsd/sys/net/pfil.o > bsd += bsd/sys/net/routecache.o > bsd += bsd/sys/netinet/in.o > bsd += bsd/sys/netinet/in_pcb.o > @@ -1769,7 +1769,8 @@ fs_objs += rofs/rofs_vfsops.o \ > rofs/rofs_common.o > > fs_objs += virtiofs/virtiofs_vfsops.o \ > - virtiofs/virtiofs_vnops.o > + virtiofs/virtiofs_vnops.o \ > + virtiofs/virtiofs_dax.o > > fs_objs += pseudofs/pseudofs.o > fs_objs += procfs/procfs_vnops.o > @@ -1976,7 +1977,7 @@ libuutil-objects = $(foreach file, > $(libuutil-file-list), $(out)/bsd/cddl/contri > > define libuutil-includes > bsd/cddl/contrib/opensolaris/lib/libuutil/common > - bsd/cddl/compat/opensolaris/include > + bsd/cddl/compat/opensolaris/include > bsd/sys/cddl/contrib/opensolaris/uts/common > bsd/sys/cddl/compat/opensolaris > bsd/cddl/contrib/opensolaris/head > diff --git a/fs/virtiofs/virtiofs_dax.cc b/fs/virtiofs/virtiofs_dax.cc > new file mode 100644 > index 00000000..8e612eb5 > --- /dev/null > +++ b/fs/virtiofs/virtiofs_dax.cc > @@ -0,0 +1,268 @@ > +/* > + * Copyright (C) 2020 Fotis Xenakis > + * > + * This work is open source software, licensed under the terms of the > + * BSD license as described in the LICENSE file in the top-level > directory. > + */ > + > +#include <algorithm> > +#include <mutex> > + > +#include <osv/debug.h> > +#include <osv/uio.h> > + > +#include "fuse_kernel.h" > +#include "virtiofs.hh" > +#include "virtiofs_dax.hh" > +#include "virtiofs_i.hh" > + > +namespace virtiofs { > + > +int dax_manager::read(virtiofs_inode& inode, uint64_t file_handle, u64 > read_amt, > + struct uio& uio, bool aggressive) > +{ > + std::lock_guard<mutex> guard {_lock}; > + > + // Necessary pre-declarations due to goto below > + size_t to_map; > + chunk nchunks; > + int error; > + mapping_part mp; > + chunk fstart = uio.uio_offset / _chunk_size; > + off_t coffset = uio.uio_offset % _chunk_size; // offset within chunk > + if (find(inode.nodeid, fstart, mp)) { > + // Requested data (at least some initial) is already mapped > + auto read_amt_act = std::min<size_t>(read_amt, > + (mp.nchunks * _chunk_size) - coffset); > + virtiofs_debug("inode %lld, found in DAX (foffset=%lld, len=%lld, " > + "moffset=%lld)\n", inode.nodeid, uio.uio_offset, read_amt_act, > + (mp.mstart * _chunk_size) + coffset); > + goto out; > + } > + > + // Map file > + to_map = coffset; // bytes to map > + if (aggressive) { > + // Map the rest of the file > + to_map += inode.attr.size - uio.uio_offset; > + } else { > + // Map just enough chunks to satisfy read_amt > + to_map += read_amt; > + } > + nchunks = to_map / _chunk_size; > + if (to_map % _chunk_size > 0) { > + nchunks++; > + } > + // NOTE: This relies on the fact that requesting a mapping longer than > the > + // remaining file works (see mmap() on the host). If that didn't work, > we > + // would have to request exact mappings (byte-granularity, rather than > + // chunk-granularity). > + error = map(inode.nodeid, file_handle, nchunks, fstart, mp, true); > + if (error) { > + return error; > + } > + > +out: > + auto req_data = _window->addr + (mp.mstart * _chunk_size) + coffset; > + auto read_amt_act = std::min<size_t>(read_amt, > + (mp.nchunks * _chunk_size) - coffset); > + // NOTE: It shouldn't be necessary to use the mmio* interface (i.e. > volatile > + // accesses). From the spec: "Drivers map this shared memory region with > + // writeback caching as if it were regular RAM." > As you have seen I have applied this patch because the code looks good and works. But I am curious how is it possible, given what you wrote above, that the DAX window mapping can be accessed like this? It seems as if the physical - virtual memory mapping is 1-1 but then it might collide on the virtual side with other mappings OSv is maintaining. Am I missing something? Or are we just lucky it just happens to not collide during the tests we have performed so far? > + error = uiomove(const_cast<void*>(req_data), read_amt_act, &uio); > + if (error) { > + kprintf("[virtiofs] inode %lld, uiomove failed\n", inode.nodeid); > + } > + return error; > +} > + > +int dax_manager::map(uint64_t nodeid, uint64_t file_handle, chunk > nchunks, > + chunk fstart, mapping_part& mapped, bool evict) > +{ > + // If necessary, unmap just enough chunks > + auto empty = _window_chunks - first_empty(); > + if (evict && empty < nchunks) { > + mapping_part mp; > + auto error = unmap(nchunks - empty, mp, false); > + if (error) { > + return error; > + } > + empty += mp.nchunks; > + } > + auto to_map = std::min<chunk>(nchunks, empty); > + if (to_map == 0) { > + // The window is full and evict is false, or nchunks is 0 > + mapped.mstart = _window_chunks - empty; > + mapped.nchunks = 0; > + return (nchunks == 0) ? 0 : ENOBUFS; > + } > + > + // Map new chunks > + auto mstart = _window_chunks - empty; > + auto error = map_ll(nodeid, file_handle, to_map, fstart, mstart); > + if (error) { > + return error; > + } > + if (!_mappings.empty()) { > + auto& m {_mappings.back()}; > + if (m.nodeid == nodeid && m.fstart + m.nchunks == fstart) { > + // Extend previous mapping > + m.nchunks += to_map; > + mapped.mstart = mstart; > + mapped.nchunks = to_map; > + return 0; > + } > + } > + _mappings.emplace_back(nodeid, to_map, fstart, mstart); > + mapped.mstart = mstart; > + mapped.nchunks = to_map; > + return 0; > +} > + > +int dax_manager::unmap(chunk nchunks, mapping_part& unmapped, bool deep) > +{ > + // Determine necessary changes > + chunk to_unmap = 0; > + auto erase_first {_mappings.cend()}; > + chunk to_unmap_from_last = 0; > + for (auto it {_mappings.crbegin()}; > + to_unmap < nchunks && it != _mappings.crend(); it++) { > + > + if (it->nchunks <= nchunks - to_unmap) { > + // Remove *it > + erase_first = it.base() - 1; > + to_unmap += it->nchunks; > + } else { > + // Modify *it > + to_unmap_from_last = nchunks - to_unmap; > + to_unmap = nchunks; > + } > + } > + if (to_unmap == 0) { > + // The window is empty, or nchunks is 0 > + unmapped.mstart = first_empty(); > + unmapped.nchunks = 0; > + return (nchunks == 0) ? 0 : ENODATA; > + } > + > + // Apply changes > + if (deep) { > + auto mstart = first_empty() - to_unmap; > + auto error = unmap_ll(to_unmap, mstart); > + if (error) { > + return error; > + } > + } > + _mappings.erase(erase_first, _mappings.cend()); > + if (to_unmap_from_last > 0) { > + _mappings.back().nchunks -= to_unmap_from_last; > + } > + > + unmapped.mstart = first_empty(); > + unmapped.nchunks = to_unmap; > + return 0; > +} > + > +int dax_manager::map_ll(uint64_t nodeid, uint64_t file_handle, chunk > nchunks, > + chunk fstart, chunk mstart) > +{ > + assert(mstart + nchunks <= _window_chunks); > + > + // NOTE: There are restrictions on the arguments to FUSE_SETUPMAPPING, > from > + // the spec: "Alignment constraints for FUSE_SETUPMAPPING and > + // FUSE_REMOVEMAPPING requests are communicated during FUSE_INIT > + // negotiation"): > + // - foffset: multiple of map_alignment from FUSE_INIT > + // - len: not larger than remaining file? > + // - moffset: multiple of map_alignment from FUSE_INIT > + // In practice, map_alignment is the host's page size, because foffset > and > + // moffset are passed to mmap() on the host. These are satisfied by > + // _chunk_size being a multiple of map_alignment. > + > + std::unique_ptr<fuse_setupmapping_in> in_args { > + new (std::nothrow) fuse_setupmapping_in()}; > + if (!in_args) { > + return ENOMEM; > + } > + in_args->fh = file_handle; > + in_args->foffset = fstart * _chunk_size; > + in_args->len = nchunks * _chunk_size; > + in_args->flags = 0; // Read-only > + in_args->moffset = mstart * _chunk_size; > + > + virtiofs_debug("inode %lld, setting up mapping (foffset=%lld, len=%lld, > " > + "moffset=%lld)\n", nodeid, in_args->foffset, in_args->len, > + in_args->moffset); > + auto error = fuse_req_send_and_receive_reply(&_drv, FUSE_SETUPMAPPING, > + nodeid, in_args.get(), sizeof(*in_args), nullptr, 0); > + if (error) { > + kprintf("[virtiofs] inode %lld, mapping setup failed\n", nodeid); > + return error; > + } > + > + return 0; > +} > + > +int dax_manager::unmap_ll(chunk nchunks, chunk mstart) > +{ > + assert(mstart + nchunks <= _window_chunks); > + > + // NOTE: FUSE_REMOVEMAPPING accepts a fuse_removemapping_in followed by > + // fuse_removemapping_in.count fuse_removemapping_one arguments in > general. > + auto in_args_size = sizeof(fuse_removemapping_in) + > + sizeof(fuse_removemapping_one); > + std::unique_ptr<u8> in_args {new (std::nothrow) u8[in_args_size]}; > + if (!in_args) { > + return ENOMEM; > + } > + auto r_in = new (in_args.get()) fuse_removemapping_in(); > + auto r_one = new (in_args.get() + sizeof(fuse_removemapping_in)) > + fuse_removemapping_one(); > + r_in->count = 1; > + r_one->moffset = mstart * _chunk_size; > + r_one->len = nchunks * _chunk_size; > + > + // The nodeid is irrelevant for the current implementation of > + // FUSE_REMOVEMAPPING. If it needed to be set, would we need to make a > + // request per inode? > + uint64_t nodeid = 0; > + > + virtiofs_debug("inode %lld, removing mapping (moffset=%lld, > len=%lld)\n", > + nodeid, r_one->moffset, r_one->len); > + auto error = fuse_req_send_and_receive_reply(&_drv, FUSE_REMOVEMAPPING, > + nodeid, in_args.get(), in_args_size, nullptr, 0); > + if (error) { > + kprintf("[virtiofs] inode %lld, mapping removal failed\n", nodeid); > + return error; > + } > + > + return 0; > +} > + > +bool dax_manager::find(uint64_t nodeid, chunk fstart, mapping_part& > found) const > +{ > + for (auto& m : _mappings) { > + if (m.nodeid == nodeid && > + m.fstart <= fstart && > + m.fstart + m.nchunks > fstart) { > + > + // m contains fstart > + auto excess = fstart - m.fstart; // excess contained in m > + found.nchunks = m.nchunks - excess; > + found.mstart = m.mstart + excess; > + return true; > + } > + } > + return false; > +} > + > +dax_manager::chunk dax_manager::first_empty() const > +{ > + if (_mappings.empty()) { > + return 0; > + } > + auto& m {_mappings.back()}; > + return m.mstart + m.nchunks; > +} > + > +} > diff --git a/fs/virtiofs/virtiofs_dax.hh b/fs/virtiofs/virtiofs_dax.hh > new file mode 100644 > index 00000000..2b9fa341 > --- /dev/null > +++ b/fs/virtiofs/virtiofs_dax.hh > @@ -0,0 +1,109 @@ > +/* > + * Copyright (C) 2020 Fotis Xenakis > + * > + * This work is open source software, licensed under the terms of the > + * BSD license as described in the LICENSE file in the top-level > directory. > + */ > + > +#include <vector> > + > +#include <api/assert.h> > +#include <osv/mutex.h> > +#include <osv/uio.h> > + > +#include "drivers/virtio-fs.hh" > +#include "virtiofs.hh" > + > +namespace virtiofs { > + > +// A manager for the DAX window of a virtio-fs device. This implements a > +// straight-forward scheme for file mappings: > +// - The window is split into equally-sized chunks. Each mapping occupies > an > +// integer amount of consecutive chunks. > +// - New mappings are placed on the lowest available chunks in the > window. > +// - When there are not enough chunks available for a new mapping, the > highest > +// (i.e. most recently mapped) chunks occupied are evicted. Thus, chunks > are > +// mapped in a LIFO manner (the window resembles a stack). > +class dax_manager { > +public: > + static constexpr size_t DEFAULT_CHUNK_SIZE = 1 << 21; // 2MiB > + > + // Construct a new manager for the DAX window associated with @drv (as > + // returned by drv.get_dax()). The alignment constraint of the device > (as > + // reported by drv.get_map_alignment()) should be compatible with > + // @chunk_size. > + dax_manager(virtio::fs& drv, size_t chunk_size = DEFAULT_CHUNK_SIZE) > + : _drv {drv}, > + _window {drv.get_dax()}, > + _chunk_size {chunk_size}, > + _window_chunks {_window->len / _chunk_size} { > + > + assert(_chunk_size % (1ull << _drv.get_map_alignment()) == 0); > + > + // NOTE: If _window->len % CHUNK_SIZE > 0, that remainder (< CHUNK_SIZE) > + // is effectively ignored. > + } > + > + // Read @read_amt bytes from @inode, using the DAX window. If > @aggressive, > + // try to prefetch as much of the rest of the file as possible. > + int read(virtiofs_inode& inode, uint64_t file_handle, u64 read_amt, > + struct uio& uio, bool aggressive = false); > + > +private: > + // Helper type to better distinguish referring to chunks vs bytes > + using chunk = size_t; > + > + struct mapping { > + mapping(uint64_t _nodeid, chunk _nchunks, chunk _fstart, chunk _mstart) > + : nodeid {_nodeid}, > + nchunks {_nchunks}, > + fstart {_fstart}, > + mstart {_mstart} {} > + uint64_t nodeid; > + chunk nchunks; > + chunk fstart; > + chunk mstart; > + }; > + > + struct mapping_part { > + chunk nchunks; > + chunk mstart; > + }; > + > + // Map up to @nchunks chunks of the file with @nodeid, starting at chunk > + // @fstart of the file, after all other mappings. If @evict, evict other > + // chunks if necessary. Returns in @mapped the new mapping and non-zero > on > + // failure. Called with _lock held (for writing). > + int map(uint64_t nodeid, uint64_t file_handle, chunk nchunks, chunk > fstart, > + mapping_part& mapped, bool evict = false); > + // Unmap @nchunks last chunks, also doing an actual unmapping on the > device > + // if @deep. Returns in @unmapped what was unmapped and non-zero on > failure. > + // Called with _lock held (for writing). > + int unmap(chunk nchunks, mapping_part& unmapped, bool deep = false); > + // Map @nchunks chunks of the file with @nodeid (opened as @fh), > starting at > + // chunk @fstart of the file and chunk @mstart of the window. Returns > + // non-zero on failure. Called with _lock held (for writing). > + int map_ll(uint64_t nodeid, uint64_t fh, chunk nchunks, chunk fstart, > + chunk mstart); > + // Unmap @nchunks chunks, starting at chunk @mstart of the window. > Returns > + // non-zero on failure. Called with _lock held (for writing). > + int unmap_ll(chunk nchunks, chunk mstart); > + > + // Return in @found the largest contiguous existing mapping for @nodeid > + // starting at @fstart. If none found, returns false. Called with _lock > held > + // (for reading). > + bool find(uint64_t nodeid, chunk fstart, mapping_part& found) const; > + // Returns the first empty chunk in the window, or one-past-the-last if > the > + // window is full. Called with _lock held (for reading). > + chunk first_empty() const; > + > + virtio::fs& _drv; > + const virtio::fs::dax_window* const _window; > + const size_t _chunk_size; > + const chunk _window_chunks; > + // TODO OPT: Switch to rwlock > + mutex _lock; > + std::vector<mapping> _mappings; > +}; > + > +} > -- > 2.27.0 > > -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/5ccb8460-3019-4db9-a21a-fb16930ef894n%40googlegroups.com.
