Makes sense. Thanks for your explanation. On Saturday, June 27, 2020 at 9:12:42 AM UTC-4 Fotis Xenakis wrote:
> Στις Σάββατο, 27 Ιουνίου 2020 στις 1:19:40 π.μ. UTC+3, ο χρήστης > [email protected] έγραψε: > >> On Sunday, June 21, 2020 at 5:06:28 AM UTC-4 Fotis Xenakis wrote: >> >>> For details on the manager's policy, please see >>> fs/virtiofs/virtiofs_dax.hh. >>> >>> Signed-off-by: Fotis Xenakis <[email protected]> >>> --- >>> Makefile | 39 +++--- >>> fs/virtiofs/virtiofs_dax.cc | 268 ++++++++++++++++++++++++++++++++++++ >>> fs/virtiofs/virtiofs_dax.hh | 109 +++++++++++++++ >>> 3 files changed, 397 insertions(+), 19 deletions(-) >>> create mode 100644 fs/virtiofs/virtiofs_dax.cc >>> create mode 100644 fs/virtiofs/virtiofs_dax.hh >>> >>> diff --git a/Makefile b/Makefile >>> index 20ddf3b1..12366794 100644 >>> --- a/Makefile >>> +++ b/Makefile >>> @@ -536,23 +536,23 @@ bsd += bsd/porting/mmu.o >>> bsd += bsd/porting/pcpu.o >>> bsd += bsd/porting/bus_dma.o >>> bsd += bsd/porting/kobj.o >>> -bsd += bsd/sys/netinet/if_ether.o >>> -bsd += bsd/sys/compat/linux/linux_socket.o >>> -bsd += bsd/sys/compat/linux/linux_ioctl.o >>> -bsd += bsd/sys/net/if_ethersubr.o >>> -bsd += bsd/sys/net/if_llatbl.o >>> -bsd += bsd/sys/net/radix.o >>> -bsd += bsd/sys/net/route.o >>> -bsd += bsd/sys/net/raw_cb.o >>> -bsd += bsd/sys/net/raw_usrreq.o >>> -bsd += bsd/sys/net/rtsock.o >>> -bsd += bsd/sys/net/netisr.o >>> -bsd += bsd/sys/net/netisr1.o >>> -bsd += bsd/sys/net/if_dead.o >>> -bsd += bsd/sys/net/if_clone.o >>> -bsd += bsd/sys/net/if_loop.o >>> -bsd += bsd/sys/net/if.o >>> -bsd += bsd/sys/net/pfil.o >>> +bsd += bsd/sys/netinet/if_ether.o >>> +bsd += bsd/sys/compat/linux/linux_socket.o >>> +bsd += bsd/sys/compat/linux/linux_ioctl.o >>> +bsd += bsd/sys/net/if_ethersubr.o >>> +bsd += bsd/sys/net/if_llatbl.o >>> +bsd += bsd/sys/net/radix.o >>> +bsd += bsd/sys/net/route.o >>> +bsd += bsd/sys/net/raw_cb.o >>> +bsd += bsd/sys/net/raw_usrreq.o >>> +bsd += bsd/sys/net/rtsock.o >>> +bsd += bsd/sys/net/netisr.o >>> +bsd += bsd/sys/net/netisr1.o >>> +bsd += bsd/sys/net/if_dead.o >>> +bsd += bsd/sys/net/if_clone.o >>> +bsd += bsd/sys/net/if_loop.o >>> +bsd += bsd/sys/net/if.o >>> +bsd += bsd/sys/net/pfil.o >>> bsd += bsd/sys/net/routecache.o >>> bsd += bsd/sys/netinet/in.o >>> bsd += bsd/sys/netinet/in_pcb.o >>> @@ -1769,7 +1769,8 @@ fs_objs += rofs/rofs_vfsops.o \ >>> rofs/rofs_common.o >>> >>> fs_objs += virtiofs/virtiofs_vfsops.o \ >>> - virtiofs/virtiofs_vnops.o >>> + virtiofs/virtiofs_vnops.o \ >>> + virtiofs/virtiofs_dax.o >>> >>> fs_objs += pseudofs/pseudofs.o >>> fs_objs += procfs/procfs_vnops.o >>> @@ -1976,7 +1977,7 @@ libuutil-objects = $(foreach file, >>> $(libuutil-file-list), $(out)/bsd/cddl/contri >>> >>> define libuutil-includes >>> bsd/cddl/contrib/opensolaris/lib/libuutil/common >>> - bsd/cddl/compat/opensolaris/include >>> + bsd/cddl/compat/opensolaris/include >>> bsd/sys/cddl/contrib/opensolaris/uts/common >>> bsd/sys/cddl/compat/opensolaris >>> bsd/cddl/contrib/opensolaris/head >>> diff --git a/fs/virtiofs/virtiofs_dax.cc b/fs/virtiofs/virtiofs_dax.cc >>> new file mode 100644 >>> index 00000000..8e612eb5 >>> --- /dev/null >>> +++ b/fs/virtiofs/virtiofs_dax.cc >>> @@ -0,0 +1,268 @@ >>> +/* >>> + * Copyright (C) 2020 Fotis Xenakis >>> + * >>> + * This work is open source software, licensed under the terms of the >>> + * BSD license as described in the LICENSE file in the top-level >>> directory. >>> + */ >>> + >>> +#include <algorithm> >>> +#include <mutex> >>> + >>> +#include <osv/debug.h> >>> +#include <osv/uio.h> >>> + >>> +#include "fuse_kernel.h" >>> +#include "virtiofs.hh" >>> +#include "virtiofs_dax.hh" >>> +#include "virtiofs_i.hh" >>> + >>> +namespace virtiofs { >>> + >>> +int dax_manager::read(virtiofs_inode& inode, uint64_t file_handle, u64 >>> read_amt, >>> + struct uio& uio, bool aggressive) >>> +{ >>> + std::lock_guard<mutex> guard {_lock}; >>> + >>> + // Necessary pre-declarations due to goto below >>> + size_t to_map; >>> + chunk nchunks; >>> + int error; >>> + mapping_part mp; >>> + chunk fstart = uio.uio_offset / _chunk_size; >>> + off_t coffset = uio.uio_offset % _chunk_size; // offset within chunk >>> + if (find(inode.nodeid, fstart, mp)) { >>> + // Requested data (at least some initial) is already mapped >>> + auto read_amt_act = std::min<size_t>(read_amt, >>> + (mp.nchunks * _chunk_size) - coffset); >>> + virtiofs_debug("inode %lld, found in DAX (foffset=%lld, len=%lld, " >>> + "moffset=%lld)\n", inode.nodeid, uio.uio_offset, read_amt_act, >>> + (mp.mstart * _chunk_size) + coffset); >>> + goto out; >>> + } >>> + >>> + // Map file >>> + to_map = coffset; // bytes to map >>> + if (aggressive) { >>> + // Map the rest of the file >>> + to_map += inode.attr.size - uio.uio_offset; >>> + } else { >>> + // Map just enough chunks to satisfy read_amt >>> + to_map += read_amt; >>> + } >>> + nchunks = to_map / _chunk_size; >>> + if (to_map % _chunk_size > 0) { >>> + nchunks++; >>> + } >>> + // NOTE: This relies on the fact that requesting a mapping longer than >>> the >>> + // remaining file works (see mmap() on the host). If that didn't work, >>> we >>> + // would have to request exact mappings (byte-granularity, rather than >>> + // chunk-granularity). >>> + error = map(inode.nodeid, file_handle, nchunks, fstart, mp, true); >>> + if (error) { >>> + return error; >>> + } >>> + >>> +out: >>> + auto req_data = _window->addr + (mp.mstart * _chunk_size) + coffset; >>> + auto read_amt_act = std::min<size_t>(read_amt, >>> + (mp.nchunks * _chunk_size) - coffset); >>> + // NOTE: It shouldn't be necessary to use the mmio* interface (i.e. >>> volatile >>> + // accesses). From the spec: "Drivers map this shared memory region >>> with >>> + // writeback caching as if it were regular RAM." >>> >> As you have seen I have applied this patch because the code looks good >> and works. But I am curious how is it possible, given what you wrote above, >> that the DAX window mapping can be accessed like this? It seems as if the >> physical - virtual memory mapping is 1-1 but then it might collide on the >> virtual side with other mappings OSv is maintaining. Am I missing >> something? Or are we just lucky it just happens to not collide during the >> tests we have performed so far? >> > It's not entirely clear to me what the concern is about collisions. The > DAX window shows up as device memory, subsequently mapped in the guest's > virtual address space (that's taken care of by existing code in the PCI > subsystem). > The note above (btw, this has been present since the initial DAX support) > touches on why we are using uiomove (i.e. plain memory accesses) instead of > mmio_get* [1] (i.e. memory accesses with the volatile specifier, usually > seen when touching device memory). I can't claim to be 100% sure this is > always correct (and it would be hard to debug if it isn't), but the spec's > note and my reasoning (taking into account the underlying implementation of > the DAX window with mmap) provide some confidence. > > [1] > https://github.com/cloudius-systems/osv/blob/df5ce400ef938269b0937fdee0f6c3d7f1fbc533/core/mmio.cc#L34 > >> + error = uiomove(const_cast<void*>(req_data), read_amt_act, &uio); >>> + if (error) { >>> + kprintf("[virtiofs] inode %lld, uiomove failed\n", inode.nodeid); >>> + } >>> + return error; >>> +} >>> + >>> +int dax_manager::map(uint64_t nodeid, uint64_t file_handle, chunk >>> nchunks, >>> + chunk fstart, mapping_part& mapped, bool evict) >>> +{ >>> + // If necessary, unmap just enough chunks >>> + auto empty = _window_chunks - first_empty(); >>> + if (evict && empty < nchunks) { >>> + mapping_part mp; >>> + auto error = unmap(nchunks - empty, mp, false); >>> + if (error) { >>> + return error; >>> + } >>> + empty += mp.nchunks; >>> + } >>> + auto to_map = std::min<chunk>(nchunks, empty); >>> + if (to_map == 0) { >>> + // The window is full and evict is false, or nchunks is 0 >>> + mapped.mstart = _window_chunks - empty; >>> + mapped.nchunks = 0; >>> + return (nchunks == 0) ? 0 : ENOBUFS; >>> + } >>> + >>> + // Map new chunks >>> + auto mstart = _window_chunks - empty; >>> + auto error = map_ll(nodeid, file_handle, to_map, fstart, mstart); >>> + if (error) { >>> + return error; >>> + } >>> + if (!_mappings.empty()) { >>> + auto& m {_mappings.back()}; >>> + if (m.nodeid == nodeid && m.fstart + m.nchunks == fstart) { >>> + // Extend previous mapping >>> + m.nchunks += to_map; >>> + mapped.mstart = mstart; >>> + mapped.nchunks = to_map; >>> + return 0; >>> + } >>> + } >>> + _mappings.emplace_back(nodeid, to_map, fstart, mstart); >>> + mapped.mstart = mstart; >>> + mapped.nchunks = to_map; >>> + return 0; >>> +} >>> + >>> +int dax_manager::unmap(chunk nchunks, mapping_part& unmapped, bool >>> deep) >>> +{ >>> + // Determine necessary changes >>> + chunk to_unmap = 0; >>> + auto erase_first {_mappings.cend()}; >>> + chunk to_unmap_from_last = 0; >>> + for (auto it {_mappings.crbegin()}; >>> + to_unmap < nchunks && it != _mappings.crend(); it++) { >>> + >>> + if (it->nchunks <= nchunks - to_unmap) { >>> + // Remove *it >>> + erase_first = it.base() - 1; >>> + to_unmap += it->nchunks; >>> + } else { >>> + // Modify *it >>> + to_unmap_from_last = nchunks - to_unmap; >>> + to_unmap = nchunks; >>> + } >>> + } >>> + if (to_unmap == 0) { >>> + // The window is empty, or nchunks is 0 >>> + unmapped.mstart = first_empty(); >>> + unmapped.nchunks = 0; >>> + return (nchunks == 0) ? 0 : ENODATA; >>> + } >>> + >>> + // Apply changes >>> + if (deep) { >>> + auto mstart = first_empty() - to_unmap; >>> + auto error = unmap_ll(to_unmap, mstart); >>> + if (error) { >>> + return error; >>> + } >>> + } >>> + _mappings.erase(erase_first, _mappings.cend()); >>> + if (to_unmap_from_last > 0) { >>> + _mappings.back().nchunks -= to_unmap_from_last; >>> + } >>> + >>> + unmapped.mstart = first_empty(); >>> + unmapped.nchunks = to_unmap; >>> + return 0; >>> +} >>> + >>> +int dax_manager::map_ll(uint64_t nodeid, uint64_t file_handle, chunk >>> nchunks, >>> + chunk fstart, chunk mstart) >>> +{ >>> + assert(mstart + nchunks <= _window_chunks); >>> + >>> + // NOTE: There are restrictions on the arguments to FUSE_SETUPMAPPING, >>> from >>> + // the spec: "Alignment constraints for FUSE_SETUPMAPPING and >>> + // FUSE_REMOVEMAPPING requests are communicated during FUSE_INIT >>> + // negotiation"): >>> + // - foffset: multiple of map_alignment from FUSE_INIT >>> + // - len: not larger than remaining file? >>> + // - moffset: multiple of map_alignment from FUSE_INIT >>> + // In practice, map_alignment is the host's page size, because foffset >>> and >>> + // moffset are passed to mmap() on the host. These are satisfied by >>> + // _chunk_size being a multiple of map_alignment. >>> + >>> + std::unique_ptr<fuse_setupmapping_in> in_args { >>> + new (std::nothrow) fuse_setupmapping_in()}; >>> + if (!in_args) { >>> + return ENOMEM; >>> + } >>> + in_args->fh = file_handle; >>> + in_args->foffset = fstart * _chunk_size; >>> + in_args->len = nchunks * _chunk_size; >>> + in_args->flags = 0; // Read-only >>> + in_args->moffset = mstart * _chunk_size; >>> + >>> + virtiofs_debug("inode %lld, setting up mapping (foffset=%lld, >>> len=%lld, " >>> + "moffset=%lld)\n", nodeid, in_args->foffset, in_args->len, >>> + in_args->moffset); >>> + auto error = fuse_req_send_and_receive_reply(&_drv, FUSE_SETUPMAPPING, >>> + nodeid, in_args.get(), sizeof(*in_args), nullptr, 0); >>> + if (error) { >>> + kprintf("[virtiofs] inode %lld, mapping setup failed\n", nodeid); >>> + return error; >>> + } >>> + >>> + return 0; >>> +} >>> + >>> +int dax_manager::unmap_ll(chunk nchunks, chunk mstart) >>> +{ >>> + assert(mstart + nchunks <= _window_chunks); >>> + >>> + // NOTE: FUSE_REMOVEMAPPING accepts a fuse_removemapping_in followed >>> by >>> + // fuse_removemapping_in.count fuse_removemapping_one arguments in >>> general. >>> + auto in_args_size = sizeof(fuse_removemapping_in) + >>> + sizeof(fuse_removemapping_one); >>> + std::unique_ptr<u8> in_args {new (std::nothrow) u8[in_args_size]}; >>> + if (!in_args) { >>> + return ENOMEM; >>> + } >>> + auto r_in = new (in_args.get()) fuse_removemapping_in(); >>> + auto r_one = new (in_args.get() + sizeof(fuse_removemapping_in)) >>> + fuse_removemapping_one(); >>> + r_in->count = 1; >>> + r_one->moffset = mstart * _chunk_size; >>> + r_one->len = nchunks * _chunk_size; >>> + >>> + // The nodeid is irrelevant for the current implementation of >>> + // FUSE_REMOVEMAPPING. If it needed to be set, would we need to make a >>> + // request per inode? >>> + uint64_t nodeid = 0; >>> + >>> + virtiofs_debug("inode %lld, removing mapping (moffset=%lld, >>> len=%lld)\n", >>> + nodeid, r_one->moffset, r_one->len); >>> + auto error = fuse_req_send_and_receive_reply(&_drv, >>> FUSE_REMOVEMAPPING, >>> + nodeid, in_args.get(), in_args_size, nullptr, 0); >>> + if (error) { >>> + kprintf("[virtiofs] inode %lld, mapping removal failed\n", nodeid); >>> + return error; >>> + } >>> + >>> + return 0; >>> +} >>> + >>> +bool dax_manager::find(uint64_t nodeid, chunk fstart, mapping_part& >>> found) const >>> +{ >>> + for (auto& m : _mappings) { >>> + if (m.nodeid == nodeid && >>> + m.fstart <= fstart && >>> + m.fstart + m.nchunks > fstart) { >>> + >>> + // m contains fstart >>> + auto excess = fstart - m.fstart; // excess contained in m >>> + found.nchunks = m.nchunks - excess; >>> + found.mstart = m.mstart + excess; >>> + return true; >>> + } >>> + } >>> + return false; >>> +} >>> + >>> +dax_manager::chunk dax_manager::first_empty() const >>> +{ >>> + if (_mappings.empty()) { >>> + return 0; >>> + } >>> + auto& m {_mappings.back()}; >>> + return m.mstart + m.nchunks; >>> +} >>> + >>> +} >>> diff --git a/fs/virtiofs/virtiofs_dax.hh b/fs/virtiofs/virtiofs_dax.hh >>> new file mode 100644 >>> index 00000000..2b9fa341 >>> --- /dev/null >>> +++ b/fs/virtiofs/virtiofs_dax.hh >>> @@ -0,0 +1,109 @@ >>> +/* >>> + * Copyright (C) 2020 Fotis Xenakis >>> + * >>> + * This work is open source software, licensed under the terms of the >>> + * BSD license as described in the LICENSE file in the top-level >>> directory. >>> + */ >>> + >>> +#include <vector> >>> + >>> +#include <api/assert.h> >>> +#include <osv/mutex.h> >>> +#include <osv/uio.h> >>> + >>> +#include "drivers/virtio-fs.hh" >>> +#include "virtiofs.hh" >>> + >>> +namespace virtiofs { >>> + >>> +// A manager for the DAX window of a virtio-fs device. This implements >>> a >>> +// straight-forward scheme for file mappings: >>> +// - The window is split into equally-sized chunks. Each mapping >>> occupies an >>> +// integer amount of consecutive chunks. >>> +// - New mappings are placed on the lowest available chunks in the >>> window. >>> +// - When there are not enough chunks available for a new mapping, the >>> highest >>> +// (i.e. most recently mapped) chunks occupied are evicted. Thus, >>> chunks are >>> +// mapped in a LIFO manner (the window resembles a stack). >>> +class dax_manager { >>> +public: >>> + static constexpr size_t DEFAULT_CHUNK_SIZE = 1 << 21; // 2MiB >>> + >>> + // Construct a new manager for the DAX window associated with @drv (as >>> + // returned by drv.get_dax()). The alignment constraint of the device >>> (as >>> + // reported by drv.get_map_alignment()) should be compatible with >>> + // @chunk_size. >>> + dax_manager(virtio::fs& drv, size_t chunk_size = DEFAULT_CHUNK_SIZE) >>> + : _drv {drv}, >>> + _window {drv.get_dax()}, >>> + _chunk_size {chunk_size}, >>> + _window_chunks {_window->len / _chunk_size} { >>> + >>> + assert(_chunk_size % (1ull << _drv.get_map_alignment()) == 0); >>> + >>> + // NOTE: If _window->len % CHUNK_SIZE > 0, that remainder (< >>> CHUNK_SIZE) >>> + // is effectively ignored. >>> + } >>> + >>> + // Read @read_amt bytes from @inode, using the DAX window. If >>> @aggressive, >>> + // try to prefetch as much of the rest of the file as possible. >>> + int read(virtiofs_inode& inode, uint64_t file_handle, u64 read_amt, >>> + struct uio& uio, bool aggressive = false); >>> + >>> +private: >>> + // Helper type to better distinguish referring to chunks vs bytes >>> + using chunk = size_t; >>> + >>> + struct mapping { >>> + mapping(uint64_t _nodeid, chunk _nchunks, chunk _fstart, chunk >>> _mstart) >>> + : nodeid {_nodeid}, >>> + nchunks {_nchunks}, >>> + fstart {_fstart}, >>> + mstart {_mstart} {} >>> + uint64_t nodeid; >>> + chunk nchunks; >>> + chunk fstart; >>> + chunk mstart; >>> + }; >>> + >>> + struct mapping_part { >>> + chunk nchunks; >>> + chunk mstart; >>> + }; >>> + >>> + // Map up to @nchunks chunks of the file with @nodeid, starting at >>> chunk >>> + // @fstart of the file, after all other mappings. If @evict, evict >>> other >>> + // chunks if necessary. Returns in @mapped the new mapping and >>> non-zero on >>> + // failure. Called with _lock held (for writing). >>> + int map(uint64_t nodeid, uint64_t file_handle, chunk nchunks, chunk >>> fstart, >>> + mapping_part& mapped, bool evict = false); >>> + // Unmap @nchunks last chunks, also doing an actual unmapping on the >>> device >>> + // if @deep. Returns in @unmapped what was unmapped and non-zero on >>> failure. >>> + // Called with _lock held (for writing). >>> + int unmap(chunk nchunks, mapping_part& unmapped, bool deep = false); >>> + // Map @nchunks chunks of the file with @nodeid (opened as @fh), >>> starting at >>> + // chunk @fstart of the file and chunk @mstart of the window. Returns >>> + // non-zero on failure. Called with _lock held (for writing). >>> + int map_ll(uint64_t nodeid, uint64_t fh, chunk nchunks, chunk fstart, >>> + chunk mstart); >>> + // Unmap @nchunks chunks, starting at chunk @mstart of the window. >>> Returns >>> + // non-zero on failure. Called with _lock held (for writing). >>> + int unmap_ll(chunk nchunks, chunk mstart); >>> + >>> + // Return in @found the largest contiguous existing mapping for >>> @nodeid >>> + // starting at @fstart. If none found, returns false. Called with >>> _lock held >>> + // (for reading). >>> + bool find(uint64_t nodeid, chunk fstart, mapping_part& found) const; >>> + // Returns the first empty chunk in the window, or one-past-the-last >>> if the >>> + // window is full. Called with _lock held (for reading). >>> + chunk first_empty() const; >>> + >>> + virtio::fs& _drv; >>> + const virtio::fs::dax_window* const _window; >>> + const size_t _chunk_size; >>> + const chunk _window_chunks; >>> + // TODO OPT: Switch to rwlock >>> + mutex _lock; >>> + std::vector<mapping> _mappings; >>> +}; >>> + >>> +} >>> -- >>> 2.27.0 >>> >>> -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/8756ea78-7616-4553-adf7-d250771b71adn%40googlegroups.com.
