On Sunday, June 21, 2020 at 5:06:28 AM UTC-4 Fotis Xenakis wrote:

> For details on the manager's policy, please see 
> fs/virtiofs/virtiofs_dax.hh. 
>
> Signed-off-by: Fotis Xenakis <[email protected]> 
> --- 
> Makefile | 39 +++--- 
> fs/virtiofs/virtiofs_dax.cc | 268 ++++++++++++++++++++++++++++++++++++ 
> fs/virtiofs/virtiofs_dax.hh | 109 +++++++++++++++ 
> 3 files changed, 397 insertions(+), 19 deletions(-) 
> create mode 100644 fs/virtiofs/virtiofs_dax.cc 
> create mode 100644 fs/virtiofs/virtiofs_dax.hh 
>
> diff --git a/Makefile b/Makefile 
> index 20ddf3b1..12366794 100644 
> --- a/Makefile 
> +++ b/Makefile 
> @@ -536,23 +536,23 @@ bsd += bsd/porting/mmu.o 
> bsd += bsd/porting/pcpu.o 
> bsd += bsd/porting/bus_dma.o 
> bsd += bsd/porting/kobj.o 
> -bsd += bsd/sys/netinet/if_ether.o 
> -bsd += bsd/sys/compat/linux/linux_socket.o 
> -bsd += bsd/sys/compat/linux/linux_ioctl.o 
> -bsd += bsd/sys/net/if_ethersubr.o 
> -bsd += bsd/sys/net/if_llatbl.o 
> -bsd += bsd/sys/net/radix.o 
> -bsd += bsd/sys/net/route.o 
> -bsd += bsd/sys/net/raw_cb.o 
> -bsd += bsd/sys/net/raw_usrreq.o 
> -bsd += bsd/sys/net/rtsock.o 
> -bsd += bsd/sys/net/netisr.o 
> -bsd += bsd/sys/net/netisr1.o 
> -bsd += bsd/sys/net/if_dead.o 
> -bsd += bsd/sys/net/if_clone.o 
> -bsd += bsd/sys/net/if_loop.o 
> -bsd += bsd/sys/net/if.o 
> -bsd += bsd/sys/net/pfil.o 
> +bsd += bsd/sys/netinet/if_ether.o 
> +bsd += bsd/sys/compat/linux/linux_socket.o 
> +bsd += bsd/sys/compat/linux/linux_ioctl.o 
> +bsd += bsd/sys/net/if_ethersubr.o 
> +bsd += bsd/sys/net/if_llatbl.o 
> +bsd += bsd/sys/net/radix.o 
> +bsd += bsd/sys/net/route.o 
> +bsd += bsd/sys/net/raw_cb.o 
> +bsd += bsd/sys/net/raw_usrreq.o 
> +bsd += bsd/sys/net/rtsock.o 
> +bsd += bsd/sys/net/netisr.o 
> +bsd += bsd/sys/net/netisr1.o 
> +bsd += bsd/sys/net/if_dead.o 
> +bsd += bsd/sys/net/if_clone.o 
> +bsd += bsd/sys/net/if_loop.o 
> +bsd += bsd/sys/net/if.o 
> +bsd += bsd/sys/net/pfil.o 
> bsd += bsd/sys/net/routecache.o 
> bsd += bsd/sys/netinet/in.o 
> bsd += bsd/sys/netinet/in_pcb.o 
> @@ -1769,7 +1769,8 @@ fs_objs += rofs/rofs_vfsops.o \ 
> rofs/rofs_common.o 
>
> fs_objs += virtiofs/virtiofs_vfsops.o \ 
> - virtiofs/virtiofs_vnops.o 
> + virtiofs/virtiofs_vnops.o \ 
> + virtiofs/virtiofs_dax.o 
>
> fs_objs += pseudofs/pseudofs.o 
> fs_objs += procfs/procfs_vnops.o 
> @@ -1976,7 +1977,7 @@ libuutil-objects = $(foreach file, 
> $(libuutil-file-list), $(out)/bsd/cddl/contri 
>
> define libuutil-includes 
> bsd/cddl/contrib/opensolaris/lib/libuutil/common 
> - bsd/cddl/compat/opensolaris/include 
> + bsd/cddl/compat/opensolaris/include 
> bsd/sys/cddl/contrib/opensolaris/uts/common 
> bsd/sys/cddl/compat/opensolaris 
> bsd/cddl/contrib/opensolaris/head 
> diff --git a/fs/virtiofs/virtiofs_dax.cc b/fs/virtiofs/virtiofs_dax.cc 
> new file mode 100644 
> index 00000000..8e612eb5 
> --- /dev/null 
> +++ b/fs/virtiofs/virtiofs_dax.cc 
> @@ -0,0 +1,268 @@ 
> +/* 
> + * Copyright (C) 2020 Fotis Xenakis 
> + * 
> + * This work is open source software, licensed under the terms of the 
> + * BSD license as described in the LICENSE file in the top-level 
> directory. 
> + */ 
> + 
> +#include <algorithm> 
> +#include <mutex> 
> + 
> +#include <osv/debug.h> 
> +#include <osv/uio.h> 
> + 
> +#include "fuse_kernel.h" 
> +#include "virtiofs.hh" 
> +#include "virtiofs_dax.hh" 
> +#include "virtiofs_i.hh" 
> + 
> +namespace virtiofs { 
> + 
> +int dax_manager::read(virtiofs_inode& inode, uint64_t file_handle, u64 
> read_amt, 
> + struct uio& uio, bool aggressive) 
> +{ 
> + std::lock_guard<mutex> guard {_lock}; 
> + 
> + // Necessary pre-declarations due to goto below 
> + size_t to_map; 
> + chunk nchunks; 
> + int error; 
> + mapping_part mp; 
> + chunk fstart = uio.uio_offset / _chunk_size; 
> + off_t coffset = uio.uio_offset % _chunk_size; // offset within chunk 
> + if (find(inode.nodeid, fstart, mp)) { 
> + // Requested data (at least some initial) is already mapped 
> + auto read_amt_act = std::min<size_t>(read_amt, 
> + (mp.nchunks * _chunk_size) - coffset); 
> + virtiofs_debug("inode %lld, found in DAX (foffset=%lld, len=%lld, " 
> + "moffset=%lld)\n", inode.nodeid, uio.uio_offset, read_amt_act, 
> + (mp.mstart * _chunk_size) + coffset); 
> + goto out; 
> + } 
> + 
> + // Map file 
> + to_map = coffset; // bytes to map 
> + if (aggressive) { 
> + // Map the rest of the file 
> + to_map += inode.attr.size - uio.uio_offset; 
> + } else { 
> + // Map just enough chunks to satisfy read_amt 
> + to_map += read_amt; 
> + } 
> + nchunks = to_map / _chunk_size; 
> + if (to_map % _chunk_size > 0) { 
> + nchunks++; 
> + } 
> + // NOTE: This relies on the fact that requesting a mapping longer than 
> the 
> + // remaining file works (see mmap() on the host). If that didn't work, 
> we 
> + // would have to request exact mappings (byte-granularity, rather than 
> + // chunk-granularity). 
> + error = map(inode.nodeid, file_handle, nchunks, fstart, mp, true); 
> + if (error) { 
> + return error; 
> + } 
> + 
> +out: 
> + auto req_data = _window->addr + (mp.mstart * _chunk_size) + coffset; 
> + auto read_amt_act = std::min<size_t>(read_amt, 
> + (mp.nchunks * _chunk_size) - coffset); 
> + // NOTE: It shouldn't be necessary to use the mmio* interface (i.e. 
> volatile 
> + // accesses). From the spec: "Drivers map this shared memory region with 
> + // writeback caching as if it were regular RAM." 
>
As you have seen I have applied this patch because the code looks good and 
works. But I am curious how is it possible, given what you wrote above, 
that the DAX window mapping can be accessed like this? It seems as if the 
physical - virtual memory mapping is 1-1 but then it might collide on the 
virtual side with other mappings OSv is maintaining. Am I missing 
something? Or are we just lucky it just happens to not collide during the 
tests we have performed so far?

> + error = uiomove(const_cast<void*>(req_data), read_amt_act, &uio); 
> + if (error) { 
> + kprintf("[virtiofs] inode %lld, uiomove failed\n", inode.nodeid); 
> + } 
> + return error; 
> +} 
> + 
> +int dax_manager::map(uint64_t nodeid, uint64_t file_handle, chunk 
> nchunks, 
> + chunk fstart, mapping_part& mapped, bool evict) 
> +{ 
> + // If necessary, unmap just enough chunks 
> + auto empty = _window_chunks - first_empty(); 
> + if (evict && empty < nchunks) { 
> + mapping_part mp; 
> + auto error = unmap(nchunks - empty, mp, false); 
> + if (error) { 
> + return error; 
> + } 
> + empty += mp.nchunks; 
> + } 
> + auto to_map = std::min<chunk>(nchunks, empty); 
> + if (to_map == 0) { 
> + // The window is full and evict is false, or nchunks is 0 
> + mapped.mstart = _window_chunks - empty; 
> + mapped.nchunks = 0; 
> + return (nchunks == 0) ? 0 : ENOBUFS; 
> + } 
> + 
> + // Map new chunks 
> + auto mstart = _window_chunks - empty; 
> + auto error = map_ll(nodeid, file_handle, to_map, fstart, mstart); 
> + if (error) { 
> + return error; 
> + } 
> + if (!_mappings.empty()) { 
> + auto& m {_mappings.back()}; 
> + if (m.nodeid == nodeid && m.fstart + m.nchunks == fstart) { 
> + // Extend previous mapping 
> + m.nchunks += to_map; 
> + mapped.mstart = mstart; 
> + mapped.nchunks = to_map; 
> + return 0; 
> + } 
> + } 
> + _mappings.emplace_back(nodeid, to_map, fstart, mstart); 
> + mapped.mstart = mstart; 
> + mapped.nchunks = to_map; 
> + return 0; 
> +} 
> + 
> +int dax_manager::unmap(chunk nchunks, mapping_part& unmapped, bool deep) 
> +{ 
> + // Determine necessary changes 
> + chunk to_unmap = 0; 
> + auto erase_first {_mappings.cend()}; 
> + chunk to_unmap_from_last = 0; 
> + for (auto it {_mappings.crbegin()}; 
> + to_unmap < nchunks && it != _mappings.crend(); it++) { 
> + 
> + if (it->nchunks <= nchunks - to_unmap) { 
> + // Remove *it 
> + erase_first = it.base() - 1; 
> + to_unmap += it->nchunks; 
> + } else { 
> + // Modify *it 
> + to_unmap_from_last = nchunks - to_unmap; 
> + to_unmap = nchunks; 
> + } 
> + } 
> + if (to_unmap == 0) { 
> + // The window is empty, or nchunks is 0 
> + unmapped.mstart = first_empty(); 
> + unmapped.nchunks = 0; 
> + return (nchunks == 0) ? 0 : ENODATA; 
> + } 
> + 
> + // Apply changes 
> + if (deep) { 
> + auto mstart = first_empty() - to_unmap; 
> + auto error = unmap_ll(to_unmap, mstart); 
> + if (error) { 
> + return error; 
> + } 
> + } 
> + _mappings.erase(erase_first, _mappings.cend()); 
> + if (to_unmap_from_last > 0) { 
> + _mappings.back().nchunks -= to_unmap_from_last; 
> + } 
> + 
> + unmapped.mstart = first_empty(); 
> + unmapped.nchunks = to_unmap; 
> + return 0; 
> +} 
> + 
> +int dax_manager::map_ll(uint64_t nodeid, uint64_t file_handle, chunk 
> nchunks, 
> + chunk fstart, chunk mstart) 
> +{ 
> + assert(mstart + nchunks <= _window_chunks); 
> + 
> + // NOTE: There are restrictions on the arguments to FUSE_SETUPMAPPING, 
> from 
> + // the spec: "Alignment constraints for FUSE_SETUPMAPPING and 
> + // FUSE_REMOVEMAPPING requests are communicated during FUSE_INIT 
> + // negotiation"): 
> + // - foffset: multiple of map_alignment from FUSE_INIT 
> + // - len: not larger than remaining file? 
> + // - moffset: multiple of map_alignment from FUSE_INIT 
> + // In practice, map_alignment is the host's page size, because foffset 
> and 
> + // moffset are passed to mmap() on the host. These are satisfied by 
> + // _chunk_size being a multiple of map_alignment. 
> + 
> + std::unique_ptr<fuse_setupmapping_in> in_args { 
> + new (std::nothrow) fuse_setupmapping_in()}; 
> + if (!in_args) { 
> + return ENOMEM; 
> + } 
> + in_args->fh = file_handle; 
> + in_args->foffset = fstart * _chunk_size; 
> + in_args->len = nchunks * _chunk_size; 
> + in_args->flags = 0; // Read-only 
> + in_args->moffset = mstart * _chunk_size; 
> + 
> + virtiofs_debug("inode %lld, setting up mapping (foffset=%lld, len=%lld, 
> " 
> + "moffset=%lld)\n", nodeid, in_args->foffset, in_args->len, 
> + in_args->moffset); 
> + auto error = fuse_req_send_and_receive_reply(&_drv, FUSE_SETUPMAPPING, 
> + nodeid, in_args.get(), sizeof(*in_args), nullptr, 0); 
> + if (error) { 
> + kprintf("[virtiofs] inode %lld, mapping setup failed\n", nodeid); 
> + return error; 
> + } 
> + 
> + return 0; 
> +} 
> + 
> +int dax_manager::unmap_ll(chunk nchunks, chunk mstart) 
> +{ 
> + assert(mstart + nchunks <= _window_chunks); 
> + 
> + // NOTE: FUSE_REMOVEMAPPING accepts a fuse_removemapping_in followed by 
> + // fuse_removemapping_in.count fuse_removemapping_one arguments in 
> general. 
> + auto in_args_size = sizeof(fuse_removemapping_in) + 
> + sizeof(fuse_removemapping_one); 
> + std::unique_ptr<u8> in_args {new (std::nothrow) u8[in_args_size]}; 
> + if (!in_args) { 
> + return ENOMEM; 
> + } 
> + auto r_in = new (in_args.get()) fuse_removemapping_in(); 
> + auto r_one = new (in_args.get() + sizeof(fuse_removemapping_in)) 
> + fuse_removemapping_one(); 
> + r_in->count = 1; 
> + r_one->moffset = mstart * _chunk_size; 
> + r_one->len = nchunks * _chunk_size; 
> + 
> + // The nodeid is irrelevant for the current implementation of 
> + // FUSE_REMOVEMAPPING. If it needed to be set, would we need to make a 
> + // request per inode? 
> + uint64_t nodeid = 0; 
> + 
> + virtiofs_debug("inode %lld, removing mapping (moffset=%lld, 
> len=%lld)\n", 
> + nodeid, r_one->moffset, r_one->len); 
> + auto error = fuse_req_send_and_receive_reply(&_drv, FUSE_REMOVEMAPPING, 
> + nodeid, in_args.get(), in_args_size, nullptr, 0); 
> + if (error) { 
> + kprintf("[virtiofs] inode %lld, mapping removal failed\n", nodeid); 
> + return error; 
> + } 
> + 
> + return 0; 
> +} 
> + 
> +bool dax_manager::find(uint64_t nodeid, chunk fstart, mapping_part& 
> found) const 
> +{ 
> + for (auto& m : _mappings) { 
> + if (m.nodeid == nodeid && 
> + m.fstart <= fstart && 
> + m.fstart + m.nchunks > fstart) { 
> + 
> + // m contains fstart 
> + auto excess = fstart - m.fstart; // excess contained in m 
> + found.nchunks = m.nchunks - excess; 
> + found.mstart = m.mstart + excess; 
> + return true; 
> + } 
> + } 
> + return false; 
> +} 
> + 
> +dax_manager::chunk dax_manager::first_empty() const 
> +{ 
> + if (_mappings.empty()) { 
> + return 0; 
> + } 
> + auto& m {_mappings.back()}; 
> + return m.mstart + m.nchunks; 
> +} 
> + 
> +} 
> diff --git a/fs/virtiofs/virtiofs_dax.hh b/fs/virtiofs/virtiofs_dax.hh 
> new file mode 100644 
> index 00000000..2b9fa341 
> --- /dev/null 
> +++ b/fs/virtiofs/virtiofs_dax.hh 
> @@ -0,0 +1,109 @@ 
> +/* 
> + * Copyright (C) 2020 Fotis Xenakis 
> + * 
> + * This work is open source software, licensed under the terms of the 
> + * BSD license as described in the LICENSE file in the top-level 
> directory. 
> + */ 
> + 
> +#include <vector> 
> + 
> +#include <api/assert.h> 
> +#include <osv/mutex.h> 
> +#include <osv/uio.h> 
> + 
> +#include "drivers/virtio-fs.hh" 
> +#include "virtiofs.hh" 
> + 
> +namespace virtiofs { 
> + 
> +// A manager for the DAX window of a virtio-fs device. This implements a 
> +// straight-forward scheme for file mappings: 
> +// - The window is split into equally-sized chunks. Each mapping occupies 
> an 
> +// integer amount of consecutive chunks. 
> +// - New mappings are placed on the lowest available chunks in the 
> window. 
> +// - When there are not enough chunks available for a new mapping, the 
> highest 
> +// (i.e. most recently mapped) chunks occupied are evicted. Thus, chunks 
> are 
> +// mapped in a LIFO manner (the window resembles a stack). 
> +class dax_manager { 
> +public: 
> + static constexpr size_t DEFAULT_CHUNK_SIZE = 1 << 21; // 2MiB 
> + 
> + // Construct a new manager for the DAX window associated with @drv (as 
> + // returned by drv.get_dax()). The alignment constraint of the device 
> (as 
> + // reported by drv.get_map_alignment()) should be compatible with 
> + // @chunk_size. 
> + dax_manager(virtio::fs& drv, size_t chunk_size = DEFAULT_CHUNK_SIZE) 
> + : _drv {drv}, 
> + _window {drv.get_dax()}, 
> + _chunk_size {chunk_size}, 
> + _window_chunks {_window->len / _chunk_size} { 
> + 
> + assert(_chunk_size % (1ull << _drv.get_map_alignment()) == 0); 
> + 
> + // NOTE: If _window->len % CHUNK_SIZE > 0, that remainder (< CHUNK_SIZE) 
> + // is effectively ignored. 
> + } 
> + 
> + // Read @read_amt bytes from @inode, using the DAX window. If 
> @aggressive, 
> + // try to prefetch as much of the rest of the file as possible. 
> + int read(virtiofs_inode& inode, uint64_t file_handle, u64 read_amt, 
> + struct uio& uio, bool aggressive = false); 
> + 
> +private: 
> + // Helper type to better distinguish referring to chunks vs bytes 
> + using chunk = size_t; 
> + 
> + struct mapping { 
> + mapping(uint64_t _nodeid, chunk _nchunks, chunk _fstart, chunk _mstart) 
> + : nodeid {_nodeid}, 
> + nchunks {_nchunks}, 
> + fstart {_fstart}, 
> + mstart {_mstart} {} 
> + uint64_t nodeid; 
> + chunk nchunks; 
> + chunk fstart; 
> + chunk mstart; 
> + }; 
> + 
> + struct mapping_part { 
> + chunk nchunks; 
> + chunk mstart; 
> + }; 
> + 
> + // Map up to @nchunks chunks of the file with @nodeid, starting at chunk 
> + // @fstart of the file, after all other mappings. If @evict, evict other 
> + // chunks if necessary. Returns in @mapped the new mapping and non-zero 
> on 
> + // failure. Called with _lock held (for writing). 
> + int map(uint64_t nodeid, uint64_t file_handle, chunk nchunks, chunk 
> fstart, 
> + mapping_part& mapped, bool evict = false); 
> + // Unmap @nchunks last chunks, also doing an actual unmapping on the 
> device 
> + // if @deep. Returns in @unmapped what was unmapped and non-zero on 
> failure. 
> + // Called with _lock held (for writing). 
> + int unmap(chunk nchunks, mapping_part& unmapped, bool deep = false); 
> + // Map @nchunks chunks of the file with @nodeid (opened as @fh), 
> starting at 
> + // chunk @fstart of the file and chunk @mstart of the window. Returns 
> + // non-zero on failure. Called with _lock held (for writing). 
> + int map_ll(uint64_t nodeid, uint64_t fh, chunk nchunks, chunk fstart, 
> + chunk mstart); 
> + // Unmap @nchunks chunks, starting at chunk @mstart of the window. 
> Returns 
> + // non-zero on failure. Called with _lock held (for writing). 
> + int unmap_ll(chunk nchunks, chunk mstart); 
> + 
> + // Return in @found the largest contiguous existing mapping for @nodeid 
> + // starting at @fstart. If none found, returns false. Called with _lock 
> held 
> + // (for reading). 
> + bool find(uint64_t nodeid, chunk fstart, mapping_part& found) const; 
> + // Returns the first empty chunk in the window, or one-past-the-last if 
> the 
> + // window is full. Called with _lock held (for reading). 
> + chunk first_empty() const; 
> + 
> + virtio::fs& _drv; 
> + const virtio::fs::dax_window* const _window; 
> + const size_t _chunk_size; 
> + const chunk _window_chunks; 
> + // TODO OPT: Switch to rwlock 
> + mutex _lock; 
> + std::vector<mapping> _mappings; 
> +}; 
> + 
> +} 
> -- 
> 2.27.0 
>
>

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/osv-dev/5ccb8460-3019-4db9-a21a-fb16930ef894n%40googlegroups.com.

Reply via email to