This patch is a proposal of possible solution that could address ROFS part of issue #979. Even though it seems to work in general and deliver significant memory savings when mmaping ROFS files there are some gaps in the implementation as some failing unit tests indicate.
In essense we are trying to point to some prefetched page-size memory area in the ROFS cache segment of a file being mmapped. To achieve this we are adding new VNOP - get_page_addr - that is meant to return memory address in file cache that corresponds to passed in file offset. Unlike with ZFS we are completely bypassing pagecache (which might be incorrect) and directly setting page table entry to point to file cache memory address. Following 3 tests fail (./scripts/build fs=rofs check): 1) tst-hub fails every time TEST tst-hub.so OSv v0.51.0-15-ge54f98e6 eth0: 192.168.122.15 Assertion failed: sched::preemptable() (core/elf.cc: resolve_pltgot: 675) [backtrace] 0x000000000022965a <__assert_fail+26> 0x000000000034acc7 <elf::object::resolve_pltgot(unsigned int)+519> 0x000000000034ad26 <elf_resolve_pltgot+70> 0x000000000039569a <???+3757722> 0x00000000003f0fdf <???+4132831> 0x000010000040a448 <???+4236360> 0x000010000061840f <???+6390799> 0xffffa000017f3bdf <???+25115615> 0xffffa000017f3fdf <???+25116639> 0xffffa000028ce03f <???+42786879> 0xffffa000028ce05f <???+42786911> 0xffffa000028ce07f <???+42786943> 0xffffa000028ce09f <???+42786975> 0x000010000061840f <???+6390799> 2) tst-libc-locking fails only some times TEST tst-libc-locking.so OSv v0.51.0-15-ge54f98e6 eth0: 192.168.122.15 __sigsetjmp() stubbed Running 1 test case... *** No errors detected Assertion failed: !node_algorithms::inited(this->priv_value_traits().to_node_ptr(value)) (/usr/include/boost/intrusive/list.hpp: iterator_to: 1300) [backtrace] 0x000000000022965a <__assert_fail+26> 0x00000000003e5d43 <memory::page_range_allocator::remove(memory::page_range&)+179> 0x00000000003e0a86 <memory::page_range_allocator::free(memory::page_range*)+166> 0x00000000003e0beb <memory::page_pool::l2::free_batch(memory::page_pool::page_batch&)+91> 0x00000000003e0e38 <memory::page_pool::l2::unfill()+504> 0x00000000003e22f6 <memory::page_pool::l2::fill_thread()+358> 0x00000000003e614b <std::_Function_handler<void (), memory::page_pool::l2::l2()::{lambda()#1}>::_M_invoke(std::_Any_data const&)+11> 0x00000000003f4e76 <thread_main_c+38> 0x00000000003969e2 <???+3762658> Test tst-libc-locking.so FAILED 3) TEST tst-namespace.so OSv v0.51.0-15-ge54f98e6 eth0: 192.168.122.15 Aborted [backtrace] 0x000000000045f90b <osv::generate_signal(siginfo&, exception_frame*)+59> 0x000000000045f97a <osv::handle_mmap_fault(unsigned long, int, exception_frame*)+26> 0x0000000000336be9 <mmu::vm_fault(unsigned long, exception_frame*)+185> 0x0000000000396c0e <page_fault+142> 0x0000000000395a66 <???+3758694> 0x0000000000427379 <osv::application::application(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, .... 0x0000100000402fbf <???+4206527> 0x616e2d64616f6c78 <???+1634692216> Test tst-namespace.so FAILED There are actually more tests failing in similar way as tst-libc-locking which seems to indicate lack of necessary lock in some place. Signed-off-by: Waldemar Kozaczuk <jwkozac...@gmail.com> --- .../opensolaris/uts/common/fs/zfs/zfs_vnops.c | 1 + fs/nfs/nfs_vnops.cc | 3 +- fs/procfs/procfs_vnops.cc | 1 + fs/ramfs/ramfs_vnops.cc | 1 + fs/rofs/rofs.hh | 2 + fs/rofs/rofs_cache.cc | 60 ++++++++++++++++++- fs/rofs/rofs_vnops.cc | 26 +++++++- fs/vfs/vfs_fops.cc | 30 +++++++++- include/osv/vnode.h | 6 ++ 9 files changed, 124 insertions(+), 6 deletions(-) diff --git a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index af9bd1f3..218ca03c 100644 --- a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -5114,4 +5114,5 @@ struct vnops zfs_vnops = { zfs_fallocate, /* fallocate */ zfs_readlink, /* read link */ zfs_symlink, /* symbolic link */ + (vnop_get_page_addr_t) NULL, /* get page address */ }; diff --git a/fs/nfs/nfs_vnops.cc b/fs/nfs/nfs_vnops.cc index 08fbf539..0d95d2b1 100644 --- a/fs/nfs/nfs_vnops.cc +++ b/fs/nfs/nfs_vnops.cc @@ -614,8 +614,9 @@ struct vnops nfs_vnops = { nfs_op_inactive, /* inactive */ nfs_op_truncate, /* truncate */ nfs_op_link, /* link */ - (vnop_cache_t) nullptr, /* arc */ + (vnop_cache_t) nullptr, /* arc */ nfs_op_fallocate, /* fallocate */ nfs_op_readlink, /* read link */ nfs_op_symlink, /* symbolic link */ + (vnop_get_page_address_t) nullptr /* get memory page address from cache */ }; diff --git a/fs/procfs/procfs_vnops.cc b/fs/procfs/procfs_vnops.cc index 56ac31eb..deb87557 100644 --- a/fs/procfs/procfs_vnops.cc +++ b/fs/procfs/procfs_vnops.cc @@ -438,6 +438,7 @@ vnops procfs_vnops = { (vnop_fallocate_t) vop_nullop, // vop_fallocate (vnop_readlink_t) vop_nullop, // vop_readlink (vnop_symlink_t) vop_nullop, // vop_symlink + (vnop_get_page_addr_t) nullptr // vop_get_page_addr }; vfsops procfs_vfsops = { diff --git a/fs/ramfs/ramfs_vnops.cc b/fs/ramfs/ramfs_vnops.cc index cf1ae272..9d789648 100644 --- a/fs/ramfs/ramfs_vnops.cc +++ b/fs/ramfs/ramfs_vnops.cc @@ -635,5 +635,6 @@ struct vnops ramfs_vnops = { ramfs_fallocate, /* fallocate */ ramfs_readlink, /* read link */ ramfs_symlink, /* symbolic link */ + (vnop_get_page_addr_t) nullptr, /* get page address */ }; diff --git a/fs/rofs/rofs.hh b/fs/rofs/rofs.hh index 16f811b0..401cad8d 100644 --- a/fs/rofs/rofs.hh +++ b/fs/rofs/rofs.hh @@ -128,6 +128,8 @@ struct rofs_info { namespace rofs { int cache_read(struct rofs_inode *inode, struct device *device, struct rofs_super_block *sb, struct uio *uio); + int + cache_get_page_address(struct rofs_inode *inode, struct device *device, struct rofs_super_block *sb, off_t offset, void **addr); } int rofs_read_blocks(struct device *device, uint64_t starting_block, uint64_t blocks_count, void* buf); diff --git a/fs/rofs/rofs_cache.cc b/fs/rofs/rofs_cache.cc index cfb5287d..315c9948 100644 --- a/fs/rofs/rofs_cache.cc +++ b/fs/rofs/rofs_cache.cc @@ -12,6 +12,7 @@ #include <include/osv/uio.h> #include <osv/debug.h> #include <osv/sched.hh> +#include <sys/mman.h> /* * From cache perspective let us divide each file into sequence of contiguous 32K segments. @@ -56,7 +57,15 @@ public: this->starting_block = _starting_block; this->block_count = _block_count; this->data_ready = false; // Data has to be loaded from disk - this->data = malloc(_cache->sb->block_size * _block_count); + // Ideally it would be nice to protect this memory from writing to but unfortunately + // mmap/mprotect cannot be used as it results in assert violation + // in mmu.cc:virt_to_phys() -> possibly because drivers need to operate on non-mmaped memory (why?) + //this->data = malloc(_cache->sb->block_size * _block_count); + //this->data = mmap(NULL, cache->sb->block_size * _block_count, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED | MAP_POPULATE, -1, 0); + // Because the cache may be used to directly back mmap-access we need page-aligned block of memory + //TODO: Replace 4096 with system constant + this->data = aligned_alloc(4096, _cache->sb->block_size * _block_count); + assert(this->data > 0); #if defined(ROFS_DIAGNOSTICS_ENABLED) rofs_block_allocated += block_count; #endif @@ -70,6 +79,10 @@ public: return this->block_count * this->cache->sb->block_size; } + void* memory_address(off_t offset) { + return this->data + offset; + } + bool is_data_ready() { return this->data_ready; } @@ -100,6 +113,8 @@ public: if (error) { print("!!!!! Error reading from disk\n"); } + //Cannot mprotect something that was malloc-ed or mem-aligned + //mprotect( this->data, this->cache->sb->block_size * this->block_count, PROT_READ | PROT_EXEC); return error; } }; @@ -271,4 +286,47 @@ cache_read(struct rofs_inode *inode, struct device *device, struct rofs_super_bl return error; } +int +cache_get_page_address(struct rofs_inode *inode, struct device *device, struct rofs_super_block *sb, off_t offset, void **addr) +{ + // + // Find existing one or create new file cache + struct file_cache *cache = get_or_create_file_cache(inode, sb); + + struct uio _uio; + _uio.uio_offset = offset; + _uio.uio_resid = 4096; + // + // Prepare list of cache transactions (copy from memory + // or read from disk into cache memory and then copy into memory) + // --> THERE SHOULD BE ONLY one TRANSACTION + auto segment_transactions = plan_cache_transactions(cache, &_uio); + print("[rofs] [%d] rofs_get_page_address called for i-node [%d] at %d with %d ops\n", + sched::thread::current()->id(), inode->inode_no, offset, segment_transactions.size()); + + int error = 0; + + auto it = segment_transactions.begin(); + assert(it != segment_transactions.end()); // There should be at least ONE transaction + auto transaction = *it; +#if defined(ROFS_DIAGNOSTICS_ENABLED) + rofs_cache_reads += 1; +#endif + if (transaction.transaction_type == CacheTransactionType::READ_FROM_DISK) { + // Read from disk into segment missing in cache or empty segment that was in cache but had not data because + // of failure to read + error = transaction.segment->read_from_disk(device); +#if defined(ROFS_DIAGNOSTICS_ENABLED) + rofs_cache_misses += 1; +#endif + } + + if( !error) + *addr = transaction.segment->memory_address(transaction.segment_offset); + else + *addr = nullptr; + + return error; +} + } diff --git a/fs/rofs/rofs_vnops.cc b/fs/rofs/rofs_vnops.cc index 99b2bfbd..57f7b6c5 100644 --- a/fs/rofs/rofs_vnops.cc +++ b/fs/rofs/rofs_vnops.cc @@ -272,6 +272,28 @@ static int rofs_getattr(struct vnode *vnode, struct vattr *attr) return 0; } +static int rofs_get_page_address(struct vnode *vnode, off_t offset, void **addr) { + struct rofs_info *rofs = (struct rofs_info *) vnode->v_mount->m_data; + struct rofs_super_block *sb = rofs->sb; + struct rofs_inode *inode = (struct rofs_inode *) vnode->v_data; + struct device *device = vnode->v_mount->m_dev; + + if (vnode->v_type == VDIR) + return EISDIR; + /* Cant read anything but reg */ + if (vnode->v_type != VREG) + return EINVAL; + /* Cant start reading before the first byte */ + if (offset < 0) + return EINVAL; + /* Cant read after the end of the file */ + if (offset >= (off_t)vnode->v_size) + return 0; + //TODO: Check if offset is page aligned + + return rofs::cache_get_page_address(inode,device,sb,offset,addr); +} + #define rofs_write ((vnop_write_t)vop_erofs) #define rofs_seek ((vnop_seek_t)vop_nullop) #define rofs_ioctl ((vnop_ioctl_t)vop_nullop) @@ -288,6 +310,7 @@ static int rofs_getattr(struct vnode *vnode, struct vattr *attr) #define rofs_fallocate ((vnop_fallocate_t)vop_erofs) #define rofs_fsync ((vnop_fsync_t)vop_nullop) #define rofs_symlink ((vnop_symlink_t)vop_erofs) +//#define rofs_get_page_addr ((vnop_get_page_addr_t)nullptr) struct vnops rofs_vnops = { rofs_open, /* open */ @@ -312,7 +335,8 @@ struct vnops rofs_vnops = { rofs_arc, /* arc */ //TODO: Implement to allow memory re-use when mapping files rofs_fallocate, /* fallocate - returns error when called*/ rofs_readlink, /* read link */ - rofs_symlink /* symbolic link - returns error when called*/ + rofs_symlink, /* symbolic link - returns error when called*/ + rofs_get_page_address /* return memory page address of file offset*/ }; extern "C" void rofs_disable_cache() { diff --git a/fs/vfs/vfs_fops.cc b/fs/vfs/vfs_fops.cc index 3a8f98b4..d6cb5dce 100644 --- a/fs/vfs/vfs_fops.cc +++ b/fs/vfs/vfs_fops.cc @@ -140,12 +140,34 @@ int vfs_file::chmod(mode_t mode) bool vfs_file::map_page(uintptr_t off, mmu::hw_ptep<0> ptep, mmu::pt_element<0> pte, bool write, bool shared) { - return pagecache::get(this, off, ptep, pte, write, shared); + auto fp = this; + struct vnode *vp = fp->f_dentry->d_vnode; + if (vp->v_op->vop_get_page_addr) { + void *page_address = nullptr; + vn_lock(vp); + //TODO: This requires better error handling than assert + assert(VOP_GET_PAGE_ADDR(vp, off, &page_address) == 0); + vn_unlock(vp); + assert(page_address != 0); + //debugf("vfs_file::map_page got page at %08x\n", page_address); + return mmu::write_pte(page_address, ptep, pte); + } + else + return pagecache::get(this, off, ptep, pte, write, shared); } bool vfs_file::put_page(void *addr, uintptr_t off, mmu::hw_ptep<0> ptep) { - return pagecache::release(this, addr, off, ptep); + auto fp = this; + struct vnode *vp = fp->f_dentry->d_vnode; + if (vp->v_op->vop_get_page_addr) { + //Not sure if this is enough + mmu::clear_pte(ptep); + return true; + } + else { + return pagecache::release(this, addr, off, ptep); + } } void vfs_file::sync(off_t start, off_t end) @@ -182,7 +204,9 @@ std::unique_ptr<mmu::file_vma> vfs_file::mmap(addr_range range, unsigned flags, { auto fp = this; struct vnode *vp = fp->f_dentry->d_vnode; - if (!vp->v_op->vop_cache || (vp->v_size < (off_t)mmu::page_size)) { + //TODO: In case of ROFS there should be some check to prevent caller from + //mmap-ing writable vma (only read/execute should be possible) + if ((!vp->v_op->vop_cache && !vp->v_op->vop_get_page_addr) || (vp->v_size < (off_t)mmu::page_size)) { return mmu::default_file_mmap(this, range, flags, perm, offset); } return mmu::map_file_mmap(this, range, flags, perm, offset); diff --git a/include/osv/vnode.h b/include/osv/vnode.h index e35aa830..d5ec2a64 100755 --- a/include/osv/vnode.h +++ b/include/osv/vnode.h @@ -149,6 +149,7 @@ typedef int (*vnop_cache_t) (struct vnode *, struct file *, struct uio *); typedef int (*vnop_fallocate_t) (struct vnode *, int, loff_t, loff_t); typedef int (*vnop_readlink_t) (struct vnode *, struct uio *); typedef int (*vnop_symlink_t) (struct vnode *, char *, char *); +typedef int (*vnop_get_page_addr_t) (struct vnode *, off_t, void **); /* * vnode operations @@ -177,6 +178,10 @@ struct vnops { vnop_fallocate_t vop_fallocate; vnop_readlink_t vop_readlink; vnop_symlink_t vop_symlink; + //Not sure if this is good name + vnop_get_page_addr_t vop_get_page_addr; + //Possibly we need another vnop to "release page" which will + //be necessary once we implement some kind of LRU cache in ROFS }; /* @@ -206,6 +211,7 @@ struct vnops { #define VOP_FALLOCATE(VP, M, OFF, LEN) ((VP)->v_op->vop_fallocate)(VP, M, OFF, LEN) #define VOP_READLINK(VP, U) ((VP)->v_op->vop_readlink)(VP, U) #define VOP_SYMLINK(DVP, OP, NP) ((DVP)->v_op->vop_symlink)(DVP, OP, NP) +#define VOP_GET_PAGE_ADDR(VP, OFF, ADDR) ((VP)->v_op->vop_get_page_addr)(VP, OFF, ADDR) int vop_nullop(void); int vop_einval(void); -- 2.17.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. For more options, visit https://groups.google.com/d/optout.