This patch is a proposal of possible solution that could address
ROFS part of issue #979. Even though it seems to work in general
and deliver significant memory savings when mmaping ROFS files
there are some gaps in the implementation as some failing unit tests indicate.

In essense we are trying to point to some prefetched page-size memory area 
in the ROFS cache segment of a file being mmapped. To achieve this
we are adding new VNOP - get_page_addr - that is meant to return
memory address in file cache that corresponds to passed in file offset.
Unlike with ZFS we are completely bypassing pagecache (which might
be incorrect) and directly setting page table entry to point
to file cache memory address.

Following 3 tests fail (./scripts/build fs=rofs check):
1) tst-hub fails every time
  TEST tst-hub.so                         OSv v0.51.0-15-ge54f98e6
eth0: 192.168.122.15
Assertion failed: sched::preemptable() (core/elf.cc: resolve_pltgot: 675)

[backtrace]
0x000000000022965a <__assert_fail+26>
0x000000000034acc7 <elf::object::resolve_pltgot(unsigned int)+519>
0x000000000034ad26 <elf_resolve_pltgot+70>
0x000000000039569a <???+3757722>
0x00000000003f0fdf <???+4132831>
0x000010000040a448 <???+4236360>
0x000010000061840f <???+6390799>
0xffffa000017f3bdf <???+25115615>
0xffffa000017f3fdf <???+25116639>
0xffffa000028ce03f <???+42786879>
0xffffa000028ce05f <???+42786911>
0xffffa000028ce07f <???+42786943>
0xffffa000028ce09f <???+42786975>
0x000010000061840f <???+6390799>

2) tst-libc-locking fails only some times
  TEST tst-libc-locking.so                OSv v0.51.0-15-ge54f98e6
eth0: 192.168.122.15
__sigsetjmp() stubbed
Running 1 test case...

*** No errors detected
Assertion failed: 
!node_algorithms::inited(this->priv_value_traits().to_node_ptr(value)) 
(/usr/include/boost/intrusive/list.hpp: iterator_to: 1300)

[backtrace]
0x000000000022965a <__assert_fail+26>
0x00000000003e5d43 
<memory::page_range_allocator::remove(memory::page_range&)+179>
0x00000000003e0a86 <memory::page_range_allocator::free(memory::page_range*)+166>
0x00000000003e0beb 
<memory::page_pool::l2::free_batch(memory::page_pool::page_batch&)+91>
0x00000000003e0e38 <memory::page_pool::l2::unfill()+504>
0x00000000003e22f6 <memory::page_pool::l2::fill_thread()+358>
0x00000000003e614b <std::_Function_handler<void (), 
memory::page_pool::l2::l2()::{lambda()#1}>::_M_invoke(std::_Any_data const&)+11>
0x00000000003f4e76 <thread_main_c+38>
0x00000000003969e2 <???+3762658>
Test tst-libc-locking.so FAILED

3) 
 TEST tst-namespace.so                   OSv v0.51.0-15-ge54f98e6
eth0: 192.168.122.15
Aborted

[backtrace]
0x000000000045f90b <osv::generate_signal(siginfo&, exception_frame*)+59>
0x000000000045f97a <osv::handle_mmap_fault(unsigned long, int, 
exception_frame*)+26>
0x0000000000336be9 <mmu::vm_fault(unsigned long, exception_frame*)+185>
0x0000000000396c0e <page_fault+142>
0x0000000000395a66 <???+3758694>
0x0000000000427379 
<osv::application::application(std::__cxx11::basic_string<char, 
std::char_traits<char>, std::allocator<char> > const&, ....
0x0000100000402fbf <???+4206527>
0x616e2d64616f6c78 <???+1634692216>
Test tst-namespace.so FAILED

There are actually more tests failing in similar way
as tst-libc-locking which seems to indicate
lack of necessary lock in some place.

Signed-off-by: Waldemar Kozaczuk <jwkozac...@gmail.com>
---
 .../opensolaris/uts/common/fs/zfs/zfs_vnops.c |  1 +
 fs/nfs/nfs_vnops.cc                           |  3 +-
 fs/procfs/procfs_vnops.cc                     |  1 +
 fs/ramfs/ramfs_vnops.cc                       |  1 +
 fs/rofs/rofs.hh                               |  2 +
 fs/rofs/rofs_cache.cc                         | 60 ++++++++++++++++++-
 fs/rofs/rofs_vnops.cc                         | 26 +++++++-
 fs/vfs/vfs_fops.cc                            | 30 +++++++++-
 include/osv/vnode.h                           |  6 ++
 9 files changed, 124 insertions(+), 6 deletions(-)

diff --git a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c 
b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index af9bd1f3..218ca03c 100644
--- a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -5114,4 +5114,5 @@ struct vnops zfs_vnops = {
        zfs_fallocate,                  /* fallocate */
        zfs_readlink,                   /* read link */
        zfs_symlink,                    /* symbolic link */
+        (vnop_get_page_addr_t) NULL, /* get page address */
 };
diff --git a/fs/nfs/nfs_vnops.cc b/fs/nfs/nfs_vnops.cc
index 08fbf539..0d95d2b1 100644
--- a/fs/nfs/nfs_vnops.cc
+++ b/fs/nfs/nfs_vnops.cc
@@ -614,8 +614,9 @@ struct vnops nfs_vnops = {
     nfs_op_inactive,         /* inactive */
     nfs_op_truncate,         /* truncate */
     nfs_op_link,             /* link */
-    (vnop_cache_t) nullptr, /* arc */
+    (vnop_cache_t) nullptr,  /* arc */
     nfs_op_fallocate,        /* fallocate */
     nfs_op_readlink,         /* read link */
     nfs_op_symlink,          /* symbolic link */
+    (vnop_get_page_address_t) nullptr /* get memory page address from cache */
 };
diff --git a/fs/procfs/procfs_vnops.cc b/fs/procfs/procfs_vnops.cc
index 56ac31eb..deb87557 100644
--- a/fs/procfs/procfs_vnops.cc
+++ b/fs/procfs/procfs_vnops.cc
@@ -438,6 +438,7 @@ vnops procfs_vnops = {
     (vnop_fallocate_t) vop_nullop, // vop_fallocate
     (vnop_readlink_t)  vop_nullop, // vop_readlink
     (vnop_symlink_t)   vop_nullop, // vop_symlink
+    (vnop_get_page_addr_t) nullptr // vop_get_page_addr
 };
 
 vfsops procfs_vfsops = {
diff --git a/fs/ramfs/ramfs_vnops.cc b/fs/ramfs/ramfs_vnops.cc
index cf1ae272..9d789648 100644
--- a/fs/ramfs/ramfs_vnops.cc
+++ b/fs/ramfs/ramfs_vnops.cc
@@ -635,5 +635,6 @@ struct vnops ramfs_vnops = {
         ramfs_fallocate,        /* fallocate */
         ramfs_readlink,         /* read link */
         ramfs_symlink,          /* symbolic link */
+        (vnop_get_page_addr_t) nullptr, /* get page address */
 };
 
diff --git a/fs/rofs/rofs.hh b/fs/rofs/rofs.hh
index 16f811b0..401cad8d 100644
--- a/fs/rofs/rofs.hh
+++ b/fs/rofs/rofs.hh
@@ -128,6 +128,8 @@ struct rofs_info {
 namespace rofs {
     int
     cache_read(struct rofs_inode *inode, struct device *device, struct 
rofs_super_block *sb, struct uio *uio);
+    int
+    cache_get_page_address(struct rofs_inode *inode, struct device *device, 
struct rofs_super_block *sb, off_t offset, void **addr);
 }
 
 int rofs_read_blocks(struct device *device, uint64_t starting_block, uint64_t 
blocks_count, void* buf);
diff --git a/fs/rofs/rofs_cache.cc b/fs/rofs/rofs_cache.cc
index cfb5287d..315c9948 100644
--- a/fs/rofs/rofs_cache.cc
+++ b/fs/rofs/rofs_cache.cc
@@ -12,6 +12,7 @@
 #include <include/osv/uio.h>
 #include <osv/debug.h>
 #include <osv/sched.hh>
+#include <sys/mman.h>
 
 /*
  * From cache perspective let us divide each file into sequence of contiguous 
32K segments.
@@ -56,7 +57,15 @@ public:
         this->starting_block = _starting_block;
         this->block_count = _block_count;
         this->data_ready = false;   // Data has to be loaded from disk
-        this->data = malloc(_cache->sb->block_size * _block_count);
+        // Ideally it would be nice to protect this memory from writing to but 
unfortunately
+        // mmap/mprotect cannot be used as it results in assert violation
+        // in mmu.cc:virt_to_phys() -> possibly because drivers need to 
operate on non-mmaped memory (why?)
+        //this->data = malloc(_cache->sb->block_size * _block_count);
+        //this->data = mmap(NULL, cache->sb->block_size * _block_count, 
PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED | MAP_POPULATE, -1, 0);
+        // Because the cache may be used to directly back mmap-access we need 
page-aligned block of memory
+        //TODO: Replace 4096 with system constant
+        this->data = aligned_alloc(4096, _cache->sb->block_size * 
_block_count);
+        assert(this->data > 0);
 #if defined(ROFS_DIAGNOSTICS_ENABLED)
         rofs_block_allocated += block_count;
 #endif
@@ -70,6 +79,10 @@ public:
         return this->block_count * this->cache->sb->block_size;
     }
 
+    void* memory_address(off_t offset) {
+        return this->data + offset;
+    }
+
     bool is_data_ready() {
         return this->data_ready;
     }
@@ -100,6 +113,8 @@ public:
         if (error) {
             print("!!!!! Error reading from disk\n");
         }
+        //Cannot mprotect something that was malloc-ed or mem-aligned
+        //mprotect( this->data, this->cache->sb->block_size * 
this->block_count, PROT_READ | PROT_EXEC);
         return error;
     }
 };
@@ -271,4 +286,47 @@ cache_read(struct rofs_inode *inode, struct device 
*device, struct rofs_super_bl
     return error;
 }
 
+int
+cache_get_page_address(struct rofs_inode *inode, struct device *device, struct 
rofs_super_block *sb, off_t offset, void **addr)
+{
+    //
+    // Find existing one or create new file cache
+    struct file_cache *cache = get_or_create_file_cache(inode, sb);
+
+    struct uio _uio;
+    _uio.uio_offset = offset;
+    _uio.uio_resid = 4096;
+    //
+    // Prepare list of cache transactions (copy from memory
+    // or read from disk into cache memory and then copy into memory)
+    // --> THERE SHOULD BE ONLY one TRANSACTION
+    auto segment_transactions = plan_cache_transactions(cache, &_uio);
+    print("[rofs] [%d] rofs_get_page_address called for i-node [%d] at %d with 
%d ops\n",
+          sched::thread::current()->id(), inode->inode_no, offset, 
segment_transactions.size());
+
+    int error = 0;
+
+    auto it = segment_transactions.begin();
+    assert(it != segment_transactions.end()); // There should be at least ONE 
transaction
+    auto transaction = *it;
+#if defined(ROFS_DIAGNOSTICS_ENABLED)
+    rofs_cache_reads += 1;
+#endif
+    if (transaction.transaction_type == CacheTransactionType::READ_FROM_DISK) {
+        // Read from disk into segment missing in cache or empty segment that 
was in cache but had not data because
+        // of failure to read
+        error = transaction.segment->read_from_disk(device);
+#if defined(ROFS_DIAGNOSTICS_ENABLED)
+        rofs_cache_misses += 1;
+#endif
+   }
+
+   if( !error)
+       *addr = transaction.segment->memory_address(transaction.segment_offset);
+   else
+       *addr = nullptr;
+
+   return error;
+}
+
 }
diff --git a/fs/rofs/rofs_vnops.cc b/fs/rofs/rofs_vnops.cc
index 99b2bfbd..57f7b6c5 100644
--- a/fs/rofs/rofs_vnops.cc
+++ b/fs/rofs/rofs_vnops.cc
@@ -272,6 +272,28 @@ static int rofs_getattr(struct vnode *vnode, struct vattr 
*attr)
     return 0;
 }
 
+static int rofs_get_page_address(struct vnode *vnode, off_t offset, void 
**addr) {
+    struct rofs_info *rofs = (struct rofs_info *) vnode->v_mount->m_data;
+    struct rofs_super_block *sb = rofs->sb;
+    struct rofs_inode *inode = (struct rofs_inode *) vnode->v_data;
+    struct device *device = vnode->v_mount->m_dev;
+
+    if (vnode->v_type == VDIR)
+        return EISDIR;
+    /* Cant read anything but reg */
+    if (vnode->v_type != VREG)
+        return EINVAL;
+    /* Cant start reading before the first byte */
+    if (offset < 0)
+        return EINVAL;
+    /* Cant read after the end of the file */
+    if (offset >= (off_t)vnode->v_size)
+        return 0;
+    //TODO: Check if offset is page aligned
+
+    return rofs::cache_get_page_address(inode,device,sb,offset,addr);
+}
+
 #define rofs_write       ((vnop_write_t)vop_erofs)
 #define rofs_seek        ((vnop_seek_t)vop_nullop)
 #define rofs_ioctl       ((vnop_ioctl_t)vop_nullop)
@@ -288,6 +310,7 @@ static int rofs_getattr(struct vnode *vnode, struct vattr 
*attr)
 #define rofs_fallocate   ((vnop_fallocate_t)vop_erofs)
 #define rofs_fsync       ((vnop_fsync_t)vop_nullop)
 #define rofs_symlink     ((vnop_symlink_t)vop_erofs)
+//#define rofs_get_page_addr ((vnop_get_page_addr_t)nullptr)
 
 struct vnops rofs_vnops = {
     rofs_open,               /* open */
@@ -312,7 +335,8 @@ struct vnops rofs_vnops = {
     rofs_arc,                /* arc */ //TODO: Implement to allow memory 
re-use when mapping files
     rofs_fallocate,          /* fallocate - returns error when called*/
     rofs_readlink,           /* read link */
-    rofs_symlink             /* symbolic link - returns error when called*/
+    rofs_symlink,            /* symbolic link - returns error when called*/
+    rofs_get_page_address    /* return memory page address of file offset*/
 };
 
 extern "C" void rofs_disable_cache() {
diff --git a/fs/vfs/vfs_fops.cc b/fs/vfs/vfs_fops.cc
index 3a8f98b4..d6cb5dce 100644
--- a/fs/vfs/vfs_fops.cc
+++ b/fs/vfs/vfs_fops.cc
@@ -140,12 +140,34 @@ int vfs_file::chmod(mode_t mode)
 
 bool vfs_file::map_page(uintptr_t off, mmu::hw_ptep<0> ptep, 
mmu::pt_element<0> pte, bool write, bool shared)
 {
-    return pagecache::get(this, off, ptep, pte, write, shared);
+    auto fp = this;
+    struct vnode *vp = fp->f_dentry->d_vnode;
+    if (vp->v_op->vop_get_page_addr) {
+        void *page_address = nullptr;
+        vn_lock(vp);
+        //TODO: This requires better error handling than assert
+        assert(VOP_GET_PAGE_ADDR(vp, off, &page_address) == 0);
+        vn_unlock(vp);
+        assert(page_address != 0);
+        //debugf("vfs_file::map_page got page at %08x\n", page_address);
+        return mmu::write_pte(page_address, ptep, pte);
+    }
+    else
+        return pagecache::get(this, off, ptep, pte, write, shared);
 }
 
 bool vfs_file::put_page(void *addr, uintptr_t off, mmu::hw_ptep<0> ptep)
 {
-    return pagecache::release(this, addr, off, ptep);
+    auto fp = this;
+    struct vnode *vp = fp->f_dentry->d_vnode;
+    if (vp->v_op->vop_get_page_addr) {
+        //Not sure if this is enough
+        mmu::clear_pte(ptep);
+        return true;
+    }
+    else {
+        return pagecache::release(this, addr, off, ptep);
+    }
 }
 
 void vfs_file::sync(off_t start, off_t end)
@@ -182,7 +204,9 @@ std::unique_ptr<mmu::file_vma> vfs_file::mmap(addr_range 
range, unsigned flags,
 {
        auto fp = this;
        struct vnode *vp = fp->f_dentry->d_vnode;
-       if (!vp->v_op->vop_cache || (vp->v_size < (off_t)mmu::page_size)) {
+       //TODO: In case of ROFS there should be some check to prevent caller 
from
+    //mmap-ing writable vma (only read/execute should be possible)
+       if ((!vp->v_op->vop_cache && !vp->v_op->vop_get_page_addr) || 
(vp->v_size < (off_t)mmu::page_size)) {
                return mmu::default_file_mmap(this, range, flags, perm, offset);
        }
        return mmu::map_file_mmap(this, range, flags, perm, offset);
diff --git a/include/osv/vnode.h b/include/osv/vnode.h
index e35aa830..d5ec2a64 100755
--- a/include/osv/vnode.h
+++ b/include/osv/vnode.h
@@ -149,6 +149,7 @@ typedef int (*vnop_cache_t) (struct vnode *, struct file *, 
struct uio *);
 typedef int (*vnop_fallocate_t) (struct vnode *, int, loff_t, loff_t);
 typedef int (*vnop_readlink_t)  (struct vnode *, struct uio *);
 typedef int (*vnop_symlink_t)   (struct vnode *, char *, char *);
+typedef int (*vnop_get_page_addr_t)   (struct vnode *, off_t, void **);
 
 /*
  * vnode operations
@@ -177,6 +178,10 @@ struct vnops {
        vnop_fallocate_t        vop_fallocate;
        vnop_readlink_t         vop_readlink;
        vnop_symlink_t          vop_symlink;
+       //Not sure if this is good name
+       vnop_get_page_addr_t    vop_get_page_addr;
+       //Possibly we need another vnop to "release page" which will
+       //be necessary once we implement some kind of LRU cache in ROFS
 };
 
 /*
@@ -206,6 +211,7 @@ struct vnops {
 #define VOP_FALLOCATE(VP, M, OFF, LEN) ((VP)->v_op->vop_fallocate)(VP, M, OFF, 
LEN)
 #define VOP_READLINK(VP, U)        ((VP)->v_op->vop_readlink)(VP, U)
 #define VOP_SYMLINK(DVP, OP, NP)   ((DVP)->v_op->vop_symlink)(DVP, OP, NP)
+#define VOP_GET_PAGE_ADDR(VP, OFF, ADDR) ((VP)->v_op->vop_get_page_addr)(VP, 
OFF, ADDR)
 
 int     vop_nullop(void);
 int     vop_einval(void);
-- 
2.17.1

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to osv-dev+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to