Re: [Virtio-fs] [PATCH] virtiofsd: remove symlink fallbacks

2020-05-14 Thread Liu Bo
On Thu, May 14, 2020 at 04:07:36PM +0200, Miklos Szeredi wrote:
> Path lookup in the kernel has special rules for looking up magic symlinks
> under /proc.  If a filesystem operation is instructed to follow symlinks
> (e.g. via AT_SYMLINK_FOLLOW or lack of AT_SYMLINK_NOFOLLOW), and the final
> component is such a proc symlink, then the target of the magic symlink is
> used for the operation, even if the target itself is a symlink.  I.e. path
> lookup is always terminated after following a final magic symlink.
>

Hi Miklos,

Are these special rules supported only by recent kernel versions,
or have they been there since day one of Linux?

thanks,
liubo

> I was erroneously assuming that in the above case the target symlink would
> also be followed, and so workarounds were added for a couple of operations
> to handle the symlink case.  Since the symlink can be handled simply by
> following the proc symlink, these workarounds are not needed.
> 
> Also remove the "norace" option, which disabled the workarounds.
> 
> Commit bdfd66788349 ("virtiofsd: Fix xattr operations") already dealt with
> the same issue for xattr operations.
> 
> Signed-off-by: Miklos Szeredi 
> ---
>  tools/virtiofsd/passthrough_ll.c | 175 ++-
>  1 file changed, 6 insertions(+), 169 deletions(-)
> 
> diff --git a/tools/virtiofsd/passthrough_ll.c 
> b/tools/virtiofsd/passthrough_ll.c
> index 3ba1d9098460..2ce7c96085bf 100644
> --- a/tools/virtiofsd/passthrough_ll.c
> +++ b/tools/virtiofsd/passthrough_ll.c
> @@ -140,7 +140,6 @@ enum {
>  struct lo_data {
>  pthread_mutex_t mutex;
>  int debug;
> -int norace;
>  int writeback;
>  int flock;
>  int posix_lock;
> @@ -176,7 +175,6 @@ static const struct fuse_opt lo_opts[] = {
>  { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE },
>  { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO },
>  { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS },
> -{ "norace", offsetof(struct lo_data, norace), 1 },
>  { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 },
>  { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 },
>  FUSE_OPT_END
> @@ -592,136 +590,6 @@ static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
>  fuse_reply_attr(req, , lo->timeout);
>  }
>  
> -/*
> - * Increments parent->nlookup and caller must release refcount using
> - * lo_inode_put().
> - */
> -static int lo_parent_and_name(struct lo_data *lo, struct lo_inode *inode,
> -  char path[PATH_MAX], struct lo_inode **parent)
> -{
> -char procname[64];
> -char *last;
> -struct stat stat;
> -struct lo_inode *p;
> -int retries = 2;
> -int res;
> -
> -retry:
> -sprintf(procname, "%i", inode->fd);
> -
> -res = readlinkat(lo->proc_self_fd, procname, path, PATH_MAX);
> -if (res < 0) {
> -fuse_log(FUSE_LOG_WARNING, "%s: readlink failed: %m\n", __func__);
> -goto fail_noretry;
> -}
> -
> -if (res >= PATH_MAX) {
> -fuse_log(FUSE_LOG_WARNING, "%s: readlink overflowed\n", __func__);
> -goto fail_noretry;
> -}
> -path[res] = '\0';
> -
> -last = strrchr(path, '/');
> -if (last == NULL) {
> -/* Shouldn't happen */
> -fuse_log(
> -FUSE_LOG_WARNING,
> -"%s: INTERNAL ERROR: bad path read from proc\n", __func__);
> -goto fail_noretry;
> -}
> -if (last == path) {
> -p = >root;
> -pthread_mutex_lock(>mutex);
> -p->nlookup++;
> -g_atomic_int_inc(>refcount);
> -pthread_mutex_unlock(>mutex);
> -} else {
> -*last = '\0';
> -res = fstatat(AT_FDCWD, last == path ? "/" : path, , 0);
> -if (res == -1) {
> -if (!retries) {
> -fuse_log(FUSE_LOG_WARNING,
> - "%s: failed to stat parent: %m\n", __func__);
> -}
> -goto fail;
> -}
> -p = lo_find(lo, );
> -if (p == NULL) {
> -if (!retries) {
> -fuse_log(FUSE_LOG_WARNING,
> - "%s: failed to find parent\n", __func__);
> -}
> -goto fail;
> -}
> -}
> -last++;
> -res = fstatat(p->fd, last, , AT_SYMLINK_NOFOLLOW);
> -if (res == -1) {
> -if (!retries) {
> -fuse_log(FUSE_LOG_WARNING,
> - "%s: failed to stat last\n", __func__);
> -}
> -goto fail_unref;
> -}
> -if (stat.st_dev != inode->key.dev || stat.st_ino != inode->key.ino) {
> -if (!retries) {
> -fuse_log(FUSE_LOG_WARNING,
> - "%s: failed to match last\n", __func__);
> -}
> -goto fail_unref;
> -}
> -*parent = p;
> -memmove(path, last, strlen(last) + 1);
> -
> -return 0;
> -
> -fail_unref:
> -unref_inode_lolocked(lo, p, 1);
> -lo_inode_put(lo, );
> -fail:
> -if 

Re: [Qemu-devel] [Virtio-fs] [PATCH] virtiofsd: add man page

2019-08-29 Thread Liu Bo
On Thu, Aug 29, 2019 at 11:41:33AM +0100, Stefan Hajnoczi wrote:

Looks good to me, thanks for working this out.

Reviewed-by: Liu Bo 

thanks,
-liubo
> Signed-off-by: Stefan Hajnoczi 
> ---
>  Makefile |  7 +++
>  contrib/virtiofsd/virtiofsd.texi | 85 
>  2 files changed, 92 insertions(+)
>  create mode 100644 contrib/virtiofsd/virtiofsd.texi
> 
> diff --git a/Makefile b/Makefile
> index a3dfdd6fa8..cc18025753 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -334,6 +334,9 @@ DOCS+=docs/qemu-cpu-models.7
>  ifdef CONFIG_VIRTFS
>  DOCS+=fsdev/virtfs-proxy-helper.1
>  endif
> +ifdef CONFIG_LINUX
> +DOCS+=contrib/virtiofsd/virtiofsd.1
> +endif
>  ifdef CONFIG_TRACE_SYSTEMTAP
>  DOCS+=scripts/qemu-trace-stap.1
>  endif
> @@ -834,6 +837,9 @@ ifdef CONFIG_VIRTFS
>   $(INSTALL_DIR) "$(DESTDIR)$(mandir)/man1"
>   $(INSTALL_DATA) fsdev/virtfs-proxy-helper.1 "$(DESTDIR)$(mandir)/man1"
>  endif
> +ifdef CONFIG_LINUX
> + $(INSTALL_DATA) contrib/virtiofsd.1 "$(DESTDIR)$(mandir)/man1"
> +endif
>  
>  install-datadir:
>   $(INSTALL_DIR) "$(DESTDIR)$(qemu_datadir)"
> @@ -1018,6 +1024,7 @@ qemu.1: qemu-doc.texi qemu-options.texi 
> qemu-monitor.texi qemu-monitor-info.texi
>  qemu.1: qemu-option-trace.texi
>  qemu-img.1: qemu-img.texi qemu-option-trace.texi qemu-img-cmds.texi
>  fsdev/virtfs-proxy-helper.1: fsdev/virtfs-proxy-helper.texi
> +contrib/virtiofsd/virtiofsd.1: contrib/virtiofsd/virtiofsd.texi
>  qemu-nbd.8: qemu-nbd.texi qemu-option-trace.texi
>  qemu-ga.8: qemu-ga.texi
>  docs/qemu-block-drivers.7: docs/qemu-block-drivers.texi
> diff --git a/contrib/virtiofsd/virtiofsd.texi 
> b/contrib/virtiofsd/virtiofsd.texi
> new file mode 100644
> index 00..eec7fbf4e6
> --- /dev/null
> +++ b/contrib/virtiofsd/virtiofsd.texi
> @@ -0,0 +1,85 @@
> +@example
> +@c man begin SYNOPSIS
> +@command{virtiofsd} [OPTION] 
> @option{--socket-path=}@var{path}|@option{--fd=}@var{fdnum} @option{-o 
> source=}@var{path}
> +@c man end
> +@end example
> +
> +@c man begin DESCRIPTION
> +
> +Share a host directory tree with a guest through a virtio-fs device.  This
> +program is a vhost-user backend that implements the virtio-fs device.  Each
> +virtio-fs device instance requires its own virtiofsd process.
> +
> +This program is designed to work with QEMU's @code{--device 
> vhost-user-fs-pci}
> +but should work with any virtual machine monitor (VMM) that supports
> +vhost-user.  See the EXAMPLES section below.
> +
> +This program must be run as the root user.  Upon startup the program will
> +switch into a new file system namespace with the shared directory tree as its
> +root.  This prevents "file system escapes" due to symlinks and other file
> +system objects that might lead to files outside the shared directory.  The
> +program also sandboxes itself using seccomp(2) to prevent ptrace(2) and other
> +vectors that could allow an attacker to compromise the system after gaining
> +control of the virtiofsd process.
> +
> +@c man end
> +
> +@c man begin OPTIONS
> +@table @option
> +@item -h, --help
> +Print help.
> +@item -V, --version
> +Print version.
> +@item -d, -o debug
> +Enable debug output.
> +@item --syslog
> +Print log messages to syslog instead of stderr.
> +@item -o log_level=@var{level}
> +Print only log messages matching @var{level} or more severe.  @var{level} is
> +one of @code{err}, @code{warn}, @code{info}, or @code{debug}.  The default is
> +@var{info}.
> +@item -o source=@var{path}
> +Share host directory tree located at @var{path}.  This option is required.
> +@item --socket-path=@var{path}, -o vhost_user_socket=@var{path}
> +Listen on vhost-user UNIX domain socket at @var{path}.
> +@item --fd=@var{fdnum}
> +Accept connections from vhost-user UNIX domain socket file descriptor 
> @var{fdnum}.  The file descriptor must already be listening for connections.
> +@item --thread-pool-size=@var{num}
> +Restrict the number of worker threads per request queue to @var{num}.  The 
> default is 64.
> +@item --cache=@code{none}|@code{auto}|@code{always}
> +Select the desired trade-off between coherency and performance.  @code{none}
> +forbids the FUSE client from caching to achieve best coherency at the cost of
> +performance.  @code{auto} acts similar to NFS with a 1 second metadata cache
> +timeout.  @code{always} sets a long cache lifetime at the expense of 
> coherency.
> +@item --writeback
> +Enable writeback cache, allowing the FUSE client to buffer and merge write 
> requests.
> +@end table
> +@c man end
> +
> +@c man begin EXAMPLES
> +Export @code{/var/lib/fs/vm001/}

Re: [Qemu-devel] [Virtio-fs] [PATCH 0/4] virtiofsd: multithreading preparation part 3

2019-08-10 Thread Liu Bo
On Thu, Aug 08, 2019 at 08:53:20AM -0400, Vivek Goyal wrote:
> On Thu, Aug 08, 2019 at 10:53:16AM +0100, Dr. David Alan Gilbert wrote:
> > * Stefan Hajnoczi (stefa...@redhat.com) wrote:
> > > On Wed, Aug 07, 2019 at 04:57:15PM -0400, Vivek Goyal wrote:
> > > > Kernel also serializes MAP/UNMAP on one inode. So you will need to run
> > > > multiple jobs operating on different inodes to see parallel MAP/UNMAP
> > > > (at least from the kernel's point of view).
> > > 
> > > Okay, there is still room to experiment with how MAP and UNMAP are
> > > handled by virtiofsd and QEMU even if the host kernel ultimately becomes
> > > the bottleneck.
> > > 
> > > One possible optimization is to eliminate REMOVEMAPPING requests when
> > > the guest driver knows a SETUPMAPPING will follow immediately.  I see
> > > the following request pattern in a fio randread iodepth=64 job:
> > > 
> > >   unique: 995348, opcode: SETUPMAPPING (48), nodeid: 135, insize: 80, 
> > > pid: 1351
> > >   lo_setupmapping(ino=135, fi=0x(nil), foffset=3860856832, len=2097152, 
> > > moffset=859832320, flags=0)
> > >  unique: 995348, success, outsize: 16
> > >   unique: 995350, opcode: REMOVEMAPPING (49), nodeid: 135, insize: 60, 
> > > pid: 12
> > >  unique: 995350, success, outsize: 16
> > >   unique: 995352, opcode: SETUPMAPPING (48), nodeid: 135, insize: 80, 
> > > pid: 1351
> > >   lo_setupmapping(ino=135, fi=0x(nil), foffset=16777216, len=2097152, 
> > > moffset=861929472, flags=0)
> > >  unique: 995352, success, outsize: 16
> > >   unique: 995354, opcode: REMOVEMAPPING (49), nodeid: 135, insize: 60, 
> > > pid: 12
> > >  unique: 995354, success, outsize: 16
> > >   virtio_send_msg: elem 9: with 1 in desc of length 16
> > >   unique: 995356, opcode: SETUPMAPPING (48), nodeid: 135, insize: 80, 
> > > pid: 1351
> > >   lo_setupmapping(ino=135, fi=0x(nil), foffset=383778816, len=2097152, 
> > > moffset=864026624, flags=0)
> > >  unique: 995356, success, outsize: 16
> > >   unique: 995358, opcode: REMOVEMAPPING (49), nodeid: 135, insize: 60, 
> > > pid: 12
> > > 
> > > The REMOVEMAPPING requests are unnecessary since we can map over the top
> > > of the old mapping instead of taking the extra step of removing it
> > > first.
> > 
> > Yep, those should go - I think Vivek likes to keep them for testing
> > since they make things fail more completely if there's a screwup.
> 
> I like to keep them because otherwise they keep the resources busy
> on host. If DAX range is being used immediately, then this optimization
> makes more sense. I will keep this in mind.
>

Other than the resource not being released, do you think there'll be
any stale data problem if we don't do removemapping at all, neither
background reclaim nor inline reclaim?
(truncate/punch_hole/evict_inode still needs to remove mapping though)

thanks,
-liubo

> > 
> > > Some more questions to consider for DAX performance optimization:
> > > 
> > > 1. Is FUSE_READ/FUSE_WRITE more efficient than DAX for some I/O patterns?
> > 
> > Probably for cases where the data is only accessed once, and you can't
> > preemptively map.
> > Another variant on (1) is whether we could do read/writes while the mmap
> > is happening to absorb the latency.
> 
> For small random I/O, dax might not be very effective. Overhead of
> setting up mapping and tearing it down is significant.
> 
> Vivek
> 
> ___
> Virtio-fs mailing list
> virtio...@redhat.com
> https://www.redhat.com/mailman/listinfo/virtio-fs



Re: [Qemu-devel] [Virtio-fs] [PATCH 0/4] virtiofsd: multithreading preparation part 3

2019-08-10 Thread Liu Bo
On Fri, Aug 09, 2019 at 09:21:02AM +0100, Stefan Hajnoczi wrote:
> On Thu, Aug 08, 2019 at 10:53:16AM +0100, Dr. David Alan Gilbert wrote:
> > * Stefan Hajnoczi (stefa...@redhat.com) wrote:
> > > On Wed, Aug 07, 2019 at 04:57:15PM -0400, Vivek Goyal wrote:
> > > 2. Can MAP/UNMAP be performed directly in QEMU via a separate virtqueue?
> > 
> > I think there's two things to solve here that I don't currently know the
> > answer to:
> >   2a) We'd need to get the fd to qemu for the thing to mmap;
> >   we might be able to cache the fd on the qemu side for existing
> >   mappings, so when asking for a new mapping for an existing file then
> >   it would already have the fd.
> > 
> >   2b) Running a device with a mix of queues inside QEMU and on
> >   vhost-user; I don't think we have anything with that mix
> 
> vhost-user-net works in the same way.  The ctrl queue is handled by QEMU
> and the rx/tx queues by the vhost device.  This is in fact how vhost was
> initially designed: the vhost device is not a full virtio device, only
> the dataplane.

> > > 3. Can READ/WRITE be performed directly in QEMU via a separate virtqueue
> > >to eliminate the bad address problem?
> > 
> > Are you thinking of doing all read/writes that way, or just the corner
> > cases? It doesn't seem worth it for the corner cases unless you're
> > finding them cropping up in real work loads.
> 
> Send all READ/WRITE requests to QEMU instead of virtiofsd.
> 
> Only handle metadata requests in virtiofsd (OPEN, RELEASE, READDIR,
> MKDIR, etc).

For now qemu is not aware of virtio-fs's fd info, but I think it's
doable, I like the idea.

thanks,
-liubo
> 
> > > I'm not going to tackle DAX optimization myself right now but wanted to
> > > share these ideas.
> > 
> > One I was thinking about that feels easier than (2) was to change the
> > vhost slave protocol to be split transaction; it wouldn't do anything
> > for the latency but it would be able to do some in parallel if we can
> > get the kernel to feed it.
> 
> There are two cases:
> 1. mmapping multiple inode.  This should benefit from parallelism,
>although mmap is still expensive because it involves TLB shootdown
>for all other threads running this process.
> 2. mmapping the same inode.  Here the host kernel is likely to serialize
>mmaps even more, making it hard to gain performance.
> 
> It's probably worth writing a tiny benchmark first to evaluate the
> potential gains.
> 
> Stefan



> ___
> Virtio-fs mailing list
> virtio...@redhat.com
> https://www.redhat.com/mailman/listinfo/virtio-fs



Re: [Qemu-devel] [Virtio-fs] [PATCH 1/5] virtiofsd: skip unnecessary vu_queue_get_avail_bytes()

2019-07-26 Thread Liu Bo
On Fri, Jul 26, 2019 at 10:10:59AM +0100, Stefan Hajnoczi wrote:
> When debug output is disabled there is no need to calculate the number
> of in/out bytes available.
> 
> There is also no need to skip a request if there are 0 out bytes.  The
> request parsing code already handles invalid requests.
> 
> Signed-off-by: Stefan Hajnoczi 
> ---
>  contrib/virtiofsd/fuse_virtio.c | 13 +
>  1 file changed, 5 insertions(+), 8 deletions(-)
> 
> diff --git a/contrib/virtiofsd/fuse_virtio.c b/contrib/virtiofsd/fuse_virtio.c
> index 083e4fc317..d543c6d30f 100644
> --- a/contrib/virtiofsd/fuse_virtio.c
> +++ b/contrib/virtiofsd/fuse_virtio.c
> @@ -507,18 +507,16 @@ static void *fv_queue_thread(void *opaque)
> ret = 
> pthread_rwlock_rdlock(>virtio_dev->vu_dispatch_rwlock);
> assert(ret == 0); /* there is no possible error case */
>  
> -   /* out is from guest, in is too guest */
> -   unsigned int in_bytes, out_bytes;
> -   vu_queue_get_avail_bytes(dev, q, _bytes, _bytes, ~0, 
> ~0);
> +   if (se->debug) {
> +   /* out is from guest, in is too guest */
> +   unsigned int in_bytes, out_bytes;
> +   vu_queue_get_avail_bytes(dev, q, _bytes, 
> _bytes, ~0, ~0);
>  
> -   if (se->debug)
> fuse_debug("%s: Queue %d gave evalue: %zx available: 
> in: %u out: %u\n",
> __func__, qi->qidx, (size_t)evalue, in_bytes,
> out_bytes);
> -
> -   if (!out_bytes) {
> -   goto next;
> }
> +
> while (1) {
> bool allocated_bufv = false;
> struct fuse_bufvec bufv;
> @@ -708,7 +706,6 @@ static void *fv_queue_thread(void *opaque)
> elem = NULL;
>  }
>  
> -next:
>  pthread_rwlock_unlock(>virtio_dev->vu_dispatch_rwlock);
>  }
>  pthread_mutex_destroy();
> -- 
> 2.21.0
> 
> ___
> Virtio-fs mailing list
> virtio...@redhat.com
> https://www.redhat.com/mailman/listinfo/virtio-fs

Reviewed-by: Liu Bo 

thanks,
-liubo



Re: [Qemu-devel] [Virtio-fs] [PATCH 2/5] virtiofsd: prevent lo_lookup() NULL pointer dereference

2019-07-26 Thread Liu Bo
On Fri, Jul 26, 2019 at 10:11:00AM +0100, Stefan Hajnoczi wrote:
> Most lo_do_lookup() have already checked that the parent inode exists.
> lo_lookup() hasn't and can therefore hit a NULL pointer dereference when
> lo_inode(req, parent) returns NULL.
>

Sigh... this one has been fixed by 3 different developers: me, Peng Tao, and
Stefan.

The following one on the ML did the exactly same thing.
---
Subject: [Virtio-fs] [PATCH] virtiofsd: fix lo_do_lookup panic

It needs to check for invalid parent dir.

Signed-off-by: Peng Tao 
---

thanks,
-liubo

> Signed-off-by: Stefan Hajnoczi 
> ---
>  contrib/virtiofsd/passthrough_ll.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/contrib/virtiofsd/passthrough_ll.c 
> b/contrib/virtiofsd/passthrough_ll.c
> index 9ae1381618..277a17fc03 100644
> --- a/contrib/virtiofsd/passthrough_ll.c
> +++ b/contrib/virtiofsd/passthrough_ll.c
> @@ -766,6 +766,10 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t 
> parent, const char *name,
>   struct lo_data *lo = lo_data(req);
>   struct lo_inode *inode, *dir = lo_inode(req, parent);
>  
> + if (!dir) {
> + return EBADF;
> + }
> +
>   memset(e, 0, sizeof(*e));
>   e->attr_timeout = lo->timeout;
>   e->entry_timeout = lo->timeout;
> -- 
> 2.21.0
> 
> ___
> Virtio-fs mailing list
> virtio...@redhat.com
> https://www.redhat.com/mailman/listinfo/virtio-fs