Re: [Virtio-fs] [PATCH] virtiofsd: remove symlink fallbacks
On Thu, May 14, 2020 at 04:07:36PM +0200, Miklos Szeredi wrote: > Path lookup in the kernel has special rules for looking up magic symlinks > under /proc. If a filesystem operation is instructed to follow symlinks > (e.g. via AT_SYMLINK_FOLLOW or lack of AT_SYMLINK_NOFOLLOW), and the final > component is such a proc symlink, then the target of the magic symlink is > used for the operation, even if the target itself is a symlink. I.e. path > lookup is always terminated after following a final magic symlink. > Hi Miklos, Are these special rules supported only by recent kernel versions, or have they been there since day one of Linux? thanks, liubo > I was erroneously assuming that in the above case the target symlink would > also be followed, and so workarounds were added for a couple of operations > to handle the symlink case. Since the symlink can be handled simply by > following the proc symlink, these workarounds are not needed. > > Also remove the "norace" option, which disabled the workarounds. > > Commit bdfd66788349 ("virtiofsd: Fix xattr operations") already dealt with > the same issue for xattr operations. 
> > Signed-off-by: Miklos Szeredi > --- > tools/virtiofsd/passthrough_ll.c | 175 ++- > 1 file changed, 6 insertions(+), 169 deletions(-) > > diff --git a/tools/virtiofsd/passthrough_ll.c > b/tools/virtiofsd/passthrough_ll.c > index 3ba1d9098460..2ce7c96085bf 100644 > --- a/tools/virtiofsd/passthrough_ll.c > +++ b/tools/virtiofsd/passthrough_ll.c > @@ -140,7 +140,6 @@ enum { > struct lo_data { > pthread_mutex_t mutex; > int debug; > -int norace; > int writeback; > int flock; > int posix_lock; > @@ -176,7 +175,6 @@ static const struct fuse_opt lo_opts[] = { > { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE }, > { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO }, > { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS }, > -{ "norace", offsetof(struct lo_data, norace), 1 }, > { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 }, > { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 }, > FUSE_OPT_END > @@ -592,136 +590,6 @@ static void lo_getattr(fuse_req_t req, fuse_ino_t ino, > fuse_reply_attr(req, , lo->timeout); > } > > -/* > - * Increments parent->nlookup and caller must release refcount using > - * lo_inode_put(). 
> - */ > -static int lo_parent_and_name(struct lo_data *lo, struct lo_inode *inode, > - char path[PATH_MAX], struct lo_inode **parent) > -{ > -char procname[64]; > -char *last; > -struct stat stat; > -struct lo_inode *p; > -int retries = 2; > -int res; > - > -retry: > -sprintf(procname, "%i", inode->fd); > - > -res = readlinkat(lo->proc_self_fd, procname, path, PATH_MAX); > -if (res < 0) { > -fuse_log(FUSE_LOG_WARNING, "%s: readlink failed: %m\n", __func__); > -goto fail_noretry; > -} > - > -if (res >= PATH_MAX) { > -fuse_log(FUSE_LOG_WARNING, "%s: readlink overflowed\n", __func__); > -goto fail_noretry; > -} > -path[res] = '\0'; > - > -last = strrchr(path, '/'); > -if (last == NULL) { > -/* Shouldn't happen */ > -fuse_log( > -FUSE_LOG_WARNING, > -"%s: INTERNAL ERROR: bad path read from proc\n", __func__); > -goto fail_noretry; > -} > -if (last == path) { > -p = >root; > -pthread_mutex_lock(>mutex); > -p->nlookup++; > -g_atomic_int_inc(>refcount); > -pthread_mutex_unlock(>mutex); > -} else { > -*last = '\0'; > -res = fstatat(AT_FDCWD, last == path ? "/" : path, , 0); > -if (res == -1) { > -if (!retries) { > -fuse_log(FUSE_LOG_WARNING, > - "%s: failed to stat parent: %m\n", __func__); > -} > -goto fail; > -} > -p = lo_find(lo, ); > -if (p == NULL) { > -if (!retries) { > -fuse_log(FUSE_LOG_WARNING, > - "%s: failed to find parent\n", __func__); > -} > -goto fail; > -} > -} > -last++; > -res = fstatat(p->fd, last, , AT_SYMLINK_NOFOLLOW); > -if (res == -1) { > -if (!retries) { > -fuse_log(FUSE_LOG_WARNING, > - "%s: failed to stat last\n", __func__); > -} > -goto fail_unref; > -} > -if (stat.st_dev != inode->key.dev || stat.st_ino != inode->key.ino) { > -if (!retries) { > -fuse_log(FUSE_LOG_WARNING, > - "%s: failed to match last\n", __func__); > -} > -goto fail_unref; > -} > -*parent = p; > -memmove(path, last, strlen(last) + 1); > - > -return 0; > - > -fail_unref: > -unref_inode_lolocked(lo, p, 1); > -lo_inode_put(lo, ); > -fail: > -if
Re: [Qemu-devel] [Virtio-fs] [PATCH] virtiofsd: add man page
On Thu, Aug 29, 2019 at 11:41:33AM +0100, Stefan Hajnoczi wrote: Looks good to me, thanks for working this out. Reviewed-by: Liu Bo thanks, -liubo > Signed-off-by: Stefan Hajnoczi > --- > Makefile | 7 +++ > contrib/virtiofsd/virtiofsd.texi | 85 > 2 files changed, 92 insertions(+) > create mode 100644 contrib/virtiofsd/virtiofsd.texi > > diff --git a/Makefile b/Makefile > index a3dfdd6fa8..cc18025753 100644 > --- a/Makefile > +++ b/Makefile > @@ -334,6 +334,9 @@ DOCS+=docs/qemu-cpu-models.7 > ifdef CONFIG_VIRTFS > DOCS+=fsdev/virtfs-proxy-helper.1 > endif > +ifdef CONFIG_LINUX > +DOCS+=contrib/virtiofsd/virtiofsd.1 > +endif > ifdef CONFIG_TRACE_SYSTEMTAP > DOCS+=scripts/qemu-trace-stap.1 > endif > @@ -834,6 +837,9 @@ ifdef CONFIG_VIRTFS > $(INSTALL_DIR) "$(DESTDIR)$(mandir)/man1" > $(INSTALL_DATA) fsdev/virtfs-proxy-helper.1 "$(DESTDIR)$(mandir)/man1" > endif > +ifdef CONFIG_LINUX > + $(INSTALL_DATA) contrib/virtiofsd.1 "$(DESTDIR)$(mandir)/man1" > +endif > > install-datadir: > $(INSTALL_DIR) "$(DESTDIR)$(qemu_datadir)" > @@ -1018,6 +1024,7 @@ qemu.1: qemu-doc.texi qemu-options.texi > qemu-monitor.texi qemu-monitor-info.texi > qemu.1: qemu-option-trace.texi > qemu-img.1: qemu-img.texi qemu-option-trace.texi qemu-img-cmds.texi > fsdev/virtfs-proxy-helper.1: fsdev/virtfs-proxy-helper.texi > +contrib/virtiofsd/virtiofsd.1: contrib/virtiofsd/virtiofsd.texi > qemu-nbd.8: qemu-nbd.texi qemu-option-trace.texi > qemu-ga.8: qemu-ga.texi > docs/qemu-block-drivers.7: docs/qemu-block-drivers.texi > diff --git a/contrib/virtiofsd/virtiofsd.texi > b/contrib/virtiofsd/virtiofsd.texi > new file mode 100644 > index 00..eec7fbf4e6 > --- /dev/null > +++ b/contrib/virtiofsd/virtiofsd.texi > @@ -0,0 +1,85 @@ > +@example > +@c man begin SYNOPSIS > +@command{virtiofsd} [OPTION] > @option{--socket-path=}@var{path}|@option{--fd=}@var{fdnum} @option{-o > source=}@var{path} > +@c man end > +@end example > + > +@c man begin DESCRIPTION > + > +Share a host directory tree with a guest through a 
virtio-fs device. This > +program is a vhost-user backend that implements the virtio-fs device. Each > +virtio-fs device instance requires its own virtiofsd process. > + > +This program is designed to work with QEMU's @code{--device > vhost-user-fs-pci} > +but should work with any virtual machine monitor (VMM) that supports > +vhost-user. See the EXAMPLES section below. > + > +This program must be run as the root user. Upon startup the program will > +switch into a new file system namespace with the shared directory tree as its > +root. This prevents "file system escapes" due to symlinks and other file > +system objects that might lead to files outside the shared directory. The > +program also sandboxes itself using seccomp(2) to prevent ptrace(2) and other > +vectors that could allow an attacker to compromise the system after gaining > +control of the virtiofsd process. > + > +@c man end > + > +@c man begin OPTIONS > +@table @option > +@item -h, --help > +Print help. > +@item -V, --version > +Print version. > +@item -d, -o debug > +Enable debug output. > +@item --syslog > +Print log messages to syslog instead of stderr. > +@item -o log_level=@var{level} > +Print only log messages matching @var{level} or more severe. @var{level} is > +one of @code{err}, @code{warn}, @code{info}, or @code{debug}. The default is > +@var{info}. > +@item -o source=@var{path} > +Share host directory tree located at @var{path}. This option is required. > +@item --socket-path=@var{path}, -o vhost_user_socket=@var{path} > +Listen on vhost-user UNIX domain socket at @var{path}. > +@item --fd=@var{fdnum} > +Accept connections from vhost-user UNIX domain socket file descriptor > @var{fdnum}. The file descriptor must already be listening for connections. > +@item --thread-pool-size=@var{num} > +Restrict the number of worker threads per request queue to @var{num}. The > default is 64. 
> +@item --cache=@code{none}|@code{auto}|@code{always} > +Select the desired trade-off between coherency and performance. @code{none} > +forbids the FUSE client from caching to achieve best coherency at the cost of > +performance. @code{auto} acts similar to NFS with a 1 second metadata cache > +timeout. @code{always} sets a long cache lifetime at the expense of > coherency. > +@item --writeback > +Enable writeback cache, allowing the FUSE client to buffer and merge write > requests. > +@end table > +@c man end > + > +@c man begin EXAMPLES > +Export @code{/var/lib/fs/vm001/}
Re: [Qemu-devel] [Virtio-fs] [PATCH 0/4] virtiofsd: multithreading preparation part 3
On Thu, Aug 08, 2019 at 08:53:20AM -0400, Vivek Goyal wrote: > On Thu, Aug 08, 2019 at 10:53:16AM +0100, Dr. David Alan Gilbert wrote: > > * Stefan Hajnoczi (stefa...@redhat.com) wrote: > > > On Wed, Aug 07, 2019 at 04:57:15PM -0400, Vivek Goyal wrote: > > > > Kernel also serializes MAP/UNMAP on one inode. So you will need to run > > > > multiple jobs operating on different inodes to see parallel MAP/UNMAP > > > > (atleast from kernel's point of view). > > > > > > Okay, there is still room to experiment with how MAP and UNMAP are > > > handled by virtiofsd and QEMU even if the host kernel ultimately becomes > > > the bottleneck. > > > > > > One possible optimization is to eliminate REMOVEMAPPING requests when > > > the guest driver knows a SETUPMAPPING will follow immediately. I see > > > the following request pattern in a fio randread iodepth=64 job: > > > > > > unique: 995348, opcode: SETUPMAPPING (48), nodeid: 135, insize: 80, > > > pid: 1351 > > > lo_setupmapping(ino=135, fi=0x(nil), foffset=3860856832, len=2097152, > > > moffset=859832320, flags=0) > > > unique: 995348, success, outsize: 16 > > > unique: 995350, opcode: REMOVEMAPPING (49), nodeid: 135, insize: 60, > > > pid: 12 > > > unique: 995350, success, outsize: 16 > > > unique: 995352, opcode: SETUPMAPPING (48), nodeid: 135, insize: 80, > > > pid: 1351 > > > lo_setupmapping(ino=135, fi=0x(nil), foffset=16777216, len=2097152, > > > moffset=861929472, flags=0) > > > unique: 995352, success, outsize: 16 > > > unique: 995354, opcode: REMOVEMAPPING (49), nodeid: 135, insize: 60, > > > pid: 12 > > > unique: 995354, success, outsize: 16 > > > virtio_send_msg: elem 9: with 1 in desc of length 16 > > > unique: 995356, opcode: SETUPMAPPING (48), nodeid: 135, insize: 80, > > > pid: 1351 > > > lo_setupmapping(ino=135, fi=0x(nil), foffset=383778816, len=2097152, > > > moffset=864026624, flags=0) > > > unique: 995356, success, outsize: 16 > > > unique: 995358, opcode: REMOVEMAPPING (49), nodeid: 135, insize: 60, > > > 
pid: 12 > > > > > > The REMOVEMAPPING requests are unnecessary since we can map over the top > > > of the old mapping instead of taking the extra step of removing it > > > first. > > > > Yep, those should go - I think Vivek likes to keep them for testing > > since they make things fail more completely if there's a screwup. > > I like to keep them because otherwise they keep the resources busy > on host. If DAX range is being used immediately, then this optimization > makes more sense. I will keep this in mind. > Other than the resource not being released, do you think there'll be any stale data problem if we don't do removemapping at all, neither background reclaim nor inline reclaim? (truncate/punch_hole/evict_inode still needs to remove mapping though) thanks, -liubo > > > > > Some more questions to consider for DAX performance optimization: > > > > > > 1. Is FUSE_READ/FUSE_WRITE more efficient than DAX for some I/O patterns? > > > > Probably for cases where the data is only accessed once, and you can't > > preemptively map. > > Another variant on (1) is whether we could do read/writes while the mmap > > is happening to absorb the latency. > > For small random I/O, dax might not be very effective. Overhead of > setting up mapping and tearing it down is significant. > > Vivek > > ___ > Virtio-fs mailing list > virtio...@redhat.com > https://www.redhat.com/mailman/listinfo/virtio-fs
Re: [Qemu-devel] [Virtio-fs] [PATCH 0/4] virtiofsd: multithreading preparation part 3
On Fri, Aug 09, 2019 at 09:21:02AM +0100, Stefan Hajnoczi wrote: > On Thu, Aug 08, 2019 at 10:53:16AM +0100, Dr. David Alan Gilbert wrote: > > * Stefan Hajnoczi (stefa...@redhat.com) wrote: > > > On Wed, Aug 07, 2019 at 04:57:15PM -0400, Vivek Goyal wrote: > > > 2. Can MAP/UNMAP be performed directly in QEMU via a separate virtqueue? > > > > I think there's two things to solve here that I don't currently know the > > answer to: > > 2a) We'd need to get the fd to qemu for the thing to mmap; > > we might be able to cache the fd on the qemu side for existing > > mappings, so when asking for a new mapping for an existing file then > > it would already have the fd. > > > > 2b) Running a device with a mix of queues inside QEMU and on > > vhost-user; I don't think we have anything with that mix > > vhost-user-net works in the same way. The ctrl queue is handled by QEMU > and the rx/tx queues by the vhost device. This is in fact how vhost was > initially designed: the vhost device is not a full virtio device, only > the dataplane. > > > 3. Can READ/WRITE be performed directly in QEMU via a separate virtqueue > > >to eliminate the bad address problem? > > > > Are you thinking of doing all read/writes that way, or just the corner > > cases? It doesn't seem worth it for the corner cases unless you're > > finding them cropping up in real work loads. > > Send all READ/WRITE requests to QEMU instead of virtiofsd. > > Only handle metadata requests in virtiofsd (OPEN, RELEASE, READDIR, > MKDIR, etc). For now qemu is not aware of virtio-fs's fd info, but I think it's doable, I like the idea. thanks, -liubo > > > > I'm not going to tackle DAX optimization myself right now but wanted to > > > share these ideas. > > > > One I was thinking about that feels easier than (2) was to change the > > vhost slave protocol to be split transaction; it wouldn't do anything > > for the latency but it would be able to do some in parallel if we can > > get the kernel to feed it. 
> > There are two cases: > 1. mmapping multiple inode. This should benefit from parallelism, >although mmap is still expensive because it involves TLB shootdown >for all other threads running this process. > 2. mmapping the same inode. Here the host kernel is likely to serialize >mmaps even more, making it hard to gain performance. > > It's probably worth writing a tiny benchmark first to evaluate the > potential gains. > > Stefan > ___ > Virtio-fs mailing list > virtio...@redhat.com > https://www.redhat.com/mailman/listinfo/virtio-fs
Re: [Qemu-devel] [Virtio-fs] [PATCH 1/5] virtiofsd: skip unnecessary vu_queue_get_avail_bytes()
On Fri, Jul 26, 2019 at 10:10:59AM +0100, Stefan Hajnoczi wrote: > When debug output is disabled there is no need to calculate the number > of in/out bytes available. > > There is also no need to skip a request if there are 0 out bytes. The > request parsing code already handles invalid requests. > > Signed-off-by: Stefan Hajnoczi > --- > contrib/virtiofsd/fuse_virtio.c | 13 + > 1 file changed, 5 insertions(+), 8 deletions(-) > > diff --git a/contrib/virtiofsd/fuse_virtio.c b/contrib/virtiofsd/fuse_virtio.c > index 083e4fc317..d543c6d30f 100644 > --- a/contrib/virtiofsd/fuse_virtio.c > +++ b/contrib/virtiofsd/fuse_virtio.c > @@ -507,18 +507,16 @@ static void *fv_queue_thread(void *opaque) > ret = > pthread_rwlock_rdlock(>virtio_dev->vu_dispatch_rwlock); > assert(ret == 0); /* there is no possible error case */ > > - /* out is from guest, in is too guest */ > - unsigned int in_bytes, out_bytes; > - vu_queue_get_avail_bytes(dev, q, _bytes, _bytes, ~0, > ~0); > + if (se->debug) { > + /* out is from guest, in is too guest */ > + unsigned int in_bytes, out_bytes; > + vu_queue_get_avail_bytes(dev, q, _bytes, > _bytes, ~0, ~0); > > - if (se->debug) > fuse_debug("%s: Queue %d gave evalue: %zx available: > in: %u out: %u\n", > __func__, qi->qidx, (size_t)evalue, in_bytes, > out_bytes); > - > - if (!out_bytes) { > - goto next; > } > + > while (1) { > bool allocated_bufv = false; > struct fuse_bufvec bufv; > @@ -708,7 +706,6 @@ static void *fv_queue_thread(void *opaque) > elem = NULL; > } > > -next: > pthread_rwlock_unlock(>virtio_dev->vu_dispatch_rwlock); > } > pthread_mutex_destroy(); > -- > 2.21.0 > > ___ > Virtio-fs mailing list > virtio...@redhat.com > https://www.redhat.com/mailman/listinfo/virtio-fs Reviewed-by: Liu Bo thanks, -liubo
Re: [Qemu-devel] [Virtio-fs] [PATCH 2/5] virtiofsd: prevent lo_lookup() NULL pointer dereference
On Fri, Jul 26, 2019 at 10:11:00AM +0100, Stefan Hajnoczi wrote: > Most lo_do_lookup() have already checked that the parent inode exists. > lo_lookup() hasn't and can therefore hit a NULL pointer dereference when > lo_inode(req, parent) returns NULL. > Sigh...this one has been fixed by 3 different developers...Me, Pengtao and Stefan. The following one on the ML did the exactly same thing. --- Subject: [Virtio-fs] [PATCH] virtiofsd: fix lo_do_lookup panic It needs to check for invalid parent dir. Signed-off-by: Peng Tao --- thanks, -liubo > Signed-off-by: Stefan Hajnoczi > --- > contrib/virtiofsd/passthrough_ll.c | 4 > 1 file changed, 4 insertions(+) > > diff --git a/contrib/virtiofsd/passthrough_ll.c > b/contrib/virtiofsd/passthrough_ll.c > index 9ae1381618..277a17fc03 100644 > --- a/contrib/virtiofsd/passthrough_ll.c > +++ b/contrib/virtiofsd/passthrough_ll.c > @@ -766,6 +766,10 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t > parent, const char *name, > struct lo_data *lo = lo_data(req); > struct lo_inode *inode, *dir = lo_inode(req, parent); > > + if (!dir) { > + return EBADF; > + } > + > memset(e, 0, sizeof(*e)); > e->attr_timeout = lo->timeout; > e->entry_timeout = lo->timeout; > -- > 2.21.0 > > ___ > Virtio-fs mailing list > virtio...@redhat.com > https://www.redhat.com/mailman/listinfo/virtio-fs