Re: [PATCH v2 2/4] fuse: Make DAX mount option a tri-state

2021-07-19 Thread JeffleXu



On 7/20/21 2:02 AM, Vivek Goyal wrote:
> On Fri, Jul 16, 2021 at 06:47:51PM +0800, Jeffle Xu wrote:
>> We add 'always', 'never', and 'inode' (default). '-o dax' continues to
>> operate the same which is equivalent to 'always'.
>>
>> By the time this patch is applied, 'inode' mode is actually equal to
>> 'always' mode, before the per-file DAX flag is introduced in the
>> following patch.
>>
>> Signed-off-by: Jeffle Xu 
>> ---
>>  fs/fuse/dax.c   | 13 -
>>  fs/fuse/fuse_i.h| 11 +--
>>  fs/fuse/inode.c |  2 +-
>>  fs/fuse/virtio_fs.c | 16 ++--
>>  4 files changed, 36 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
>> index c6f4e82e65f3..a478e824c2d0 100644
>> --- a/fs/fuse/dax.c
>> +++ b/fs/fuse/dax.c
>> @@ -70,6 +70,9 @@ struct fuse_inode_dax {
>>  };
>>  
>>  struct fuse_conn_dax {
>> +/** dax mode: FUSE_DAX_MOUNT_* (always, never or per-file) **/
>> +unsigned int mode;
> 
> Why "/**" ?

I copied this comment style from fuse in v4.19... Anyway, I will fix this.

> 
> How about make it something like "enum fuse_dax_mode mode" instead?
> 
> enum fuse_dax_mode dax_mode;

OK.

> 
>> +
>>  /* DAX device */
>>  struct dax_device *dev;
>>  
>> @@ -1288,7 +1291,8 @@ static int fuse_dax_mem_range_init(struct 
>> fuse_conn_dax *fcd)
>>  return ret;
>>  }
>>  
>> -int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev)
>> +int fuse_dax_conn_alloc(struct fuse_conn *fc, unsigned int mode,
>> +struct dax_device *dax_dev)
>>  {
>>  struct fuse_conn_dax *fcd;
>>  int err;
>> @@ -1301,6 +1305,7 @@ int fuse_dax_conn_alloc(struct fuse_conn *fc, struct 
>> dax_device *dax_dev)
>>  return -ENOMEM;
>>  
>>  spin_lock_init(&fcd->lock);
>> +fcd->mode = mode;
>>  fcd->dev = dax_dev;
>>  err = fuse_dax_mem_range_init(fcd);
>>  if (err) {
>> @@ -1339,10 +1344,16 @@ static const struct address_space_operations 
>> fuse_dax_file_aops  = {
>>  static bool fuse_should_enable_dax(struct inode *inode)
>>  {
>>  struct fuse_conn *fc = get_fuse_conn(inode);
>> +unsigned int mode;
>>  
>>  if (!fc->dax)
>>  return false;
>>  
>> +mode = fc->dax->mode;
>> +
>> +if (mode == FUSE_DAX_MOUNT_NEVER)
>> +return false;
>> +
>>  return true;
>>  }
>>  
>> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
>> index 07829ce78695..f29018323845 100644
>> --- a/fs/fuse/fuse_i.h
>> +++ b/fs/fuse/fuse_i.h
>> @@ -487,6 +487,12 @@ struct fuse_dev {
>>  struct list_head entry;
>>  };
>>  
>> +enum {
> And this becomes.
> 
> enum fuse_dax_mode {
> };

OK.

> 
>> +FUSE_DAX_MOUNT_INODE,
>> +FUSE_DAX_MOUNT_ALWAYS,
>> +FUSE_DAX_MOUNT_NEVER,
>> +};
> 
> How about getting rid of "MOUNT" and just do.
> 
>   FUSE_DAX_INODE,
>   FUSE_DAX_ALWAYS,
>   FUSE_DAX_NEVER,

OK.

> 
>> +
>>  struct fuse_fs_context {
>>  int fd;
>>  unsigned int rootmode;
>> @@ -503,7 +509,7 @@ struct fuse_fs_context {
>>  bool no_control:1;
>>  bool no_force_umount:1;
>>  bool legacy_opts_show:1;
>> -bool dax:1;
>> +unsigned int dax;
> 
> enum fuse_dax_mode dax_mode;

OK.

> 
>>  unsigned int max_read;
>>  unsigned int blksize;
>>  const char *subtype;
>> @@ -1242,7 +1248,8 @@ ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct 
>> iov_iter *to);
>>  ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
>>  int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma);
>>  int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 
>> dmap_end);
>> -int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev);
>> +int fuse_dax_conn_alloc(struct fuse_conn *fc, unsigned int mode,
>  ^^
>   enum fuse_dax_mode dax_mode

OK.

>> +struct dax_device *dax_dev);
>>  void fuse_dax_conn_free(struct fuse_conn *fc);
>>  bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi);
>>  void fuse_dax_inode_init(struct inode *inode);
>> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
>> index b9beb39a4a18..f6b46395edb2 100644
>> --- a/fs/fuse/inode.c
>> +++ b/fs/fuse/inode.c
>> @@ -1434,7 +1434,7 @@ int fuse_fill_super_common(struct super_block *sb, 
>> struct fuse_fs_context *ctx)
>>  sb->s_subtype = ctx->subtype;
>>  ctx->subtype = NULL;
>>  if (IS_ENABLED(CONFIG_FUSE_DAX)) {
>> -err = fuse_dax_conn_alloc(fc, ctx->dax_dev);
>> +err = fuse_dax_conn_alloc(fc, ctx->dax, ctx->dax_dev);
>>  if (err)
>>  goto err;
>>  }
>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
>> index 8f52cdaa8445..561f711d1945 100644
>> --- a/fs/fuse/virtio_fs.c
>> +++ b/fs/fuse/virtio_fs.c
>> @@ -88,12 +88,21 @@ struct virtio_fs_req_work {
>>  static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
>>  

Re: [PATCH v2 0/4] virtiofs,fuse: support per-file DAX

2021-07-19 Thread JeffleXu



On 7/20/21 5:30 AM, Vivek Goyal wrote:
> On Fri, Jul 16, 2021 at 06:47:49PM +0800, Jeffle Xu wrote:
>> This patchset adds support of per-file DAX for virtiofs, which is
>> inspired by Ira Weiny's work on ext4[1] and xfs[2].
>>
>> There are four related scenarios:
>> 1. Alloc inode: get per-file DAX flag from fuse_attr.flags. (patch 3)
>> 2. Per-file DAX flag changes when the file has been opened. (patch 3)
>> In this case, the dentry and inode are all marked as DONT_CACHE, and
>> the DAX state won't be updated until the file is closed and reopened
>> later.
>> 3. Users can change the per-file DAX flag inside the guest by chattr(1).
>> (patch 4)
>> 4. Create new files under directories with DAX enabled. When creating
>> new files in ext4/xfs on host, the new created files will inherit the
>> per-file DAX flag from the directory, and thus the new created files in
>> virtiofs will also inherit the per-file DAX flag if the fuse server
>> derives fuse_attr.flags from the underlying ext4/xfs inode's per-file
>> DAX flag.
> 
> Thinking little bit more about this from requirement perspective. I think
> we are trying to address two use cases here.
> 
> A. Client does not know which files DAX should be used on. Only server
>knows it and server passes this information to client. I suspect
>that's your primary use case.

Yes, this is the starting point of this patch set.

> 
> B. Client is driving which files are supposed to be using DAX. This is
>exactly same as the model ext4/xfs are using by storing a persistent
>flag on inode. 
> 
> Current patches seem to be a hybrid of both approach A and B. 
> 
> If we were to implement B, then fuse client probably needs to have the
> capability to query FS_XFLAG_DAX on inode and decide whether to
> enable DAX or not. (Without extra round trip). Or know it indirectly
> by extending GETATTR and requesting this explicitly.

If guest queries if the file is DAX capable or not by an extra GETATTR,
I'm afraid this will add extra round trip.

Or if we add the DAX flag (ATTR_DAX) by extending LOOKUP, as done by
this patch set, then the FUSE server needs to set ATTR_DAX according to
the FS_XFLAG_DAX of the backend files, *to make the FS_XFLAG_DAX flag
previously set by FUSE client work*. Then it becomes a *mandatory*
requirement when implementing FUSE server. i.e., it becomes part of the
FUSE protocol rather than implementation specific. FUSE server can still
implement some other algorithm deciding whether to set ATTR_DAX or not,
though it must set ATTR_DAX once the backend file is flagged with
FS_XFLAG_DAX.

Besides, as you said, FUSE server needs to add one extra
GETFLAGS/FSGETXATTR ioctl per LOOKUP in this case. To eliminate this
cost, we could negotiate the per-file DAX capability during FUSE_INIT.
Only when the per-file DAX capability is negotiated, will the FUSE
server do extra GETFLAGS/FSGETXATTR ioctl and set ATTR_DAX flag when
doing LOOKUP.


Personally, I prefer the second way, i.e., by extending LOOKUP (adding
ATTR_DAX), to eliminate the extra round trip.

> 
> If we were only implementing A, then server does not have a way to 
> tell client to enable DAX. Server can either look at FS_XFLAG_DAX
> and decide to enable DAX or use some other property. Given querying
> FS_XFLAG_DAX will be an extra ioctl() on every inode lookup/getattr,
> it probably will be a server option. But enabling on server does not
> mean client will enable it.

As I said previously, it can be negotiated whether this per-file DAX
capability is needed. Guest can advertise this capability when '-o
dax=inode' is configured.

> 
> I think my primary concern with this patch right now is trying to
> figure out which requirement we are trying to cater to first and how
> to connect server and client well so they both understand what mode
> they are operating in and interact well.
> 


-- 
Thanks,
Jeffle
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH] vdpa/vdpa_sim: Use the negotiated features when calling vringh_init_iotlb

2021-07-19 Thread Jason Wang


在 2021/7/19 下午9:44, Eli Cohen 写道:

When calling vringh_init_iotlb(), use the negotiated features which
might be different than the supported features.

Fixes: 011c35bac5ef ("vdpa_sim: add supported_features field in 
vdpasim_dev_attr")



As Stefano said.

It should be 2c53d0f64c06f ("vdpasim: vDPA device simulator")

Other than this

Acked-by: Jason Wang 



Signed-off-by: Eli Cohen 
---
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 14e024de5cbf..89a474c7a096 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -66,7 +66,7 @@ static void vdpasim_queue_ready(struct vdpasim *vdpasim, 
unsigned int idx)
  {
	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
  
-	vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features,

+   vringh_init_iotlb(&vq->vring, vdpasim->features,
  VDPASIM_QUEUE_MAX, false,
  (struct vring_desc *)(uintptr_t)vq->desc_addr,
  (struct vring_avail *)
@@ -86,7 +86,7 @@ static void vdpasim_vq_reset(struct vdpasim *vdpasim,
vq->device_addr = 0;
vq->cb = NULL;
vq->private = NULL;
-   vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features,
+   vringh_init_iotlb(&vq->vring, vdpasim->features,
  VDPASIM_QUEUE_MAX, false, NULL, NULL, NULL);
  
  	vq->vring.notify = NULL;


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH] virtio-balloon: Use virtio_find_vqs() helper

2021-07-19 Thread Jason Wang


在 2021/7/19 下午12:22, tianxianting 写道:

thanks,

I checked, actually all virtio drivers have switched to use the helper 
after this one merged.



Ok. Cool.

Thanks




在 2021/7/19 上午11:46, Jason Wang 写道:


在 2021/7/16 下午8:46, tianxianting 写道:

Are you interested in this patch? Just a little improvement :)

在 2021/7/13 下午11:38, Xianting Tian 写道:

From: Xianting Tian 

Use the helper virtio_find_vqs().

Signed-off-by: Xianting Tian 
---
  drivers/virtio/virtio_balloon.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c 
b/drivers/virtio/virtio_balloon.c

index 510e931..18e0bf3 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -531,8 +531,8 @@ static int init_vqs(struct virtio_balloon *vb)
  callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack;
  }
  -    err = vb->vdev->config->find_vqs(vb->vdev, 
VIRTIO_BALLOON_VQ_MAX,

- vqs, callbacks, names, NULL, NULL);
+    err = virtio_find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX, vqs,
+    callbacks, names, NULL);
  if (err)
  return err;




Acked-by: Jason Wang 

Maybe it's better to convert all the drivers that doesn't use 
virtio_find_vqs{_ctx}.


Thanks




___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH v2 0/4] virtiofs,fuse: support per-file DAX

2021-07-19 Thread Vivek Goyal
On Fri, Jul 16, 2021 at 06:47:49PM +0800, Jeffle Xu wrote:
> This patchset adds support of per-file DAX for virtiofs, which is
> inspired by Ira Weiny's work on ext4[1] and xfs[2].
> 
> There are four related scenarios:
> 1. Alloc inode: get per-file DAX flag from fuse_attr.flags. (patch 3)
> 2. Per-file DAX flag changes when the file has been opened. (patch 3)
> In this case, the dentry and inode are all marked as DONT_CACHE, and
> the DAX state won't be updated until the file is closed and reopened
> later.
> 3. Users can change the per-file DAX flag inside the guest by chattr(1).
> (patch 4)
> 4. Create new files under directories with DAX enabled. When creating
> new files in ext4/xfs on host, the new created files will inherit the
> per-file DAX flag from the directory, and thus the new created files in
> virtiofs will also inherit the per-file DAX flag if the fuse server
> derives fuse_attr.flags from the underlying ext4/xfs inode's per-file
> DAX flag.

Thinking little bit more about this from requirement perspective. I think
we are trying to address two use cases here.

A. Client does not know which files DAX should be used on. Only server
   knows it and server passes this information to client. I suspect
   that's your primary use case.

B. Client is driving which files are supposed to be using DAX. This is
   exactly same as the model ext4/xfs are using by storing a persistent
   flag on inode. 

Current patches seem to be a hybrid of both approach A and B. 

If we were to implement B, then fuse client probably needs to have the
capability to query FS_XFLAG_DAX on inode and decide whether to
enable DAX or not. (Without extra round trip). Or know it indirectly
by extending GETATTR and requesting this explicitly.

If we were only implementing A, then server does not have a way to 
tell client to enable DAX. Server can either look at FS_XFLAG_DAX
and decide to enable DAX or use some other property. Given querying
FS_XFLAG_DAX will be an extra ioctl() on every inode lookup/getattr,
it probably will be a server option. But enabling on server does not
mean client will enable it.

I think my primary concern with this patch right now is trying to
figure out which requirement we are trying to cater to first and how
to connect server and client well so they both understand what mode
they are operating in and interact well.

Vivek

> 
> 
> Any comment is welcome.
> 
> [1] commit 9cb20f94afcd ("fs/ext4: Make DAX mount option a tri-state")
> [2] commit 02beb2686ff9 ("fs/xfs: Make DAX mount option a tri-state")
> 
> 
> changes since v1:
> - add support for changing per-file DAX flags inside guest (patch 4)
> 
> v1:https://www.spinics.net/lists/linux-virtualization/msg51008.html
> 
> Jeffle Xu (4):
>   fuse: add fuse_should_enable_dax() helper
>   fuse: Make DAX mount option a tri-state
>   fuse: add per-file DAX flag
>   fuse: support changing per-file DAX flag inside guest
> 
>  fs/fuse/dax.c | 36 ++--
>  fs/fuse/file.c|  4 ++--
>  fs/fuse/fuse_i.h  | 16 
>  fs/fuse/inode.c   |  7 +--
>  fs/fuse/ioctl.c   |  9 ++---
>  fs/fuse/virtio_fs.c   | 16 ++--
>  include/uapi/linux/fuse.h |  5 +
>  7 files changed, 78 insertions(+), 15 deletions(-)
> 
> -- 
> 2.27.0
> 

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC 1/3] cpuidle: add poll_source API

2021-07-19 Thread Marcelo Tosatti
Hi Stefan,

On Tue, Jul 13, 2021 at 05:19:04PM +0100, Stefan Hajnoczi wrote:
> Introduce an API for adding cpuidle poll callbacks:
> 
>   struct poll_source_ops {
>   void (*start)(struct poll_source *src);
>   void (*stop)(struct poll_source *src);
>   void (*poll)(struct poll_source *src);
>   };
> 
>   int poll_source_register(struct poll_source *src);
>   int poll_source_unregister(struct poll_source *src);
> 
> When cpuidle enters the poll state it invokes ->start() and then invokes
> ->poll() repeatedly from the busy wait loop. Finally ->stop() is invoked
> when the busy wait loop finishes.
> 
> The ->poll() function should check for activity and cause
> TIF_NEED_RESCHED to be set in order to stop the busy wait loop.
> 
> This API is intended to be used by drivers that can cheaply poll for
> events. Participating in cpuidle polling allows them to avoid interrupt
> latencies during periods where the CPU is going to poll anyway.
> 
> Note that each poll_source is bound to a particular CPU. The API is
> mainly intended to by used by drivers that have multiple queues with irq
> affinity.
> 
> Signed-off-by: Stefan Hajnoczi 
> ---
>  drivers/cpuidle/Makefile  |  1 +
>  include/linux/poll_source.h   | 53 +++
>  drivers/cpuidle/poll_source.c | 99 +++
>  drivers/cpuidle/poll_state.c  |  6 +++
>  4 files changed, 159 insertions(+)
>  create mode 100644 include/linux/poll_source.h
>  create mode 100644 drivers/cpuidle/poll_source.c
> 
> diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
> index 26bbc5e74123..994f72d6fe95 100644
> --- a/drivers/cpuidle/Makefile
> +++ b/drivers/cpuidle/Makefile
> @@ -7,6 +7,7 @@ obj-y += cpuidle.o driver.o governor.o sysfs.o governors/
>  obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o
>  obj-$(CONFIG_DT_IDLE_STATES)   += dt_idle_states.o
>  obj-$(CONFIG_ARCH_HAS_CPU_RELAX)   += poll_state.o
> +obj-$(CONFIG_ARCH_HAS_CPU_RELAX)   += poll_source.o
>  obj-$(CONFIG_HALTPOLL_CPUIDLE) += cpuidle-haltpoll.o
>  
>  
> ##
> diff --git a/include/linux/poll_source.h b/include/linux/poll_source.h
> new file mode 100644
> index ..ccfb424e170b
> --- /dev/null
> +++ b/include/linux/poll_source.h
> @@ -0,0 +1,53 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +/*
> + * poll_source.h - cpuidle busy waiting API
> + */
> +#ifndef __LINUX_POLLSOURCE_H__
> +#define __LINUX_POLLSOURCE_H__
> +
> +#include 
> +
> +struct poll_source;
> +
> +struct poll_source_ops {
> + void (*start)(struct poll_source *src);
> + void (*stop)(struct poll_source *src);
> + void (*poll)(struct poll_source *src);
> +};
> +
> +struct poll_source {
> + const struct poll_source_ops *ops;
> + struct list_head node;
> + int cpu;
> +};
> +
> +/**
> + * poll_source_register - Add a poll_source for a CPU
> + */
> +#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_ARCH_HAS_CPU_RELAX)
> +int poll_source_register(struct poll_source *src);
> +#else
> +static inline int poll_source_register(struct poll_source *src)
> +{
> + return 0;
> +}
> +#endif
> +
> +/**
> + * poll_source_unregister - Remove a previously registered poll_source
> + */
> +#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_ARCH_HAS_CPU_RELAX)
> +int poll_source_unregister(struct poll_source *src);
> +#else
> +static inline int poll_source_unregister(struct poll_source *src)
> +{
> + return 0;
> +}
> +#endif
> +
> +/* Used by the cpuidle driver */
> +void poll_source_start(void);
> +void poll_source_run_once(void);
> +void poll_source_stop(void);
> +
> +#endif /* __LINUX_POLLSOURCE_H__ */
> diff --git a/drivers/cpuidle/poll_source.c b/drivers/cpuidle/poll_source.c
> new file mode 100644
> index ..46100e5a71e4
> --- /dev/null
> +++ b/drivers/cpuidle/poll_source.c
> @@ -0,0 +1,99 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * poll_source.c - cpuidle busy waiting API
> + */
> +
> +#include 
> +#include 
> +#include 
> +
> +/* The per-cpu list of registered poll sources */
> +DEFINE_PER_CPU(struct list_head, poll_source_list);
> +
> +/* Called from idle task with TIF_POLLING_NRFLAG set and irqs enabled */
> +void poll_source_start(void)
> +{
> + struct poll_source *src;
> +
> + list_for_each_entry(src, this_cpu_ptr(&poll_source_list), node)
> + src->ops->start(src);
> +}
> +
> +/* Called from idle task with TIF_POLLING_NRFLAG set and irqs enabled */
> +void poll_source_run_once(void)
> +{
> + struct poll_source *src;
> +
> + list_for_each_entry(src, this_cpu_ptr(&poll_source_list), node)
> + src->ops->poll(src);
> +}
> +
> +/* Called from idle task with TIF_POLLING_NRFLAG set and irqs enabled */
> +void poll_source_stop(void)
> +{
> + struct poll_source *src;
> +
> + list_for_each_entry(src, this_cpu_ptr(&poll_source_list), node)
> + src->ops->stop(src);
> +}
> 

Re: [PATCH v2 4/4] fuse: support changing per-file DAX flag inside guest

2021-07-19 Thread Vivek Goyal
On Fri, Jul 16, 2021 at 06:47:53PM +0800, Jeffle Xu wrote:
> Fuse client can enable or disable per-file DAX inside guest by
> chattr(1). Similarly the new state won't be updated until the file is
> closed and reopened later.
> 
> It is worth noting that it is a best-effort style, since whether
> per-file DAX is enabled or not is controlled by fuse_attr.flags retrieved
> by FUSE LOOKUP routine, while the algorithm constructing fuse_attr.flags
> is totally fuse server specific, not to mention ioctl may not be
> supported by fuse server at all.
> 
> Signed-off-by: Jeffle Xu 
> ---
>  fs/fuse/ioctl.c | 9 ++---
>  1 file changed, 6 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
> index 546ea3d58fb4..172e05c3f038 100644
> --- a/fs/fuse/ioctl.c
> +++ b/fs/fuse/ioctl.c
> @@ -460,6 +460,7 @@ int fuse_fileattr_set(struct user_namespace *mnt_userns,
>   struct fuse_file *ff;
>   unsigned int flags = fa->flags;
>   struct fsxattr xfa;
> + bool newdax;
>   int err;
>  
>   ff = fuse_priv_ioctl_prepare(inode);
> @@ -467,10 +468,9 @@ int fuse_fileattr_set(struct user_namespace *mnt_userns,
>   return PTR_ERR(ff);
>  
>   if (fa->flags_valid) {
> + newdax = flags & FS_DAX_FL;
>   err = fuse_priv_ioctl(inode, ff, FS_IOC_SETFLAGS,
> , sizeof(flags));
> - if (err)
> - goto cleanup;
>   } else {
>   memset(, 0, sizeof(xfa));
>   xfa.fsx_xflags = fa->fsx_xflags;
> @@ -479,11 +479,14 @@ int fuse_fileattr_set(struct user_namespace *mnt_userns,
>   xfa.fsx_projid = fa->fsx_projid;
>   xfa.fsx_cowextsize = fa->fsx_cowextsize;
>  
> + newdax = fa->fsx_xflags & FS_XFLAG_DAX;
>   err = fuse_priv_ioctl(inode, ff, FS_IOC_FSSETXATTR,
> , sizeof(xfa));
>   }
>  
> -cleanup:
> + if (!err && IS_ENABLED(CONFIG_FUSE_DAX))
> + fuse_dax_dontcache(inode, newdax);

This assumes that server will set ATTR_DAX flag for inode based on
whether inode has FS_DAX_FL set or not.

So that means server first will have to know that client has DAX enabled
so that it can query FS_DAX_FL. And in current framework we don't have
a way for server to know if client is using DAX or not.

I think there is little disconnect here. So either client should be
checking FS_DAX_FL flag on inode. But we probably don't want to pay
extra round trip cost for this. 

That means somehow server should return this information as part of
inode attrs only if client wants this extra file attr informaiton. So
may be GETATTR should be enhanced instead to return file attr information
too if client asked for it?

I have not looked what it takes to implement this. If this is too 
complicated, then alternate approach will be that it is up to the
server to decide what inodes should use DAX and there is no guarantee
that server will make use of FS_DAX_FL flag. fuse will still support
setting FS_DAX_FL but server could choose to not use it at all. In
that case fuse client will not have to query file attrs in GETATTR
and just rely on ATTR_DAX flag set by server. I think that's what
you are implementing.  If that's the case then dontcache does not make
much sense because you don't even know if server is looking at
FS_DAX_FL to decide whether DAX should be used or not.

Thanks
Vivek

> +
>   fuse_priv_ioctl_cleanup(inode, ff);
>  
>   return err;
> -- 
> 2.27.0
> 

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2 3/4] fuse: add per-file DAX flag

2021-07-19 Thread Vivek Goyal
On Fri, Jul 16, 2021 at 06:47:52PM +0800, Jeffle Xu wrote:
> Add one flag for fuse_attr.flags indicating if DAX shall be enabled for
> this file.
> 
> When the per-file DAX flag changes for an *opened* file, the state of
> the file won't be updated until this file is closed and reopened later.
> 
> Signed-off-by: Jeffle Xu 
> ---
>  fs/fuse/dax.c | 21 +
>  fs/fuse/file.c|  4 ++--
>  fs/fuse/fuse_i.h  |  5 +++--
>  fs/fuse/inode.c   |  5 -
>  include/uapi/linux/fuse.h |  5 +
>  5 files changed, 31 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
> index a478e824c2d0..0e862119757a 100644
> --- a/fs/fuse/dax.c
> +++ b/fs/fuse/dax.c
> @@ -1341,7 +1341,7 @@ static const struct address_space_operations 
> fuse_dax_file_aops  = {
>   .invalidatepage = noop_invalidatepage,
>  };
>  
> -static bool fuse_should_enable_dax(struct inode *inode)
> +static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags)
>  {
>   struct fuse_conn *fc = get_fuse_conn(inode);
>   unsigned int mode;
> @@ -1354,18 +1354,31 @@ static bool fuse_should_enable_dax(struct inode 
> *inode)
>   if (mode == FUSE_DAX_MOUNT_NEVER)
>   return false;
>  
> - return true;
> + if (mode == FUSE_DAX_MOUNT_ALWAYS)
> + return true;
> +
> + WARN_ON(mode != FUSE_DAX_MOUNT_INODE);
> + return flags & FUSE_ATTR_DAX;
>  }
>  
> -void fuse_dax_inode_init(struct inode *inode)
> +void fuse_dax_inode_init(struct inode *inode, unsigned int flags)
>  {
> - if (!fuse_should_enable_dax(inode))
> + if (!fuse_should_enable_dax(inode, flags))
>   return;
>  
>   inode->i_flags |= S_DAX;
>   inode->i_data.a_ops = _dax_file_aops;
>  }
>  
> +void fuse_dax_dontcache(struct inode *inode, bool newdax)
> +{
> + struct fuse_conn *fc = get_fuse_conn(inode);
> +
> + if (fc->dax && fc->dax->mode == FUSE_DAX_MOUNT_INODE &&
> + IS_DAX(inode) != newdax)
> + d_mark_dontcache(inode);
> +}
> +

This capability to mark an inode dontcache should probably be in a
separate patch. These seem to logically two functionalities. One is
enabling DAX on an inode. And second is making sure how soon you
see the effect of that change and hence marking inode dontcache.

Not sure how useful this is. In cache=none mode we should get rid of
inode ASAP. In cache=auto mode we will get rid of after 1 second (or
after a user specified timeout). So only place this seems to be
useful is cache=always.

Vivek


>  bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int 
> map_alignment)
>  {
>   if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) {
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 97f860cfc195..cf42af492146 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -3142,7 +3142,7 @@ static const struct address_space_operations 
> fuse_file_aops  = {
>   .write_end  = fuse_write_end,
>  };
>  
> -void fuse_init_file_inode(struct inode *inode)
> +void fuse_init_file_inode(struct inode *inode, struct fuse_attr *attr)
>  {
>   struct fuse_inode *fi = get_fuse_inode(inode);
>  
> @@ -3156,5 +3156,5 @@ void fuse_init_file_inode(struct inode *inode)
>   fi->writepages = RB_ROOT;
>  
>   if (IS_ENABLED(CONFIG_FUSE_DAX))
> - fuse_dax_inode_init(inode);
> + fuse_dax_inode_init(inode, attr->flags);
>  }
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index f29018323845..0793b93d680a 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -1000,7 +1000,7 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc,
>  /**
>   * Initialize file operations on a regular file
>   */
> -void fuse_init_file_inode(struct inode *inode);
> +void fuse_init_file_inode(struct inode *inode, struct fuse_attr *attr);
>  
>  /**
>   * Initialize inode operations on regular files and special files
> @@ -1252,8 +1252,9 @@ int fuse_dax_conn_alloc(struct fuse_conn *fc, unsigned 
> int mode,
>   struct dax_device *dax_dev);
>  void fuse_dax_conn_free(struct fuse_conn *fc);
>  bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi);
> -void fuse_dax_inode_init(struct inode *inode);
> +void fuse_dax_inode_init(struct inode *inode, unsigned int flags);
>  void fuse_dax_inode_cleanup(struct inode *inode);
> +void fuse_dax_dontcache(struct inode *inode, bool newdax);
>  bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int 
> map_alignment);
>  void fuse_dax_cancel_work(struct fuse_conn *fc);
>  
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index f6b46395edb2..2ae92798126e 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -269,6 +269,9 @@ void fuse_change_attributes(struct inode *inode, struct 
> fuse_attr *attr,
>   if (inval)
>   invalidate_inode_pages2(inode->i_mapping);
>   }
> +
> + if (IS_ENABLED(CONFIG_FUSE_DAX))
> + 

Re: [PATCH v2 3/4] fuse: add per-file DAX flag

2021-07-19 Thread Vivek Goyal
On Fri, Jul 16, 2021 at 06:47:52PM +0800, Jeffle Xu wrote:
> Add one flag for fuse_attr.flags indicating if DAX shall be enabled for
> this file.
> 
> When the per-file DAX flag changes for an *opened* file, the state of
> the file won't be updated until this file is closed and reopened later.
> 
> Signed-off-by: Jeffle Xu 

[..]
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 36ed092227fa..90c9df10d37a 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -184,6 +184,9 @@
>   *
>   *  7.34
>   *  - add FUSE_SYNCFS
> + *
> + *  7.35
> + *  - add FUSE_ATTR_DAX
>   */
>  
>  #ifndef _LINUX_FUSE_H
> @@ -449,8 +452,10 @@ struct fuse_file_lock {
>   * fuse_attr flags
>   *
>   * FUSE_ATTR_SUBMOUNT: Object is a submount root
> + * FUSE_ATTR_DAX: Enable DAX for this file in per-file DAX mode
>   */
>  #define FUSE_ATTR_SUBMOUNT  (1 << 0)
> +#define FUSE_ATTR_DAX(1 << 1)

Generic fuse changes (addition of FUSE_ATTR_DAX) should probably in
a separate patch. 

I am not clear on one thing. If we are planning to rely on persistent
inode attr (FS_XFLAG_DAX as per Documentation/filesystems/dax.rst), then
why fuse server needs to communicate the state of that attr using a 
flag? Can client directly query it?  I am not sure where these
attrs are stored and whether the fuse protocol currently supports it.

What about flag STATX_ATTR_DAX. We probably should report that too
in stat if we are using dax on the inode?

Vivek

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2 2/4] fuse: Make DAX mount option a tri-state

2021-07-19 Thread Vivek Goyal
On Fri, Jul 16, 2021 at 06:47:51PM +0800, Jeffle Xu wrote:
> We add 'always', 'never', and 'inode' (default). '-o dax' continues to
> operate the same which is equivalent to 'always'.
> 
> By the time this patch is applied, 'inode' mode is actually equal to
> 'always' mode, before the per-file DAX flag is introduced in the
> following patch.
> 
> Signed-off-by: Jeffle Xu 
> ---
>  fs/fuse/dax.c   | 13 -
>  fs/fuse/fuse_i.h| 11 +--
>  fs/fuse/inode.c |  2 +-
>  fs/fuse/virtio_fs.c | 16 ++--
>  4 files changed, 36 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
> index c6f4e82e65f3..a478e824c2d0 100644
> --- a/fs/fuse/dax.c
> +++ b/fs/fuse/dax.c
> @@ -70,6 +70,9 @@ struct fuse_inode_dax {
>  };
>  
>  struct fuse_conn_dax {
> + /** dax mode: FUSE_DAX_MOUNT_* (always, never or per-file) **/
> + unsigned int mode;

Why "/**" ?

How about make it something like "enum fuse_dax_mode mode" instead?

enum fuse_dax_mode dax_mode;

> +
>   /* DAX device */
>   struct dax_device *dev;
>  
> @@ -1288,7 +1291,8 @@ static int fuse_dax_mem_range_init(struct fuse_conn_dax 
> *fcd)
>   return ret;
>  }
>  
> -int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev)
> +int fuse_dax_conn_alloc(struct fuse_conn *fc, unsigned int mode,
> + struct dax_device *dax_dev)
>  {
>   struct fuse_conn_dax *fcd;
>   int err;
> @@ -1301,6 +1305,7 @@ int fuse_dax_conn_alloc(struct fuse_conn *fc, struct 
> dax_device *dax_dev)
>   return -ENOMEM;
>  
>   spin_lock_init(&fcd->lock);
> + fcd->mode = mode;
>   fcd->dev = dax_dev;
>   err = fuse_dax_mem_range_init(fcd);
>   if (err) {
> @@ -1339,10 +1344,16 @@ static const struct address_space_operations 
> fuse_dax_file_aops  = {
>  static bool fuse_should_enable_dax(struct inode *inode)
>  {
>   struct fuse_conn *fc = get_fuse_conn(inode);
> + unsigned int mode;
>  
>   if (!fc->dax)
>   return false;
>  
> + mode = fc->dax->mode;
> +
> + if (mode == FUSE_DAX_MOUNT_NEVER)
> + return false;
> +
>   return true;
>  }
>  
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 07829ce78695..f29018323845 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -487,6 +487,12 @@ struct fuse_dev {
>   struct list_head entry;
>  };
>  
> +enum {
And this becomes.

enum fuse_dax_mode {
};

> + FUSE_DAX_MOUNT_INODE,
> + FUSE_DAX_MOUNT_ALWAYS,
> + FUSE_DAX_MOUNT_NEVER,
> +};

How about getting rid of "MOUNT" and just do.

FUSE_DAX_INODE,
FUSE_DAX_ALWAYS,
FUSE_DAX_NEVER,

> +
>  struct fuse_fs_context {
>   int fd;
>   unsigned int rootmode;
> @@ -503,7 +509,7 @@ struct fuse_fs_context {
>   bool no_control:1;
>   bool no_force_umount:1;
>   bool legacy_opts_show:1;
> - bool dax:1;
> + unsigned int dax;

enum fuse_dax_mode dax_mode;

>   unsigned int max_read;
>   unsigned int blksize;
>   const char *subtype;
> @@ -1242,7 +1248,8 @@ ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct 
> iov_iter *to);
>  ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
>  int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma);
>  int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 
> dmap_end);
> -int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev);
> +int fuse_dax_conn_alloc(struct fuse_conn *fc, unsigned int mode,
   ^^
enum fuse_dax_mode dax_mode
> + struct dax_device *dax_dev);
>  void fuse_dax_conn_free(struct fuse_conn *fc);
>  bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi);
>  void fuse_dax_inode_init(struct inode *inode);
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index b9beb39a4a18..f6b46395edb2 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -1434,7 +1434,7 @@ int fuse_fill_super_common(struct super_block *sb, 
> struct fuse_fs_context *ctx)
>   sb->s_subtype = ctx->subtype;
>   ctx->subtype = NULL;
>   if (IS_ENABLED(CONFIG_FUSE_DAX)) {
> - err = fuse_dax_conn_alloc(fc, ctx->dax_dev);
> + err = fuse_dax_conn_alloc(fc, ctx->dax, ctx->dax_dev);
>   if (err)
>   goto err;
>   }
> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> index 8f52cdaa8445..561f711d1945 100644
> --- a/fs/fuse/virtio_fs.c
> +++ b/fs/fuse/virtio_fs.c
> @@ -88,12 +88,21 @@ struct virtio_fs_req_work {
>  static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
>struct fuse_req *req, bool in_flight);
>  
> +static const struct constant_table dax_param_enums[] = {
> + {"inode",   FUSE_DAX_MOUNT_INODE },
> + {"always",  FUSE_DAX_MOUNT_ALWAYS },
> + {"never",   FUSE_DAX_MOUNT_NEVER 

RE: [PATCH 1/4] virtio: Improve vq->broken access to avoid any compiler optimization

2021-07-19 Thread Parav Pandit via Virtualization



> From: Michael S. Tsirkin 
> Sent: Monday, July 19, 2021 5:35 PM
> 
> On Mon, Jul 19, 2021 at 05:26:22AM +, Parav Pandit wrote:
> >
> >
> > > From: Michael S. Tsirkin 
> > > Sent: Sunday, July 18, 2021 2:09 AM
> > >
> > > On Sat, Jul 17, 2021 at 10:42:55AM +0300, Parav Pandit wrote:
> > > > Currently vq->broken field is read by virtqueue_is_broken() in
> > > > busy loop in one context by virtnet_send_command().
> > > >
> > > > vq->broken is set to true in other process context by
> > > > virtio_break_device(). Reader and writer are accessing it without
> > > > any synchronization. This may lead to a compiler optimization
> > > > which may result to optimize reading vq->broken only once.
> > > >
> > > > Hence, force reading vq->broken on each invocation of
> > > > virtqueue_is_broken() and ensure that its update is visible.
> > > >
> > > > Fixes: e2dcdfe95c0b ("virtio: virtio_break_device() to mark all
> > > > virtqueues broken.")
> > >
> > > This is all theoretical right?
> > > virtqueue_get_buf is not inlined so compiler generally assumes any
> > > vq field can change.
> > Broken bit checking cannot rely on some other kernel API for correctness.
> > So it possibly not hitting this case now, but we shouldn't depend other APIs
> usage to ensure correctness.
> >
> > >
> > > I'm inclined to not include a Fixes
> > > tag then. And please do change subject to say "theoretical"
> > > to make that clear to people.
> > >
> > I do not have any strong opinion on fixes tag. But virtio_broken_queue()
> API should be self-contained; for that I am not sure if this just theoretical.
> > Do you mean theoretical, because we haven't hit this bug?
> 
> Because with existing code I don't believe existing compilers are clever
> enough to optimize this away.
Ok. got it. I will mention in the commit log.

> 
> > > > Signed-off-by: Parav Pandit 
> > > > ---
> > > >  drivers/virtio/virtio_ring.c | 6 --
> > > >  1 file changed, 4 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/drivers/virtio/virtio_ring.c
> > > > b/drivers/virtio/virtio_ring.c index 89bfe46a8a7f..7f379fe7d78d
> > > > 100644
> > > > --- a/drivers/virtio/virtio_ring.c
> > > > +++ b/drivers/virtio/virtio_ring.c
> > > > @@ -2373,7 +2373,7 @@ bool virtqueue_is_broken(struct virtqueue
> > > > *_vq) {
> > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > >
> > > > -   return vq->broken;
> > > > +   return READ_ONCE(vq->broken);
> > > >  }
> > > >  EXPORT_SYMBOL_GPL(virtqueue_is_broken);
> > > >
> > > > @@ -2387,7 +2387,9 @@ void virtio_break_device(struct
> > > > virtio_device
> > > > *dev)
> > > >
> > > > list_for_each_entry(_vq, &dev->vqs, list) {
> > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > -   vq->broken = true;
> > > > +
> > > > +   /* Pairs with READ_ONCE() in virtqueue_is_broken(). */
> > > > +   smp_store_release(&vq->broken, true);
> > >
> > > A bit puzzled here. Why do we need release semantics here?
> > > IUC store_release does not generally pair with READ_ONCE - does it?
> > >
> > It does; paired at few places, such as,
> >
> > (a) in uverbs_main.c, default_async_file
> > (b) in netlink.c, cb_table
> > (c) fs/dcache.c i_dir_seq,
> >
> > However, in this scenario, WRITE_ONCE() is enough. So I will simplify and
> use that in v1.
> >
> >
> > > The commit log does not describe this either.
> > In commit log I mentioned that "ensure that update is visible".
> > I think a better commit log is, to say: "ensure that broken bit is written".
> 
> "is read repeatedly" maybe.

I updated it to below in v2.

" Hence, force reading vq->broken on each invocation of
virtqueue_is_broken() and also force writing it so that such update is visible 
to the readers."


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


RE: [PATCH 4/4] virtio_pci: Support surprise removal of virtio pci device

2021-07-19 Thread Parav Pandit via Virtualization


> From: Michael S. Tsirkin 
> Sent: Monday, July 19, 2021 5:37 PM
> 
> On Mon, Jul 19, 2021 at 05:44:49AM +, Parav Pandit wrote:
> >
> >
> > > From: Michael S. Tsirkin 
> > > Sent: Sunday, July 18, 2021 2:17 AM
> > >
> > > On Sat, Jul 17, 2021 at 10:42:58AM +0300, Parav Pandit wrote:
> > > > When a virtio pci device undergo surprise removal (aka async
> > > > removaln in
> > >
> > > typo
> > Fixing it. Checkpatch.pl and codespell, both didn't catch it. 
> >
> > >
> > > OK that's nice, but I suspect this is not enough.
> > > For example we need to also fix up all config space reads to handle
> > > all-ones patterns specially.
> > >
> > > E.g.
> > >
> > > /* After writing 0 to device_status, the driver MUST wait for a 
> > > read of
> > >  * device_status to return 0 before reinitializing the device.
> > >  * This will flush out the status write, and flush in device 
> > > writes,
> > >  * including MSI-X interrupts, if any.
> > >  */
> > > while (vp_modern_get_status(mdev))
> > > msleep(1);
> > >
> > > lots of code in drivers needs to be fixed too.
> > Above one particularly known to us in the hot plug area.
> > Above fix is needed to close the race of hot plug and unplug happening in
> parallel, which is occurring today, but less common.
> > It is in my todo list to fix it.
> > Will take care of it in near future in other series.
> >
> > >
> > > I guess we need to annotate drivers somehow to mark up whether they
> > > support surprise removal? And maybe find a way to let host know?
> > What is the benefit of it? Who can make use of that information?
> 
> For example, host could avoid removing devices by hot removal if guest is
> not ready for it. Or libosinfo could use that to tell users what to do.
Not sure how to achieve it. Because this is decided by the pci hot plug driver 
at bit early stage.
And pci hot plug slot doesn't know what is going to arrive in it.

> 
> > In virtio pci case, it is similar improvement to what nvme pci driver went
> through few years back to support hot plug/unplug.
> Let's complete this set of fixes to make it a little more robust, like nvme.
> 
> At least please mention in commit log that it's incomplete.
This fix is self-contained. I don’t see mentioning about other bugs in this 
commit log.
But sure yes, I will mentioned in the cover letter that more improvements will 
be done subsequently.
Thanks.
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [RFC v3 00/29] vDPA software assisted live migration

2021-07-19 Thread Stefan Hajnoczi
On Mon, May 24, 2021 at 07:29:06AM -0400, Michael S. Tsirkin wrote:
> On Mon, May 24, 2021 at 12:37:48PM +0200, Eugenio Perez Martin wrote:
> > On Mon, May 24, 2021 at 11:38 AM Michael S. Tsirkin  wrote:
> > >
> > > On Wed, May 19, 2021 at 06:28:34PM +0200, Eugenio Pérez wrote:
> > > > Commit 17 introduces the buffer forwarding. Previous one are for
> > > > preparations again, and laters are for enabling some obvious
> > > > optimizations. However, it needs the vdpa device to be able to map
> > > > every IOVA space, and some vDPA devices are not able to do so. Checking
> > > > of this is added in previous commits.
> > >
> > > That might become a significant limitation. And it worries me that
> > > this is such a big patchset which might yet take a while to get
> > > finalized.
> > >
> > 
> > Sorry, maybe I've been unclear here: Latter commits in this series
> > address this limitation. Still not perfect: for example, it does not
> > support adding or removing guest's memory at the moment, but this
> > should be easy to implement on top.
> > 
> > The main issue I'm observing is from the kernel if I'm not wrong: If I
> > unmap every address, I cannot re-map them again. But code in this
> > patchset is mostly final, except for the comments it may arise in the
> > mail list of course.
> > 
> > > I have an idea: how about as a first step we implement a transparent
> > > switch from vdpa to a software virtio in QEMU or a software vhost in
> > > kernel?
> > >
> > > This will give us live migration quickly with performance comparable
> > > to failover but without dependance on guest cooperation.
> > >
> > 
> > I think it should be doable. I'm not sure about the effort that needs
> > to be done in qemu to hide these "hypervisor-failover devices" from
> > guest's view but it should be comparable to failover, as you say.
> > 
> > Networking should be ok by its nature, although it could require care
> > on the host hardware setup. But I'm not sure how other types of
> > vhost/vdpa devices may work that way. How would a disk/scsi device
> > switch modes? Can the kernel take control of the vdpa device through
> > vhost, and just start reporting with a dirty bitmap?
> > 
> > Thanks!
> 
> It depends of course, e.g. blk is mostly reads/writes so
> not a lot of state. just don't reorder or drop requests.

QEMU's virtio-blk does not attempt to change states (e.g. quiesce the
device or switch between vhost kernel/QEMU, etc) while there are
in-flight requests. Instead all currently active requests must complete
(in some cases they can be cancelled to stop them early). Note that
failed requests can be kept in a list across the switch and then
resubmitted later.

The underlying storage never has requests in flight while the device is
switched. The reason QEMU does this is because there's no way to hand
over an in-flight preadv(2), Linux AIO, or other host kernel block layer
request to another process.

Stefan


signature.asc
Description: PGP signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH 4/4] virtio_pci: Support surprise removal of virtio pci device

2021-07-19 Thread Michael S. Tsirkin
On Mon, Jul 19, 2021 at 05:44:49AM +, Parav Pandit wrote:
> 
> 
> > From: Michael S. Tsirkin 
> > Sent: Sunday, July 18, 2021 2:17 AM
> > 
> > On Sat, Jul 17, 2021 at 10:42:58AM +0300, Parav Pandit wrote:
> > > When a virtio pci device undergo surprise removal (aka async removaln
> > > in
> > 
> > typo
> Fixing it. Checkpatch.pl and codespell, both didn't catch it. 
> 
> > 
> > OK that's nice, but I suspect this is not enough.
> > For example we need to also fix up all config space reads to handle all-ones
> > patterns specially.
> > 
> > E.g.
> > 
> > /* After writing 0 to device_status, the driver MUST wait for a 
> > read of
> >  * device_status to return 0 before reinitializing the device.
> >  * This will flush out the status write, and flush in device writes,
> >  * including MSI-X interrupts, if any.
> >  */
> > while (vp_modern_get_status(mdev))
> > msleep(1);
> > 
> > lots of code in drivers needs to be fixed too.
> Above one particularly known to us in the hot plug area.
> Above fix is needed to close the race of hot plug and unplug happening in 
> parallel, which is occurring today, but less common.
> It is in my todo list to fix it.
> Will take care of it in near future in other series.
> 
> > 
> > I guess we need to annotate drivers somehow to mark up whether they
> > support surprise removal? And maybe find a way to let host know?
> What is the benefit of it? Who can make use of that information?

For example, host could avoid removing devices by hot removal if guest
is not ready for it. Or libosinfo could use that to tell users what to
do.

> In virtio pci case, it is similar improvement to what nvme pci driver went 
> through few years back to support hot plug/unplug.
> Lets complete this set of fixes to make it a little more robust, like nvme.

At least please mention in commit log that it's incomplete.

> > 
> > 
> > 
> > > ---
> > >  drivers/virtio/virtio_pci_common.c | 7 +++
> > >  1 file changed, 7 insertions(+)
> > >
> > > diff --git a/drivers/virtio/virtio_pci_common.c
> > > b/drivers/virtio/virtio_pci_common.c
> > > index 222d630c41fc..b35bb2d57f62 100644
> > > --- a/drivers/virtio/virtio_pci_common.c
> > > +++ b/drivers/virtio/virtio_pci_common.c
> > > @@ -576,6 +576,13 @@ static void virtio_pci_remove(struct pci_dev
> > *pci_dev)
> > >   struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
> > >   struct device *dev = get_device(&vp_dev->vdev.dev);
> > >
> > > + /*
> > > +  * Device is marked broken on surprise removal so that virtio upper
> > > +  * layers can abort any ongoing operation.
> > > +  */
> > > + if (!pci_device_is_present(pci_dev))
> > > + virtio_break_device(&vp_dev->vdev);
> > > +
> > >   pci_disable_sriov(pci_dev);
> > >
> > >   unregister_virtio_device(&vp_dev->vdev);
> > > --
> > > 2.27.0
> 

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH 1/4] virtio: Improve vq->broken access to avoid any compiler optimization

2021-07-19 Thread Michael S. Tsirkin
On Mon, Jul 19, 2021 at 05:26:22AM +, Parav Pandit wrote:
> 
> 
> > From: Michael S. Tsirkin 
> > Sent: Sunday, July 18, 2021 2:09 AM
> > 
> > On Sat, Jul 17, 2021 at 10:42:55AM +0300, Parav Pandit wrote:
> > > Currently vq->broken field is read by virtqueue_is_broken() in busy
> > > loop in one context by virtnet_send_command().
> > >
> > > vq->broken is set to true in other process context by
> > > virtio_break_device(). Reader and writer are accessing it without any
> > > synchronization. This may lead to a compiler optimization which may
> > > result to optimize reading vq->broken only once.
> > >
> > > Hence, force reading vq->broken on each invocation of
> > > virtqueue_is_broken() and ensure that its update is visible.
> > >
> > > Fixes: e2dcdfe95c0b ("virtio: virtio_break_device() to mark all
> > > virtqueues broken.")
> > 
> > This is all theoretical right?
> > virtqueue_get_buf is not inlined so compiler generally assumes any vq field
> > can change.
> Broken bit checking cannot rely on some other kernel API for correctness.
> So it possibly not hitting this case now, but we shouldn't depend other APIs 
> usage to ensure correctness.
> 
> > 
> > I'm inclined to not include a Fixes
> > tag then. And please do change subject to say "theoretical"
> > to make that clear to people.
> > 
> I do not have any strong opinion on fixes tag. But virtio_broken_queue() API 
> should be self-contained; for that I am not sure if this just theoretical.
> Do you mean theoretical, because we haven't hit this bug?

Because with existing code I don't believe existing compilers are clever enough 
to
optimize this away.

> > > Signed-off-by: Parav Pandit 
> > > ---
> > >  drivers/virtio/virtio_ring.c | 6 --
> > >  1 file changed, 4 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/virtio/virtio_ring.c
> > > b/drivers/virtio/virtio_ring.c index 89bfe46a8a7f..7f379fe7d78d 100644
> > > --- a/drivers/virtio/virtio_ring.c
> > > +++ b/drivers/virtio/virtio_ring.c
> > > @@ -2373,7 +2373,7 @@ bool virtqueue_is_broken(struct virtqueue *_vq)
> > > {
> > >   struct vring_virtqueue *vq = to_vvq(_vq);
> > >
> > > - return vq->broken;
> > > + return READ_ONCE(vq->broken);
> > >  }
> > >  EXPORT_SYMBOL_GPL(virtqueue_is_broken);
> > >
> > > @@ -2387,7 +2387,9 @@ void virtio_break_device(struct virtio_device
> > > *dev)
> > >
> > >   list_for_each_entry(_vq, &dev->vqs, list) {
> > >   struct vring_virtqueue *vq = to_vvq(_vq);
> > > - vq->broken = true;
> > > +
> > > + /* Pairs with READ_ONCE() in virtqueue_is_broken(). */
> > > + smp_store_release(&vq->broken, true);
> > 
> > A bit puzzled here. Why do we need release semantics here?
> > IUC store_release does not generally pair with READ_ONCE - does it?
> > 
> It does; paired at few places, such as,
> 
> (a) in uverbs_main.c, default_async_file
> (b) in netlink.c, cb_table
> (c) fs/dcache.c i_dir_seq,
> 
> However, in this scenario, WRITE_ONCE() is enough. So I will simplify and use 
> that in v1.
> 
> 
> > The commit log does not describe this either.
> In commit log I mentioned that "ensure that update is visible".
> I think a better commit log is, to say: "ensure that broken bit is written".

"is read repeatedly" maybe.

> I will send v2 with 
> (a) updated comment
> (b) replacing smp.. with WRITE_ONCE()
> (c) dropping the fixes tag.
> 
> > 
> > >   }
> > >  }
> > >  EXPORT_SYMBOL_GPL(virtio_break_device);
> > > --
> > > 2.27.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next 1/2] tools: add missing infrastructure for building ptr_ring.h

2021-07-19 Thread Michael S. Tsirkin
On Mon, Jul 19, 2021 at 09:40:39AM +0800, Yunsheng Lin wrote:
> On 2021/7/18 10:09, Michael S. Tsirkin wrote:
> > On Tue, Jul 06, 2021 at 10:04:02AM +0800, Yunsheng Lin wrote:
> >> On 2021/7/6 2:39, Michael S. Tsirkin wrote:
> >>> On Mon, Jul 05, 2021 at 11:57:34AM +0800, Yunsheng Lin wrote:
> 
> [..]
> 
>  diff --git a/tools/include/asm/processor.h 
>  b/tools/include/asm/processor.h
>  new file mode 100644
>  index 000..3198ad6
>  --- /dev/null
>  +++ b/tools/include/asm/processor.h
>  @@ -0,0 +1,36 @@
>  +/* SPDX-License-Identifier: GPL-2.0 */
>  +
>  +#ifndef __TOOLS_LINUX_ASM_PROCESSOR_H
>  +#define __TOOLS_LINUX_ASM_PROCESSOR_H
>  +
>  +#include 
>  +
>  +#if defined(__i386__) || defined(__x86_64__)
>  +#include "../../arch/x86/include/asm/vdso/processor.h"
>  +#elif defined(__arm__)
>  +#include "../../arch/arm/include/asm/vdso/processor.h"
>  +#elif defined(__aarch64__)
>  +#include "../../arch/arm64/include/asm/vdso/processor.h"
>  +#elif defined(__powerpc__)
>  +#include "../../arch/powerpc/include/vdso/processor.h"
>  +#elif defined(__s390__)
>  +#include "../../arch/s390/include/vdso/processor.h"
>  +#elif defined(__sh__)
>  +#include "../../arch/sh/include/asm/processor.h"
>  +#elif defined(__sparc__)
>  +#include "../../arch/sparc/include/asm/processor.h"
>  +#elif defined(__alpha__)
>  +#include "../../arch/alpha/include/asm/processor.h"
>  +#elif defined(__mips__)
>  +#include "../../arch/mips/include/asm/vdso/processor.h"
>  +#elif defined(__ia64__)
>  +#include "../../arch/ia64/include/asm/processor.h"
>  +#elif defined(__xtensa__)
>  +#include "../../arch/xtensa/include/asm/processor.h"
>  +#elif defined(__nds32__)
>  +#include "../../arch/nds32/include/asm/processor.h"
>  +#else
>  +#define cpu_relax() sched_yield()
> >>>
> >>> Does this have a chance to work outside of kernel?
> >>
> >> I am not sure I understand what you meant here.
> >> sched_yield() is a pthread API, so it should work in the
> >> user space.
> >> And it allow the rigntest to compile when it is built on
> >> the arch which is not handled as above.
> > 
> > It might compile but is likely too heavy to behave
> > reasonably.
> > 
> > Also, given you did not actually test it I don't
> > think you should add such arch code.
> > Note you broke at least s390 here:
> > ../../arch/s390/include/vdso/processor.h
> > does not actually exist. Where these headers
> > do exit they tend to include lots of code which won't
> > build out of kernel.
> 
> You are right, it should be in:
> ../../arch/s390/include/asm/vdso/processor.h
> 
> > 
> > All this is just for cpu_relax - open coding that seems way easier.
> 
> Sure.
> 
> As Eugenio has posted a patchset to fix the compilation, which does
> not seems to be merged yet and may have some merging conflicts with
> this patchset, so either wait for the Eugenio' patchset to be merged
> before proceeding with this patchset, or explicitly note the dependency
> of Eugenio' patchset when sending the new version of patchset. I am not
> familiar with the merging flow of virtio to say which way is better, any
> suggestion how to proceed with this patchset?
> 
> 1. https://lkml.org/lkml/2021/7/6/1132
> 
> > 
> > 
> >>>
>  +#endif
> >>>
> >>> did you actually test or even test build all these arches?
> >>> Not sure we need to bother with hacks like these.
> >>
> >> Only x86_64 and arm64 arches have been built and tested.
> > 
> > In that case I think you should not add code that you
> > have not even built let alone tested.
> 
> Ok.
> 
> > 
> > 
> >> This is added referring the tools/include/asm/barrier.h.
> >>
> >>>
> >>>
>  +
> > 
> > .


I will merge Eugenio's patchset soon.

-- 
MST

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v2 4/4] virtio_pci: Support surprise removal of virtio pci device

2021-07-19 Thread Parav Pandit via Virtualization
When a virtio pci device undergo surprise removal (aka async removal in
PCIe spec), mark the device as broken so that any upper layer drivers can
abort any outstanding operation.

When a virtio net pci device undergo surprise removal which is used by a
NetworkManager, a below call trace was observed.

kernel:watchdog: BUG: soft lockup - CPU#1 stuck for 26s! [kworker/1:1:27059]
watchdog: BUG: soft lockup - CPU#1 stuck for 52s! [kworker/1:1:27059]
CPU: 1 PID: 27059 Comm: kworker/1:1 Tainted: G S  W I  L5.13.0-hotplug+ 
#8
Hardware name: Dell Inc. PowerEdge R640/0H28RR, BIOS 2.9.4 11/06/2020
Workqueue: events linkwatch_event
RIP: 0010:virtnet_send_command+0xfc/0x150 [virtio_net]
Call Trace:
 virtnet_set_rx_mode+0xcf/0x2a7 [virtio_net]
 ? __hw_addr_create_ex+0x85/0xc0
 __dev_mc_add+0x72/0x80
 igmp6_group_added+0xa7/0xd0
 ipv6_mc_up+0x3c/0x60
 ipv6_find_idev+0x36/0x80
 addrconf_add_dev+0x1e/0xa0
 addrconf_dev_config+0x71/0x130
 addrconf_notify+0x1f5/0xb40
 ? rtnl_is_locked+0x11/0x20
 ? __switch_to_asm+0x42/0x70
 ? finish_task_switch+0xaf/0x2c0
 ? raw_notifier_call_chain+0x3e/0x50
 raw_notifier_call_chain+0x3e/0x50
 netdev_state_change+0x67/0x90
 linkwatch_do_dev+0x3c/0x50
 __linkwatch_run_queue+0xd2/0x220
 linkwatch_event+0x21/0x30
 process_one_work+0x1c8/0x370
 worker_thread+0x30/0x380
 ? process_one_work+0x370/0x370
 kthread+0x118/0x140
 ? set_kthread_struct+0x40/0x40
 ret_from_fork+0x1f/0x30

Hence, add the ability to abort the command on surprise removal
which prevents infinite loop and system lockup.

Signed-off-by: Parav Pandit 
---
changelog:
v0->v1:
 - fixed typo in comment
---
 drivers/virtio/virtio_pci_common.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/virtio/virtio_pci_common.c 
b/drivers/virtio/virtio_pci_common.c
index 222d630c41fc..b35bb2d57f62 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -576,6 +576,13 @@ static void virtio_pci_remove(struct pci_dev *pci_dev)
struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
struct device *dev = get_device(_dev->vdev.dev);
 
+   /*
+* Device is marked broken on surprise removal so that virtio upper
+* layers can abort any ongoing operation.
+*/
+   if (!pci_device_is_present(pci_dev))
+   virtio_break_device(&vp_dev->vdev);
+
pci_disable_sriov(pci_dev);
 
unregister_virtio_device(&vp_dev->vdev);
-- 
2.27.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v2 2/4] virtio: Keep vring_del_virtqueue() mirror of VQ create

2021-07-19 Thread Parav Pandit via Virtualization
Keep the vring_del_virtqueue() mirror of the create routines.
i.e. to delete list entry first as it is added last during the create
routine.

Signed-off-by: Parav Pandit 
---
 drivers/virtio/virtio_ring.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index e179c7c7622c..d5934c2e5a89 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2291,6 +2291,8 @@ void vring_del_virtqueue(struct virtqueue *_vq)
 {
struct vring_virtqueue *vq = to_vvq(_vq);
 
+   list_del(&_vq->list);
+
if (vq->we_own_ring) {
if (vq->packed_ring) {
vring_free_queue(vq->vq.vdev,
@@ -2321,7 +2323,6 @@ void vring_del_virtqueue(struct virtqueue *_vq)
kfree(vq->split.desc_state);
kfree(vq->split.desc_extra);
}
-   list_del(&_vq->list);
kfree(vq);
 }
 EXPORT_SYMBOL_GPL(vring_del_virtqueue);
-- 
2.27.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v2 0/4] virtio short improvements

2021-07-19 Thread Parav Pandit via Virtualization
Hi,

This series contains small improvements for virtio pci driver.
The main idea is to support surprise removal of the virtio pci device.

Patches 1 to 3 prepare the code to handle surprise removal by marking
the device as broken in patch-4.

Patch summary:
patch-1: ensures that compiler optimization doesn't occur on vq->broken
 flag
patch-2: maintains the mirror sequence on VQ delete and VQ create
patch-3: protects vqs list for simultaneous access from reader and a writer
patch-4: handles surprise removal of virtio pci device which avoids
 call trace and system lockup

---
changelog:
v1->v2:
 - updated commit log for WRITE_ONCE usage
 - improved vqs protection patch-3 for using natural structure alignment
v0->v1:
 - fixed below comments from Michael
 - fixed typo in patch4 in comment
 - using WRITE_ONCE instead of smp_store_release()
 - using spinlock instead of rwlock
 - improved comment in patch
 - removed fixes tag in patch-1

Parav Pandit (4):
  virtio: Improve vq->broken access to avoid any compiler optimization
  virtio: Keep vring_del_virtqueue() mirror of VQ create
  virtio: Protect vqs list access
  virtio_pci: Support surprise removal of virtio pci device

 drivers/virtio/virtio.c|  1 +
 drivers/virtio/virtio_pci_common.c |  7 +++
 drivers/virtio/virtio_ring.c   | 17 ++---
 include/linux/virtio.h |  1 +
 4 files changed, 23 insertions(+), 3 deletions(-)

-- 
2.27.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v2 1/4] virtio: Improve vq->broken access to avoid any compiler optimization

2021-07-19 Thread Parav Pandit via Virtualization
Currently vq->broken field is read by virtqueue_is_broken() in busy
loop in one context by virtnet_send_command().

vq->broken is set to true in other process context by
virtio_break_device(). Reader and writer are accessing it without any
synchronization. This may lead to a compiler optimization which may
result to optimize reading vq->broken only once.

Hence, force reading vq->broken on each invocation of
virtqueue_is_broken() and also force writing it so that such
update is visible to the readers.

Signed-off-by: Parav Pandit 
---
changelog:
v1->v2:
 - updated commit log to describe WRITE_ONCE usages
v0->v1:
 - removed fixes tag from commit log
 - using WRITE_ONCE() as its enough for smp cache coherency as well as
   undesired compiler optimization
---
 drivers/virtio/virtio_ring.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 89bfe46a8a7f..e179c7c7622c 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2373,7 +2373,7 @@ bool virtqueue_is_broken(struct virtqueue *_vq)
 {
struct vring_virtqueue *vq = to_vvq(_vq);
 
-   return vq->broken;
+   return READ_ONCE(vq->broken);
 }
 EXPORT_SYMBOL_GPL(virtqueue_is_broken);
 
@@ -2387,7 +2387,9 @@ void virtio_break_device(struct virtio_device *dev)
 
list_for_each_entry(_vq, &dev->vqs, list) {
struct vring_virtqueue *vq = to_vvq(_vq);
-   vq->broken = true;
+
+   /* Pairs with READ_ONCE() in virtqueue_is_broken(). */
+   WRITE_ONCE(vq->broken, true);
}
 }
 EXPORT_SYMBOL_GPL(virtio_break_device);
-- 
2.27.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v2 3/4] virtio: Protect vqs list access

2021-07-19 Thread Parav Pandit via Virtualization
VQs may be accessed to mark the device broken while they are
created/destroyed. Hence protect the access to the vqs list.

Fixes: e2dcdfe95c0b ("virtio: virtio_break_device() to mark all virtqueues 
broken.")
Signed-off-by: Parav Pandit 
---
changelog:
v1->v2:
 - use the hole in current struct to place vqs_list_lock to have natural packing
v0->v1:
 - use spinlock instead of rwlock
---
 drivers/virtio/virtio.c  | 1 +
 drivers/virtio/virtio_ring.c | 8 
 include/linux/virtio.h   | 1 +
 3 files changed, 10 insertions(+)

diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 4b15c00c0a0a..49984d2cba24 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -355,6 +355,7 @@ int register_virtio_device(struct virtio_device *dev)
virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
 
INIT_LIST_HEAD(&dev->vqs);
+   spin_lock_init(&dev->vqs_list_lock);
 
/*
 * device_add() causes the bus infrastructure to look for a matching
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index d5934c2e5a89..c2aaa0eff6df 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1755,7 +1755,9 @@ static struct virtqueue *vring_create_virtqueue_packed(
cpu_to_le16(vq->packed.event_flags_shadow);
}
 
+   spin_lock(&vdev->vqs_list_lock);
list_add_tail(&vq->vq.list, &vdev->vqs);
+   spin_unlock(&vdev->vqs_list_lock);
return &vq->vq;
 
 err_desc_extra:
@@ -2229,7 +2231,9 @@ struct virtqueue *__vring_new_virtqueue(unsigned int 
index,
memset(vq->split.desc_state, 0, vring.num *
sizeof(struct vring_desc_state_split));
 
+   spin_lock(&vdev->vqs_list_lock);
list_add_tail(&vq->vq.list, &vdev->vqs);
+   spin_unlock(&vdev->vqs_list_lock);
return &vq->vq;
 
 err_extra:
@@ -2291,7 +2295,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
 {
struct vring_virtqueue *vq = to_vvq(_vq);
 
+   spin_lock(&vq->vq.vdev->vqs_list_lock);
list_del(&_vq->list);
+   spin_unlock(&vq->vq.vdev->vqs_list_lock);
 
if (vq->we_own_ring) {
if (vq->packed_ring) {
@@ -2386,12 +2392,14 @@ void virtio_break_device(struct virtio_device *dev)
 {
struct virtqueue *_vq;
 
+   spin_lock(&dev->vqs_list_lock);
list_for_each_entry(_vq, &dev->vqs, list) {
struct vring_virtqueue *vq = to_vvq(_vq);

/* Pairs with READ_ONCE() in virtqueue_is_broken(). */
WRITE_ONCE(vq->broken, true);
}
+   spin_unlock(&dev->vqs_list_lock);
 }
 EXPORT_SYMBOL_GPL(virtio_break_device);
 
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index b1894e0323fa..41edbc01ffa4 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -110,6 +110,7 @@ struct virtio_device {
bool config_enabled;
bool config_change_pending;
spinlock_t config_lock;
+   spinlock_t vqs_list_lock; /* Protects VQs list access */
struct device dev;
struct virtio_device_id id;
const struct virtio_config_ops *config;
-- 
2.27.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 1/5] dt-bindings: virtio: mmio: Add support for device subnode

2021-07-19 Thread Viresh Kumar
On 14-07-21, 23:07, Arnd Bergmann wrote:
> On Wed, Jul 14, 2021 at 5:43 PM Rob Herring  wrote:
> > I guess it comes down to is 'virtio,mmio' providing a bus or is it
> > just a device? I guess a bus (so 2 nodes) does make sense here.
> > 'virtio,mmio' defines how you access/discover the virtio queues (the
> > bus) and the functional device (i2c, gpio, iommu, etc.) is accessed
> > via the virtio queues.
> 
> It's not really a bus since there is only ever one device behind it.
> A better analogy would be your 'serdev' framework: You could
> have a 8250 or a pl011 uart, and behind that have a mouse, GPS
> receiver or bluetooth dongle.
> 
> In Documentation/devicetree/bindings/serial/serial.yaml, you also
> have two nodes for a single device, so we could follow that
> example.

So two device nodes is final then ? Pretty much like how this patchset did it
already ? I need to get rid of reg thing and use "virtio,DID" though.

-- 
viresh
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 4/4] virtio_pci: Support surprise removal of virtio pci device

2021-07-19 Thread Cornelia Huck
On Sat, Jul 17 2021, "Michael S. Tsirkin"  wrote:

> On Sat, Jul 17, 2021 at 10:42:58AM +0300, Parav Pandit wrote:
>> When a virtio pci device undergo surprise removal (aka async removaln in
>
> typo
>
>> PCIe spec), mark the device is broken so that any upper layer drivers can
>> abort any outstanding operation.
>> 
>> When a virtio net pci device undergo surprise removal which is used by a
>> NetworkManager, a below call trace was observed.
>> 
>> kernel:watchdog: BUG: soft lockup - CPU#1 stuck for 26s! [kworker/1:1:27059]
>> watchdog: BUG: soft lockup - CPU#1 stuck for 52s! [kworker/1:1:27059]
>> CPU: 1 PID: 27059 Comm: kworker/1:1 Tainted: G S  W I  L
>> 5.13.0-hotplug+ #8
>> Hardware name: Dell Inc. PowerEdge R640/0H28RR, BIOS 2.9.4 11/06/2020
>> Workqueue: events linkwatch_event
>> RIP: 0010:virtnet_send_command+0xfc/0x150 [virtio_net]
>> Call Trace:
>>  virtnet_set_rx_mode+0xcf/0x2a7 [virtio_net]
>>  ? __hw_addr_create_ex+0x85/0xc0
>>  __dev_mc_add+0x72/0x80
>>  igmp6_group_added+0xa7/0xd0
>>  ipv6_mc_up+0x3c/0x60
>>  ipv6_find_idev+0x36/0x80
>>  addrconf_add_dev+0x1e/0xa0
>>  addrconf_dev_config+0x71/0x130
>>  addrconf_notify+0x1f5/0xb40
>>  ? rtnl_is_locked+0x11/0x20
>>  ? __switch_to_asm+0x42/0x70
>>  ? finish_task_switch+0xaf/0x2c0
>>  ? raw_notifier_call_chain+0x3e/0x50
>>  raw_notifier_call_chain+0x3e/0x50
>>  netdev_state_change+0x67/0x90
>>  linkwatch_do_dev+0x3c/0x50
>>  __linkwatch_run_queue+0xd2/0x220
>>  linkwatch_event+0x21/0x30
>>  process_one_work+0x1c8/0x370
>>  worker_thread+0x30/0x380
>>  ? process_one_work+0x370/0x370
>>  kthread+0x118/0x140
>>  ? set_kthread_struct+0x40/0x40
>>  ret_from_fork+0x1f/0x30
>> 
>> Hence, add the ability to abort the command on surprise removal
>> which prevents infinite loop and system lockup.
>> 
>> Signed-off-by: Parav Pandit 
>
> OK that's nice, but I suspect this is not enough.
> For example we need to also fix up all config space reads
> to handle all-ones patterns specially.
>
> E.g.
>
> /* After writing 0 to device_status, the driver MUST wait for a read 
> of
>  * device_status to return 0 before reinitializing the device.
>  * This will flush out the status write, and flush in device writes,
>  * including MSI-X interrupts, if any.
>  */
> while (vp_modern_get_status(mdev))
> msleep(1);
>
> lots of code in drivers needs to be fixed too.
>
> I guess we need to annotate drivers somehow to mark up
> whether they support surprise removal? And maybe find a
> way to let host know?

I'm wondering whether virtio-pci surprise removal would need more
support in drivers than virtio-ccw surprise removal; given that
virtio-ccw *only* supports surprise removal and I don't remember any
problem reports, the situation is probably not that bad.

Is surprise removal of block devices still a big problem? We have some
support for (non-virtio) ccw devices (e.g. dasd) via a 'disconnected'
state that was designed to mitigate problems with block devices that are
suddenly gone.

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v1 2/4] virtio: Keep vring_del_virtqueue() mirror of VQ create

2021-07-19 Thread Parav Pandit via Virtualization
Keep the vring_del_virtqueue() mirror of the create routines.
i.e. to delete list entry first as it is added last during the create
routine.

Signed-off-by: Parav Pandit 
---
 drivers/virtio/virtio_ring.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index e179c7c7622c..d5934c2e5a89 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2291,6 +2291,8 @@ void vring_del_virtqueue(struct virtqueue *_vq)
 {
struct vring_virtqueue *vq = to_vvq(_vq);
 
+   list_del(&_vq->list);
+
if (vq->we_own_ring) {
if (vq->packed_ring) {
vring_free_queue(vq->vq.vdev,
@@ -2321,7 +2323,6 @@ void vring_del_virtqueue(struct virtqueue *_vq)
kfree(vq->split.desc_state);
kfree(vq->split.desc_extra);
}
-   list_del(&_vq->list);
kfree(vq);
 }
 EXPORT_SYMBOL_GPL(vring_del_virtqueue);
-- 
2.27.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v1 4/4] virtio_pci: Support surprise removal of virtio pci device

2021-07-19 Thread Parav Pandit via Virtualization
When a virtio pci device undergoes surprise removal (aka async removal in
PCIe spec), mark the device as broken so that any upper layer drivers can
abort any outstanding operation.

When a virtio net pci device in use by NetworkManager undergoes surprise
removal, the below call trace was observed.

kernel:watchdog: BUG: soft lockup - CPU#1 stuck for 26s! [kworker/1:1:27059]
watchdog: BUG: soft lockup - CPU#1 stuck for 52s! [kworker/1:1:27059]
CPU: 1 PID: 27059 Comm: kworker/1:1 Tainted: G S  W I  L5.13.0-hotplug+ 
#8
Hardware name: Dell Inc. PowerEdge R640/0H28RR, BIOS 2.9.4 11/06/2020
Workqueue: events linkwatch_event
RIP: 0010:virtnet_send_command+0xfc/0x150 [virtio_net]
Call Trace:
 virtnet_set_rx_mode+0xcf/0x2a7 [virtio_net]
 ? __hw_addr_create_ex+0x85/0xc0
 __dev_mc_add+0x72/0x80
 igmp6_group_added+0xa7/0xd0
 ipv6_mc_up+0x3c/0x60
 ipv6_find_idev+0x36/0x80
 addrconf_add_dev+0x1e/0xa0
 addrconf_dev_config+0x71/0x130
 addrconf_notify+0x1f5/0xb40
 ? rtnl_is_locked+0x11/0x20
 ? __switch_to_asm+0x42/0x70
 ? finish_task_switch+0xaf/0x2c0
 ? raw_notifier_call_chain+0x3e/0x50
 raw_notifier_call_chain+0x3e/0x50
 netdev_state_change+0x67/0x90
 linkwatch_do_dev+0x3c/0x50
 __linkwatch_run_queue+0xd2/0x220
 linkwatch_event+0x21/0x30
 process_one_work+0x1c8/0x370
 worker_thread+0x30/0x380
 ? process_one_work+0x370/0x370
 kthread+0x118/0x140
 ? set_kthread_struct+0x40/0x40
 ret_from_fork+0x1f/0x30

Hence, add the ability to abort the command on surprise removal
which prevents infinite loop and system lockup.

Signed-off-by: Parav Pandit 
---
changelog:
v0->v1:
 - fixed typo in comment
---
 drivers/virtio/virtio_pci_common.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/virtio/virtio_pci_common.c 
b/drivers/virtio/virtio_pci_common.c
index 222d630c41fc..b35bb2d57f62 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -576,6 +576,13 @@ static void virtio_pci_remove(struct pci_dev *pci_dev)
struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
struct device *dev = get_device(&vp_dev->vdev.dev);
 
+   /*
+* Device is marked broken on surprise removal so that virtio upper
+* layers can abort any ongoing operation.
+*/
+   if (!pci_device_is_present(pci_dev))
+   virtio_break_device(&vp_dev->vdev);
+
pci_disable_sriov(pci_dev);
 
unregister_virtio_device(&vp_dev->vdev);
-- 
2.27.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v1 1/4] virtio: Improve vq->broken access to avoid any compiler optimization

2021-07-19 Thread Parav Pandit via Virtualization
Currently vq->broken field is read by virtqueue_is_broken() in busy
loop in one context by virtnet_send_command().

vq->broken is set to true in other process context by
virtio_break_device(). Reader and writer are accessing it without any
synchronization. This may lead to a compiler optimization which may
result to optimize reading vq->broken only once.

Hence, force reading vq->broken on each invocation of
virtqueue_is_broken() and ensure that its update is visible.

Signed-off-by: Parav Pandit 
---
changelog:
v0->v1:
 - removed fixes tag from commit log
 - using WRITE_ONCE() as its enough for smp cache coherency as well as
   undesired compiler optimization
---
 drivers/virtio/virtio_ring.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 89bfe46a8a7f..e179c7c7622c 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2373,7 +2373,7 @@ bool virtqueue_is_broken(struct virtqueue *_vq)
 {
struct vring_virtqueue *vq = to_vvq(_vq);
 
-   return vq->broken;
+   return READ_ONCE(vq->broken);
 }
 EXPORT_SYMBOL_GPL(virtqueue_is_broken);
 
@@ -2387,7 +2387,9 @@ void virtio_break_device(struct virtio_device *dev)
 
list_for_each_entry(_vq, &dev->vqs, list) {
struct vring_virtqueue *vq = to_vvq(_vq);
-   vq->broken = true;
+
+   /* Pairs with READ_ONCE() in virtqueue_is_broken(). */
+   WRITE_ONCE(vq->broken, true);
}
 }
 EXPORT_SYMBOL_GPL(virtio_break_device);
-- 
2.27.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v1 3/4] virtio: Protect vqs list access

2021-07-19 Thread Parav Pandit via Virtualization
VQs may be accessed to mark the device broken while they are
created/destroyed. Hence protect the access to the vqs list.

Fixes: e2dcdfe95c0b ("virtio: virtio_break_device() to mark all virtqueues 
broken.")
Signed-off-by: Parav Pandit 
---
changelog:
v0->v1:
 - use spinlock instead of rwlock
---
 drivers/virtio/virtio.c  | 1 +
 drivers/virtio/virtio_ring.c | 8 
 include/linux/virtio.h   | 1 +
 3 files changed, 10 insertions(+)

diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 4b15c00c0a0a..49984d2cba24 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -355,6 +355,7 @@ int register_virtio_device(struct virtio_device *dev)
virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
 
INIT_LIST_HEAD(&dev->vqs);
+   spin_lock_init(&dev->vqs_list_lock);
 
/*
 * device_add() causes the bus infrastructure to look for a matching
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index d5934c2e5a89..c2aaa0eff6df 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1755,7 +1755,9 @@ static struct virtqueue *vring_create_virtqueue_packed(
cpu_to_le16(vq->packed.event_flags_shadow);
}
 
+   spin_lock(&vdev->vqs_list_lock);
list_add_tail(&vq->vq.list, &vdev->vqs);
+   spin_unlock(&vdev->vqs_list_lock);
return &vq->vq;
 
 err_desc_extra:
@@ -2229,7 +2231,9 @@ struct virtqueue *__vring_new_virtqueue(unsigned int 
index,
memset(vq->split.desc_state, 0, vring.num *
sizeof(struct vring_desc_state_split));
 
+   spin_lock(&vdev->vqs_list_lock);
list_add_tail(&vq->vq.list, &vdev->vqs);
+   spin_unlock(&vdev->vqs_list_lock);
return &vq->vq;
 
 err_extra:
@@ -2291,7 +2295,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
 {
struct vring_virtqueue *vq = to_vvq(_vq);
 
+   spin_lock(&vq->vq.vdev->vqs_list_lock);
list_del(&_vq->list);
+   spin_unlock(&vq->vq.vdev->vqs_list_lock);
 
if (vq->we_own_ring) {
if (vq->packed_ring) {
@@ -2386,12 +2392,14 @@ void virtio_break_device(struct virtio_device *dev)
 {
struct virtqueue *_vq;
 
+   spin_lock(&dev->vqs_list_lock);
list_for_each_entry(_vq, &dev->vqs, list) {
struct vring_virtqueue *vq = to_vvq(_vq);

/* Pairs with READ_ONCE() in virtqueue_is_broken(). */
WRITE_ONCE(vq->broken, true);
}
+   spin_unlock(&dev->vqs_list_lock);
 }
 EXPORT_SYMBOL_GPL(virtio_break_device);
 
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index b1894e0323fa..e6cda4137e7b 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -115,6 +115,7 @@ struct virtio_device {
const struct virtio_config_ops *config;
const struct vringh_config_ops *vringh_config;
struct list_head vqs;
+   spinlock_t vqs_list_lock; /* Protects VQs list access */
u64 features;
void *priv;
 };
-- 
2.27.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v1 0/4] virtio short improvements

2021-07-19 Thread Parav Pandit via Virtualization
Hi,

This series contains small improvements for virtio pci driver.
The main idea support surprise removal of virtio pci device.

Patches 1 to 3 prepare the code to handle surprise removal by marking
the device as broken in patch-4.

Patch summary:
patch-1: ensures that compiler optimization doesn't occur on vq->broken
 flag
patch-2: maintains the mirror sequence on VQ delete and VQ create
patch-3: protects vqs list for simultaneous access from reader and a writer
patch-4: handles surprise removal of virtio pci device which avoids
 call trace and system lockup

---
changelog:
v0->v1:
 - fixed below comments from Michael
 - fixed typo in patch4 in comment
 - using WRITE_ONCE instead of smp_store_release()
 - using spinlock instead of rwlock
 - improved comment in patch
 - removed fixes tag in patch-1

Parav Pandit (4):
  virtio: Improve vq->broken access to avoid any compiler optimization
  virtio: Keep vring_del_virtqueue() mirror of VQ create
  virtio: Protect vqs list access
  virtio_pci: Support surprise removal of virtio pci device

 drivers/virtio/virtio.c|  1 +
 drivers/virtio/virtio_pci_common.c |  7 +++
 drivers/virtio/virtio_ring.c   | 17 ++---
 include/linux/virtio.h |  1 +
 4 files changed, 23 insertions(+), 3 deletions(-)

-- 
2.27.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization