Re: [PATCH v5 08/11] vduse: Implement an MMU-based IOMMU driver

2021-03-25 Thread Jason Wang


On 2021/3/25 3:38 PM, Yongji Xie wrote:

On Thu, Mar 25, 2021 at 12:53 PM Jason Wang  wrote:


On 2021/3/24 3:39 PM, Yongji Xie wrote:

On Wed, Mar 24, 2021 at 11:54 AM Jason Wang  wrote:

On 2021/3/15 1:37 PM, Xie Yongji wrote:

This implements an MMU-based IOMMU driver to support mapping
kernel DMA buffers into userspace. The basic idea behind it is
to treat the MMU (VA->PA) as an IOMMU (IOVA->PA). The driver sets
up MMU mappings instead of IOMMU mappings for the DMA transfer so
that the userspace process is able to use its virtual addresses to
access the DMA buffer in the kernel.

To avoid security issues, a bounce-buffering mechanism is
introduced to prevent userspace from accessing the original buffer
directly.

Signed-off-by: Xie Yongji 
---
drivers/vdpa/vdpa_user/iova_domain.c | 535 +++
drivers/vdpa/vdpa_user/iova_domain.h |  75 +
2 files changed, 610 insertions(+)
create mode 100644 drivers/vdpa/vdpa_user/iova_domain.c
create mode 100644 drivers/vdpa/vdpa_user/iova_domain.h
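
To make the bounce-buffer bookkeeping in the functions below easier to
follow, the per-page state those helpers manipulate is roughly the
following (a conceptual sketch with hypothetical names; the real
definitions live in iova_domain.h and may differ):

/* Conceptual sketch, not the patch's actual definitions: one entry per
 * page of bounce IOVA space, holding the bounce page itself plus the
 * original physical address of each IOVA_ALLOC_SIZE slot currently
 * bounced through it (INVALID_PHYS_ADDR when the slot is unused). */
struct bounce_map_sketch {
	struct page *bounce_page;
	u64 orig_phys[PAGE_SIZE >> IOVA_ALLOC_ORDER];
};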

diff --git a/drivers/vdpa/vdpa_user/iova_domain.c 
b/drivers/vdpa/vdpa_user/iova_domain.c
new file mode 100644
index ..83de216b0e51
--- /dev/null
+++ b/drivers/vdpa/vdpa_user/iova_domain.c
@@ -0,0 +1,535 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MMU-based IOMMU implementation
+ *
+ * Copyright (C) 2020 Bytedance Inc. and/or its affiliates. All rights 
reserved.

2021 as well.


Sure.


+ *
+ * Author: Xie Yongji 
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "iova_domain.h"
+
+static int vduse_iotlb_add_range(struct vduse_iova_domain *domain,
+  u64 start, u64 last,
+  u64 addr, unsigned int perm,
+  struct file *file, u64 offset)
+{
+ struct vdpa_map_file *map_file;
+ int ret;
+
+ map_file = kmalloc(sizeof(*map_file), GFP_ATOMIC);
+ if (!map_file)
+ return -ENOMEM;
+
+ map_file->file = get_file(file);
+ map_file->offset = offset;
+
+ ret = vhost_iotlb_add_range_ctx(domain->iotlb, start, last,
+ addr, perm, map_file);
+ if (ret) {
+ fput(map_file->file);
+ kfree(map_file);
+ return ret;
+ }
+ return 0;
+}
+
+static void vduse_iotlb_del_range(struct vduse_iova_domain *domain,
+   u64 start, u64 last)
+{
+ struct vdpa_map_file *map_file;
+ struct vhost_iotlb_map *map;
+
+ while ((map = vhost_iotlb_itree_first(domain->iotlb, start, last))) {
+ map_file = (struct vdpa_map_file *)map->opaque;
+ fput(map_file->file);
+ kfree(map_file);
+ vhost_iotlb_map_free(domain->iotlb, map);
+ }
+}
+
+int vduse_domain_set_map(struct vduse_iova_domain *domain,
+  struct vhost_iotlb *iotlb)
+{
+ struct vdpa_map_file *map_file;
+ struct vhost_iotlb_map *map;
+ u64 start = 0ULL, last = ULLONG_MAX;
+ int ret;
+
+ spin_lock(&domain->iotlb_lock);
+ vduse_iotlb_del_range(domain, start, last);
+
+ for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
+  map = vhost_iotlb_itree_next(map, start, last)) {
+ map_file = (struct vdpa_map_file *)map->opaque;
+ ret = vduse_iotlb_add_range(domain, map->start, map->last,
+ map->addr, map->perm,
+ map_file->file,
+ map_file->offset);
+ if (ret)
+ goto err;
+ }
+ spin_unlock(&domain->iotlb_lock);
+
+ return 0;
+err:
+ vduse_iotlb_del_range(domain, start, last);
+ spin_unlock(&domain->iotlb_lock);
+ return ret;
+}
+
+static void vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
+  u64 iova, u64 size, u64 paddr)
+{
+ struct vduse_bounce_map *map;
+ unsigned int index;
+ u64 last = iova + size - 1;
+
+ while (iova < last) {
+ map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+ index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
+ map->orig_phys[index] = paddr;
+ paddr += IOVA_ALLOC_SIZE;
+ iova += IOVA_ALLOC_SIZE;
+ }
+}
+
+static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain,
+u64 iova, u64 size)
+{
+ struct vduse_bounce_map *map;
+ unsigned int index;
+ u64 last = iova + size - 1;
+
+ while (iova < last) {
+ map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+ index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
+ map->orig_phys[index] = INVALID_PHYS_ADDR;
+ iova += IOVA_ALLOC_SIZE;
+ }
+}
+
+static void do_bounce(phys_addr_t orig, void *addr, size_t size,
+   enum dma_data_direction dir)
+{
+ unsigned long pfn = PFN_DOWN(orig);
+
+ if 

Re: drm/ttm: switch to per device LRU lock

2021-03-25 Thread Christian König

Thanks! Just a copy issue.

Patch to fix this is on the mailing list.

Christian.

On 25.03.21 at 16:00, Colin Ian King wrote:

Hi,

Static analysis with Coverity in linux-next has detected an issue in
drivers/gpu/drm/ttm/ttm_bo.c with the following commit:

commit a1f091f8ef2b680a5184db065527612247cb4cae
Author: Christian König 
Date:   Tue Oct 6 17:26:42 2020 +0200

 drm/ttm: switch to per device LRU lock

 Instead of having a global lock for potentially less contention.


The analysis is as follows:

617 int ttm_mem_evict_first(struct ttm_device *bdev,
618                         struct ttm_resource_manager *man,
619                         const struct ttm_place *place,
620                         struct ttm_operation_ctx *ctx,
621                         struct ww_acquire_ctx *ticket)
622 {
    1. assign_zero: Assigning: bo = NULL.

623         struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
624         bool locked = false;
625         unsigned i;
626         int ret;
627

    Explicit null dereferenced (FORWARD_NULL) 2. var_deref_op:
    Dereferencing null pointer bo.

628         spin_lock(&bo->bdev->lru_lock);
629         for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {

The spin_lock on bo is dereferencing a null bo pointer.

Colin
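
The fix Christian refers to is not quoted here; presumably it takes the
LRU lock through the bdev argument that ttm_mem_evict_first() already
receives, instead of through the still-NULL bo, i.e. something of this
shape (a minimal sketch, not the actual posted patch):

-	spin_lock(&bo->bdev->lru_lock);
+	spin_lock(&bdev->lru_lock);
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {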



re: drm/ttm: switch to per device LRU lock

2021-03-25 Thread Colin Ian King
Hi,

Static analysis with Coverity in linux-next has detected an issue in
drivers/gpu/drm/ttm/ttm_bo.c with the following commit:

commit a1f091f8ef2b680a5184db065527612247cb4cae
Author: Christian König 
Date:   Tue Oct 6 17:26:42 2020 +0200

drm/ttm: switch to per device LRU lock

Instead of having a global lock for potentially less contention.


The analysis is as follows:

617 int ttm_mem_evict_first(struct ttm_device *bdev,
618                         struct ttm_resource_manager *man,
619                         const struct ttm_place *place,
620                         struct ttm_operation_ctx *ctx,
621                         struct ww_acquire_ctx *ticket)
622 {
    1. assign_zero: Assigning: bo = NULL.

623         struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
624         bool locked = false;
625         unsigned i;
626         int ret;
627

    Explicit null dereferenced (FORWARD_NULL) 2. var_deref_op:
    Dereferencing null pointer bo.

628         spin_lock(&bo->bdev->lru_lock);
629         for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {

The spin_lock on bo is dereferencing a null bo pointer.

Colin

Re: [PATCH v2 00/10] drm: Support simple-framebuffer devices and firmware fbs

2021-03-25 Thread Hans de Goede
Hi,

On 3/18/21 11:29 AM, Thomas Zimmermann wrote:
> This patchset adds support for simple-framebuffer platform devices and
> a handover mechanism for native drivers to take-over control of the
> hardware.
> 
> The new driver, called simpledrm, binds to a simple-framebuffer platform
> device. The kernel's boot code creates such devices for firmware-provided
> framebuffers, such as EFI-GOP or VESA. Typically the BIOS, UEFI or boot
> loader sets up the framebuffers. Description via device tree is also an
> option.
> 
> Simpledrm is small enough to be linked into the kernel. The driver's main
> purpose is to provide graphical output during the early phases of the boot
> process, before the native DRM drivers are available. Native drivers are
> typically loaded from an initrd ram disk. Occasionally simpledrm can also
> serve as interim solution on graphics hardware without native DRM driver.
> 
> So far distributions rely on fbdev drivers, such as efifb, vesafb or
> simplefb, for early-boot graphical output. However fbdev is deprecated and
> the drivers do not provide DRM interfaces for modern userspace.
> 
> Patches 1 and 2 prepare the DRM format helpers for simpledrm.
> 
> Patches 3 and 4 add a hand-over mechanism. Simpledrm acquires its
> framebuffer's I/O-memory range and provides a callback function to be
> removed by a native driver. The native driver will remove simpledrm before
> taking over the hardware. The removal is integrated into existing helpers,
> so drivers use it automatically.
> 
> Patches 5 to 10 add the simpledrm driver. It's built on simple DRM helpers
> and SHMEM. It supports 16-bit, 24-bit and 32-bit RGB framebuffers. During
> pageflips, SHMEM buffers are copied into the framebuffer memory, similar
> to cirrus or mgag200. The code in patches 8 and 9 handles clocks and
> regulators. It's based on the simplefb drivers, but has been modified for
> DRM.

Thank you for your work on this, this is very interesting.

> I've also been working on fastboot support (i.e., flicker-free booting).
> This requires state-readout from simpledrm via generic interfaces, as
> outlined in [1]. I do have some prototype code, but it will take a while
> to get this ready. Simpledrm will then support it.
> 
> I've tested simpledrm with x86 EFI and VESA framebuffers, which both work
> reliably. The fbdev console and Weston work automatically. Xorg requires
> manual configuration of the device. Xorg's current modesetting driver does
> not work with both, platform and PCI device, for the same physical
> hardware. Once configured, X11 works. I looked into X11, but couldn't see
> an easy way of fixing the problem. With the push towards Wayland+Xwayland
> I expect the problem to become a non-issue soon. Additional testing has
> been reported at [2].
> 
> One cosmetic issue is that simpledrm's device file is card0 and the
> native driver's device file is card1. After simpledrm has been kicked out,
> only card1 is left. This does not seem to be a practical problem however.
> 
> TODO/IDEAS:
> 
>   * provide deferred takeover

I'm not sure what you mean by this? Currently deferred takeover is
handled in the fbcon code. Current flicker-free boot works like this
(assuming a single LCD panel in a laptop):

1. EFI/GOP sets up the framebuffer and draws a vendor logo.
2. The bootloader runs in silent mode and does not touch anything gfx related.
3. The kernel boots with a loglevel of 3, so only CRIT/EMERG messages are shown.
4. efifb loads and tells fbcon that a framebuffer is now available for it to
   "bind" to. Since CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER=y, fbcon defers
   taking over the console and leaves the dummy-console driver in place (unless
   there have already been kernel messages logged, which there shouldn't be
   because loglevel=3).
5. i915 loads, reads out the hw state and compares it to the preferred mode for
   the panel which it would set; they match, so nothing happens. i915 takes
   ownership of the scanout buffer set up by the GOP, but leaves it in place.
   i915 also removes the efifb /dev/fb0 and installs its own /dev/fb0 fbdev
   compat device; fbcon is notified of this, but is still deferred and leaves
   the dummy console driver in place as console driver.
6. Plymouth loads and allocates a new scanout buffer at the panel's preferred
   resolution. Plymouth reads the vendor logo through the BGRT ACPI interface
   and fills the scanout buffer with the vendor logo + a spinner. Then Plymouth
   installs the new scanout buffer on the crtc; this is done atomically during
   vsync, so the user sees no changes other than the spinner appearing.
   (Note the active VT is now in graphical mode.)
7. From here on, not flickering is a userspace problem.

AFAICT this should work fine with simplekms too, unless it clears the screen
to black when it binds.

In addition to the above sequence: if at any time either the kernel or userspace
prints a message to the console, and at that time an fbdev is registered, then
fbcon

[PATCH AUTOSEL 4.9 07/13] vhost: Fix vhost_vq_reset()

2021-03-25 Thread Sasha Levin
From: Laurent Vivier 

[ Upstream commit beb691e69f4dec7bfe8b81b509848acfd1f0dbf9 ]

vhost_reset_is_le() is vhost_init_is_le(), and in the case of
cross-endian legacy, vhost_init_is_le() depends on vq->user_be.

vq->user_be is set by vhost_disable_cross_endian().

But in vhost_vq_reset(), we have:

vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);

And so user_be is used before being set.

To fix that, reverse the lines order as there is no other dependency
between them.
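
For context, the helpers involved look roughly like this under
CONFIG_VHOST_CROSS_ENDIAN_LEGACY (a paraphrased sketch, not the exact
code from drivers/vhost/vhost.c):

static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
	/* Gives user_be its default (native-endian) value. */
	vq->user_be = !virtio_legacy_is_little_endian();
}

static void vhost_init_is_le(struct vhost_virtqueue *vq)
{
	/* Reads user_be, so it must run after the helper above. */
	vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be;
}

Since vhost_reset_is_le() boils down to vhost_init_is_le(), the call to
vhost_disable_cross_endian() has to come first.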

Signed-off-by: Laurent Vivier 
Link: https://lore.kernel.org/r/20210312140913.788592-1-lviv...@redhat.com
Signed-off-by: Michael S. Tsirkin 
Signed-off-by: Sasha Levin 
---
 drivers/vhost/vhost.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index b14e62f11075..d2431afeda84 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -306,8 +306,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->call_ctx = NULL;
vq->call = NULL;
vq->log_ctx = NULL;
-   vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);
+   vhost_reset_is_le(vq);
vq->busyloop_timeout = 0;
vq->umem = NULL;
vq->iotlb = NULL;
-- 
2.30.1



[PATCH AUTOSEL 4.14 10/16] vhost: Fix vhost_vq_reset()

2021-03-25 Thread Sasha Levin
From: Laurent Vivier 

[ Upstream commit beb691e69f4dec7bfe8b81b509848acfd1f0dbf9 ]

vhost_reset_is_le() is vhost_init_is_le(), and in the case of
cross-endian legacy, vhost_init_is_le() depends on vq->user_be.

vq->user_be is set by vhost_disable_cross_endian().

But in vhost_vq_reset(), we have:

vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);

And so user_be is used before being set.

To fix that, reverse the lines order as there is no other dependency
between them.

Signed-off-by: Laurent Vivier 
Link: https://lore.kernel.org/r/20210312140913.788592-1-lviv...@redhat.com
Signed-off-by: Michael S. Tsirkin 
Signed-off-by: Sasha Levin 
---
 drivers/vhost/vhost.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 3d7bea15c57b..4b5590f4e98b 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -324,8 +324,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->call_ctx = NULL;
vq->call = NULL;
vq->log_ctx = NULL;
-   vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);
+   vhost_reset_is_le(vq);
vq->busyloop_timeout = 0;
vq->umem = NULL;
vq->iotlb = NULL;
-- 
2.30.1



[PATCH AUTOSEL 4.19 12/20] vhost: Fix vhost_vq_reset()

2021-03-25 Thread Sasha Levin
From: Laurent Vivier 

[ Upstream commit beb691e69f4dec7bfe8b81b509848acfd1f0dbf9 ]

vhost_reset_is_le() is vhost_init_is_le(), and in the case of
cross-endian legacy, vhost_init_is_le() depends on vq->user_be.

vq->user_be is set by vhost_disable_cross_endian().

But in vhost_vq_reset(), we have:

vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);

And so user_be is used before being set.

To fix that, reverse the lines order as there is no other dependency
between them.

Signed-off-by: Laurent Vivier 
Link: https://lore.kernel.org/r/20210312140913.788592-1-lviv...@redhat.com
Signed-off-by: Michael S. Tsirkin 
Signed-off-by: Sasha Levin 
---
 drivers/vhost/vhost.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 98b6eb902df9..732327756ee1 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -322,8 +322,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->kick = NULL;
vq->call_ctx = NULL;
vq->log_ctx = NULL;
-   vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);
+   vhost_reset_is_le(vq);
vq->busyloop_timeout = 0;
vq->umem = NULL;
vq->iotlb = NULL;
-- 
2.30.1



[PATCH AUTOSEL 5.4 16/24] vhost: Fix vhost_vq_reset()

2021-03-25 Thread Sasha Levin
From: Laurent Vivier 

[ Upstream commit beb691e69f4dec7bfe8b81b509848acfd1f0dbf9 ]

vhost_reset_is_le() is vhost_init_is_le(), and in the case of
cross-endian legacy, vhost_init_is_le() depends on vq->user_be.

vq->user_be is set by vhost_disable_cross_endian().

But in vhost_vq_reset(), we have:

vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);

And so user_be is used before being set.

To fix that, reverse the lines order as there is no other dependency
between them.

Signed-off-by: Laurent Vivier 
Link: https://lore.kernel.org/r/20210312140913.788592-1-lviv...@redhat.com
Signed-off-by: Michael S. Tsirkin 
Signed-off-by: Sasha Levin 
---
 drivers/vhost/vhost.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 57ab79fbcee9..a279ecacbf60 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -320,8 +320,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->kick = NULL;
vq->call_ctx = NULL;
vq->log_ctx = NULL;
-   vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);
+   vhost_reset_is_le(vq);
vq->busyloop_timeout = 0;
vq->umem = NULL;
vq->iotlb = NULL;
-- 
2.30.1



[PATCH AUTOSEL 5.10 22/39] vhost: Fix vhost_vq_reset()

2021-03-25 Thread Sasha Levin
From: Laurent Vivier 

[ Upstream commit beb691e69f4dec7bfe8b81b509848acfd1f0dbf9 ]

vhost_reset_is_le() is vhost_init_is_le(), and in the case of
cross-endian legacy, vhost_init_is_le() depends on vq->user_be.

vq->user_be is set by vhost_disable_cross_endian().

But in vhost_vq_reset(), we have:

vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);

And so user_be is used before being set.

To fix that, reverse the lines order as there is no other dependency
between them.

Signed-off-by: Laurent Vivier 
Link: https://lore.kernel.org/r/20210312140913.788592-1-lviv...@redhat.com
Signed-off-by: Michael S. Tsirkin 
Signed-off-by: Sasha Levin 
---
 drivers/vhost/vhost.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a262e12c6dc2..5ccb0705beae 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -332,8 +332,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->error_ctx = NULL;
vq->kick = NULL;
vq->log_ctx = NULL;
-   vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);
+   vhost_reset_is_le(vq);
vq->busyloop_timeout = 0;
vq->umem = NULL;
vq->iotlb = NULL;
-- 
2.30.1



[PATCH AUTOSEL 5.10 01/39] virtiofs: Fail dax mount if device does not support it

2021-03-25 Thread Sasha Levin
From: Vivek Goyal 

[ Upstream commit 3f9b9efd82a84f27e95d0414f852caf1fa839e83 ]

Right now "mount -t virtiofs -o dax myfs /mnt/virtiofs" succeeds even
if filesystem deivce does not have a cache window and hence DAX can't
be supported.

This gives a false sense to user that they are using DAX with virtiofs
but fact of the matter is that they are not.

Fix this by returning error if dax can't be supported and user has asked
for it.

Signed-off-by: Vivek Goyal 
Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Miklos Szeredi 
Signed-off-by: Sasha Levin 
---
 fs/fuse/virtio_fs.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index d2c0e58c6416..3d83c9e12848 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1324,8 +1324,15 @@ static int virtio_fs_fill_super(struct super_block *sb, 
struct fs_context *fsc)
 
/* virtiofs allocates and installs its own fuse devices */
ctx->fudptr = NULL;
-   if (ctx->dax)
+   if (ctx->dax) {
+   if (!fs->dax_dev) {
+   err = -EINVAL;
+   pr_err("virtio-fs: dax can't be enabled as filesystem"
+  " device does not support it.\n");
+   goto err_free_fuse_devs;
+   }
ctx->dax_dev = fs->dax_dev;
+   }
err = fuse_fill_super_common(sb, ctx);
if (err < 0)
goto err_free_fuse_devs;
-- 
2.30.1



[PATCH AUTOSEL 5.11 23/44] vhost: Fix vhost_vq_reset()

2021-03-25 Thread Sasha Levin
From: Laurent Vivier 

[ Upstream commit beb691e69f4dec7bfe8b81b509848acfd1f0dbf9 ]

vhost_reset_is_le() is vhost_init_is_le(), and in the case of
cross-endian legacy, vhost_init_is_le() depends on vq->user_be.

vq->user_be is set by vhost_disable_cross_endian().

But in vhost_vq_reset(), we have:

vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);

And so user_be is used before being set.

To fix that, reverse the lines order as there is no other dependency
between them.

Signed-off-by: Laurent Vivier 
Link: https://lore.kernel.org/r/20210312140913.788592-1-lviv...@redhat.com
Signed-off-by: Michael S. Tsirkin 
Signed-off-by: Sasha Levin 
---
 drivers/vhost/vhost.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a262e12c6dc2..5ccb0705beae 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -332,8 +332,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->error_ctx = NULL;
vq->kick = NULL;
vq->log_ctx = NULL;
-   vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);
+   vhost_reset_is_le(vq);
vq->busyloop_timeout = 0;
vq->umem = NULL;
vq->iotlb = NULL;
-- 
2.30.1



[PATCH AUTOSEL 5.11 01/44] virtiofs: Fail dax mount if device does not support it

2021-03-25 Thread Sasha Levin
From: Vivek Goyal 

[ Upstream commit 3f9b9efd82a84f27e95d0414f852caf1fa839e83 ]

Right now "mount -t virtiofs -o dax myfs /mnt/virtiofs" succeeds even
if filesystem deivce does not have a cache window and hence DAX can't
be supported.

This gives a false sense to user that they are using DAX with virtiofs
but fact of the matter is that they are not.

Fix this by returning error if dax can't be supported and user has asked
for it.

Signed-off-by: Vivek Goyal 
Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Miklos Szeredi 
Signed-off-by: Sasha Levin 
---
 fs/fuse/virtio_fs.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 8868ac31a3c0..4ee6f734ba83 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1324,8 +1324,15 @@ static int virtio_fs_fill_super(struct super_block *sb, 
struct fs_context *fsc)
 
/* virtiofs allocates and installs its own fuse devices */
ctx->fudptr = NULL;
-   if (ctx->dax)
+   if (ctx->dax) {
+   if (!fs->dax_dev) {
+   err = -EINVAL;
+   pr_err("virtio-fs: dax can't be enabled as filesystem"
+  " device does not support it.\n");
+   goto err_free_fuse_devs;
+   }
ctx->dax_dev = fs->dax_dev;
+   }
err = fuse_fill_super_common(sb, ctx);
if (err < 0)
goto err_free_fuse_devs;
-- 
2.30.1



Re: [PATCH v5 03/11] vhost-vdpa: protect concurrent access to vhost device iotlb

2021-03-25 Thread Stefano Garzarella

On Mon, Mar 15, 2021 at 01:37:13PM +0800, Xie Yongji wrote:

Use vhost_dev->mutex to protect vhost device iotlb from
concurrent access.

Fixes: 4c8cf318 ("vhost: introduce vDPA-based backend")
Signed-off-by: Xie Yongji 
---
drivers/vhost/vdpa.c | 6 +-
1 file changed, 5 insertions(+), 1 deletion(-)
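
The patch itself is not quoted here; schematically it wraps the vDPA IOTLB
update path in the vhost device mutex, along these lines (illustrative names
only, not the actual drivers/vhost/vdpa.c code):

/* Illustrative sketch: serialize concurrent IOTLB updates with
 * vhost_dev->mutex, which other ioctl paths already take. */
static int iotlb_msg_locked(struct vhost_dev *dev, struct vhost_iotlb_msg *msg,
			    int (*process)(struct vhost_dev *,
					   struct vhost_iotlb_msg *))
{
	int ret;

	mutex_lock(&dev->mutex);
	ret = process(dev, msg);	/* the existing, unlocked update logic */
	mutex_unlock(&dev->mutex);

	return ret;
}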



Reviewed-by: Stefano Garzarella 



Re: [RFC PATCH v7 00/22] virtio/vsock: introduce SOCK_SEQPACKET support

2021-03-25 Thread Stefano Garzarella

Hi Arseny,

On Tue, Mar 23, 2021 at 04:07:13PM +0300, Arseny Krasnov wrote:

This patchset implements support of SOCK_SEQPACKET for the virtio
transport.
As SOCK_SEQPACKET guarantees to preserve record boundaries, two
new packet operations were added to do this: the first marks the start of
a record and the second marks the end of a record (SEQ_BEGIN and SEQ_END
below). Both operations carry metadata, to maintain boundaries and payload
integrity. The metadata is introduced by adding a special header with two
fields - message id and message length:

struct virtio_vsock_seq_hdr {
__le32  msg_id;
__le32  msg_len;
} __attribute__((packed));

This header is transmitted as the payload of SEQ_BEGIN and SEQ_END
packets (the buffer of the second virtio descriptor in the chain) in the
same way as data is transmitted in RW packets. The payload was chosen as
the buffer for this header to avoid touching the first virtio buffer,
which carries the packet header, because someone could check that the size
of that buffer is equal to the size of the packet header. To send a record,
a packet with the start marker is sent first (its header carries the length
of the record and an id), then all data is sent as usual 'RW' packets, and
finally SEQ_END is sent (it carries the id of the message, which is equal
to the id of SEQ_BEGIN); after sending SEQ_END the id is incremented. On
the receiver's side, the size of the record is known from the packet with
the start record marker. To check that no packets were dropped by the
transport, the 'msg_id's of two sequential SEQ_BEGIN and SEQ_END packets
are checked to be equal, and the length of the data between the two
markers is compared to the length in the SEQ_BEGIN header.
Since packets of one socket are not reordered on either the
vsock or the vhost transport layer, such markers allow the original record
to be restored on the receiver's side. If the user's buffer is smaller than
the record length, then all data beyond the buffer size is dropped.
The maximum length of a datagram is not limited as it is in stream
sockets, because the same credit logic is used. The difference from stream
sockets is that the user is not woken up until the whole record is received
or an error occurs. The implementation also supports the 'MSG_EOR' and
'MSG_TRUNC' flags.
Tests are also implemented.
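
For readers unfamiliar with the socket type, the behaviour described above
is what a userspace application sees through the ordinary sockets API. A
minimal illustration (hypothetical CID and port, error handling trimmed):

#include <sys/types.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>
#include <unistd.h>

static int seqpacket_example(void)
{
	struct sockaddr_vm addr = {
		.svm_family = AF_VSOCK,
		.svm_cid = 3,		/* hypothetical peer CID */
		.svm_port = 1234,	/* hypothetical port */
	};
	char buf[128];
	ssize_t n;
	int fd;

	fd = socket(AF_VSOCK, SOCK_SEQPACKET, 0);
	if (fd < 0)
		return -1;
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}

	/* Each send() forms one record; boundaries are preserved end to end. */
	send(fd, "hello", 5, 0);

	/* With MSG_TRUNC the full record length is reported even if it did
	 * not fit into buf, as described above. */
	n = recv(fd, buf, sizeof(buf), MSG_TRUNC);

	close(fd);
	return n < 0 ? -1 : 0;
}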

Thanks to st...@yandex.ru for the encouragement and initial design
recommendations.

Arseny Krasnov (22):
 af_vsock: update functions for connectible socket
 af_vsock: separate wait data loop
 af_vsock: separate receive data loop
 af_vsock: implement SEQPACKET receive loop
 af_vsock: separate wait space loop
 af_vsock: implement send logic for SEQPACKET
 af_vsock: rest of SEQPACKET support
 af_vsock: update comments for stream sockets
 virtio/vsock: set packet's type in virtio_transport_send_pkt_info()
 virtio/vsock: simplify credit update function API
 virtio/vsock: dequeue callback for SOCK_SEQPACKET
 virtio/vsock: fetch length for SEQPACKET record
 virtio/vsock: add SEQPACKET receive logic
 virtio/vsock: rest of SOCK_SEQPACKET support
 virtio/vsock: SEQPACKET support feature bit
 virtio/vsock: setup SEQPACKET ops for transport
 vhost/vsock: setup SEQPACKET ops for transport
 vsock/loopback: setup SEQPACKET ops for transport
 vhost/vsock: SEQPACKET feature bit support
 virtio/vsock: SEQPACKET feature bit support
 vsock_test: add SOCK_SEQPACKET tests
 virtio/vsock: update trace event for SEQPACKET

drivers/vhost/vsock.c|  21 +-
include/linux/virtio_vsock.h |  21 +
include/net/af_vsock.h   |   9 +
.../events/vsock_virtio_transport_common.h   |  48 +-
include/uapi/linux/virtio_vsock.h|  19 +
net/vmw_vsock/af_vsock.c | 581 +++--
net/vmw_vsock/virtio_transport.c |  17 +
net/vmw_vsock/virtio_transport_common.c  | 379 +--
net/vmw_vsock/vsock_loopback.c   |  12 +
tools/testing/vsock/util.c   |  32 +-
tools/testing/vsock/util.h   |   3 +
tools/testing/vsock/vsock_test.c | 126 
12 files changed, 1015 insertions(+), 253 deletions(-)

v6 -> v7:
General changelog:
- the virtio transport callback for the message length is now removed
  from the transport. The length of the record is returned by the dequeue
  callback.

- the function which tries to get the message length now returns 0
  when the rx queue is empty. Also, the length of the current message in
  progress is set to 0 when the message is processed or an error
  happens.

- patches for virtio feature bit moved after patches with
  transport ops.

Per patch changelog:
 see every patch after '---' line.


I reviewed the series and left some comments. I think we are at a good
point, but we should have the specification accepted before merging this
series, to avoid having to change the implementation later.


What do you think?

Thanks,
Stefano



Re: [RFC PATCH v7 18/22] vsock/loopback: setup SEQPACKET ops for transport

2021-03-25 Thread Stefano Garzarella

On Tue, Mar 23, 2021 at 04:14:33PM +0300, Arseny Krasnov wrote:

This adds SEQPACKET ops for the loopback transport and the
'seqpacket_allow()' callback.

Signed-off-by: Arseny Krasnov 
---
net/vmw_vsock/vsock_loopback.c | 12 
1 file changed, 12 insertions(+)



Reviewed-by: Stefano Garzarella 



Re: [RFC PATCH v7 17/22] vhost/vsock: setup SEQPACKET ops for transport

2021-03-25 Thread Stefano Garzarella

On Tue, Mar 23, 2021 at 04:14:18PM +0300, Arseny Krasnov wrote:

This also removes the ignoring of non-stream packet types and adds the
'seqpacket_allow()' callback.

Signed-off-by: Arseny Krasnov 
---
drivers/vhost/vsock.c | 15 +--
1 file changed, 13 insertions(+), 2 deletions(-)


The same applies to this transport too: maybe we can merge it with the patch
"vhost/vsock: SEQPACKET feature bit support".


Stefano



diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 5e78fb719602..5af141772068 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -354,8 +354,7 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
return NULL;
}

-   if (le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_STREAM)
-   pkt->len = le32_to_cpu(pkt->hdr.len);
+   pkt->len = le32_to_cpu(pkt->hdr.len);

/* No payload */
if (!pkt->len)
@@ -398,6 +397,8 @@ static bool vhost_vsock_more_replies(struct vhost_vsock 
*vsock)
return val < vq->num;
}

+static bool vhost_transport_seqpacket_allow(void);
+
static struct virtio_transport vhost_transport = {
.transport = {
.module   = THIS_MODULE,
@@ -424,6 +425,10 @@ static struct virtio_transport vhost_transport = {
.stream_is_active = virtio_transport_stream_is_active,
.stream_allow = virtio_transport_stream_allow,

+   .seqpacket_dequeue= virtio_transport_seqpacket_dequeue,
+   .seqpacket_enqueue= virtio_transport_seqpacket_enqueue,
+   .seqpacket_allow  = vhost_transport_seqpacket_allow,
+
.notify_poll_in   = virtio_transport_notify_poll_in,
.notify_poll_out  = virtio_transport_notify_poll_out,
.notify_recv_init = virtio_transport_notify_recv_init,
@@ -439,8 +444,14 @@ static struct virtio_transport vhost_transport = {
},

.send_pkt = vhost_transport_send_pkt,
+   .seqpacket_allow = false
};

+static bool vhost_transport_seqpacket_allow(void)
+{
+   return vhost_transport.seqpacket_allow;
+}
+
static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
{
struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
--
2.25.1





Re: [RFC PATCH v7 16/22] virtio/vsock: setup SEQPACKET ops for transport

2021-03-25 Thread Stefano Garzarella

On Tue, Mar 23, 2021 at 04:14:03PM +0300, Arseny Krasnov wrote:

This adds SEQPACKET ops for the virtio transport and the
'seqpacket_allow()' callback.

Signed-off-by: Arseny Krasnov 
---
net/vmw_vsock/virtio_transport.c | 12 
1 file changed, 12 insertions(+)


Sorry for not mentioning this in the previous review, but maybe we can 
merge this patch with "virtio/vsock: SEQPACKET feature bit support", so 
we have a single patch when we fully enable the SEQPACKET support in 
this transport.


Anyway, I don't have a strong opinion on that.

What do you think?

Stefano



diff --git a/net/vmw_vsock/virtio_transport.c 
b/net/vmw_vsock/virtio_transport.c

index 2700a63ab095..83ae2078c847 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -443,6 +443,8 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
queue_work(virtio_vsock_workqueue, &vsock->rx_work);
}

+static bool virtio_transport_seqpacket_allow(void);
+
static struct virtio_transport virtio_transport = {
.transport = {
.module   = THIS_MODULE,
@@ -469,6 +471,10 @@ static struct virtio_transport virtio_transport = {
.stream_is_active = virtio_transport_stream_is_active,
.stream_allow = virtio_transport_stream_allow,

+   .seqpacket_dequeue= virtio_transport_seqpacket_dequeue,
+   .seqpacket_enqueue= virtio_transport_seqpacket_enqueue,
+   .seqpacket_allow  = virtio_transport_seqpacket_allow,
+
.notify_poll_in   = virtio_transport_notify_poll_in,
.notify_poll_out  = virtio_transport_notify_poll_out,
.notify_recv_init = virtio_transport_notify_recv_init,
@@ -483,8 +489,14 @@ static struct virtio_transport virtio_transport = {
},

.send_pkt = virtio_transport_send_pkt,
+   .seqpacket_allow = false
};

+static bool virtio_transport_seqpacket_allow(void)
+{
+   return virtio_transport.seqpacket_allow;
+}
+
static void virtio_transport_rx_work(struct work_struct *work)
{
struct virtio_vsock *vsock =
-- 2.25.1





Re: [RFC PATCH v7 15/22] virtio/vsock: SEQPACKET support feature bit

2021-03-25 Thread Stefano Garzarella

On Tue, Mar 23, 2021 at 04:13:49PM +0300, Arseny Krasnov wrote:

This adds a new virtio vsock specific feature bit which indicates
SOCK_SEQPACKET support. The guest negotiates this bit with vhost,
thus checking that the vhost side supports SEQPACKET.

Signed-off-by: Arseny Krasnov 
---
include/uapi/linux/virtio_vsock.h | 3 +++
1 file changed, 3 insertions(+)


Since you have this patch, I think you can generalize the title, update 
the description, and merge here the changes I mentioned in patch 11/22 
about changes of include/uapi/linux/virtio_vsock.h.


So you can have a single patch with the new virtio-spec defines and 
structs related to SEQPACKET, of course then we move it before patch 11.


What do you think?

Stefano



diff --git a/include/uapi/linux/virtio_vsock.h 
b/include/uapi/linux/virtio_vsock.h
index 692f8078cced..619aaebb355a 100644
--- a/include/uapi/linux/virtio_vsock.h
+++ b/include/uapi/linux/virtio_vsock.h
@@ -38,6 +38,9 @@
#include 
#include 

+/* The feature bitmap for virtio vsock */
+#define VIRTIO_VSOCK_F_SEQPACKET   0   /* SOCK_SEQPACKET supported */
+
struct virtio_vsock_config {
__le64 guest_cid;
} __attribute__((packed));
--
2.25.1
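
For illustration, a guest-side transport could gate the new socket type on
this bit roughly as follows (a sketch only; the function name is made up,
the actual wiring is done in later patches of this series, and the bit also
has to be added to the driver's feature table so it gets negotiated):

/* Sketch: enable SEQPACKET in the virtio transport only when the device
 * offered VIRTIO_VSOCK_F_SEQPACKET and the driver negotiated it. */
static void virtio_vsock_check_seqpacket(struct virtio_device *vdev)
{
	if (virtio_has_feature(vdev, VIRTIO_VSOCK_F_SEQPACKET))
		virtio_transport.seqpacket_allow = true;
}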





Re: [RFC PATCH v7 14/22] virtio/vsock: rest of SOCK_SEQPACKET support

2021-03-25 Thread Stefano Garzarella

On Tue, Mar 23, 2021 at 04:13:29PM +0300, Arseny Krasnov wrote:

This adds the rest of the logic for SEQPACKET:
1) SEQPACKET specific functions which send SEQ_BEGIN/SEQ_END.
  Note that both functions may sleep to wait for enough space for the
  SEQPACKET header.
2) SEQ_BEGIN/SEQ_END in TAP packet capture.
3) Send SHUTDOWN on socket close for the SEQPACKET type.
4) Set the SEQPACKET packet type during send.
5) Set MSG_EOR in flags for SEQPACKET during send.
6) A 'seqpacket_allow' flag in the virtio transport.

Signed-off-by: Arseny Krasnov 
---
v6 -> v7:
In 'virtio_transport_seqpacket_enqueue()', 'next_tx_msg_id' is updated
in both cases: when the message is sent successfully and when an error occurs.

include/linux/virtio_vsock.h|  7 ++
net/vmw_vsock/virtio_transport_common.c | 88 -
2 files changed, 93 insertions(+), 2 deletions(-)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 0e3aa395c07c..ab5f56fd7251 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -22,6 +22,7 @@ struct virtio_vsock_seq_state {
u32 user_read_seq_len;
u32 user_read_copied;
u32 curr_rx_msg_id;
+   u32 next_tx_msg_id;
};

/* Per-socket state (accessed via vsk->trans) */
@@ -76,6 +77,8 @@ struct virtio_transport {

/* Takes ownership of the packet */
int (*send_pkt)(struct virtio_vsock_pkt *pkt);
+
+   bool seqpacket_allow;
};

ssize_t
@@ -89,6 +92,10 @@ virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
   size_t len, int flags);

int
+virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk,
+  struct msghdr *msg,
+  size_t len);
+int
virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
   struct msghdr *msg,
   int flags,
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index bfe0d7026bf8..01a56c7da8bd 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -139,6 +139,8 @@ static struct sk_buff *virtio_transport_build_skb(void 
*opaque)
break;
case VIRTIO_VSOCK_OP_CREDIT_UPDATE:
case VIRTIO_VSOCK_OP_CREDIT_REQUEST:
+   case VIRTIO_VSOCK_OP_SEQ_BEGIN:
+   case VIRTIO_VSOCK_OP_SEQ_END:
hdr->op = cpu_to_le16(AF_VSOCK_OP_CONTROL);
break;
default:
@@ -187,7 +189,12 @@ static int virtio_transport_send_pkt_info(struct 
vsock_sock *vsk,
struct virtio_vsock_pkt *pkt;
u32 pkt_len = info->pkt_len;

-   info->type = VIRTIO_VSOCK_TYPE_STREAM;
+   info->type = virtio_transport_get_type(sk_vsock(vsk));
+
+   if (info->type == VIRTIO_VSOCK_TYPE_SEQPACKET &&
+   info->msg &&
+   info->msg->msg_flags & MSG_EOR)
+   info->flags |= VIRTIO_VSOCK_RW_EOR;

t_ops = virtio_transport_get_ops(vsk);
if (unlikely(!t_ops))
@@ -401,6 +408,43 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
return err;
}

+static int virtio_transport_seqpacket_send_ctrl(struct vsock_sock *vsk,
+   int type,
+   size_t len,
+   int flags)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   struct virtio_vsock_pkt_info info = {
+   .op = type,
+   .vsk = vsk,
+   .pkt_len = sizeof(struct virtio_vsock_seq_hdr)
+   };
+
+   struct virtio_vsock_seq_hdr seq_hdr = {
+   .msg_id = cpu_to_le32(vvs->seq_state.next_tx_msg_id),
+   .msg_len = cpu_to_le32(len)
+   };
+
+   struct kvec seq_hdr_kiov = {
+   .iov_base = (void *)&seq_hdr,
+   .iov_len = sizeof(struct virtio_vsock_seq_hdr)
+   };
+
+   struct msghdr msg = {0};
+
+   //XXX: do we need 'vsock_transport_send_notify_data' pointer?
+   if (vsock_wait_space(sk_vsock(vsk),
+sizeof(struct virtio_vsock_seq_hdr),
+flags, NULL))
+   return -1;
+
+   iov_iter_kvec(&msg.msg_iter, WRITE, &seq_hdr_kiov, 1, sizeof(seq_hdr));
+
+   info.msg = &msg;
+
+   return virtio_transport_send_pkt_info(vsk, &info);
+}
+
static inline void virtio_transport_remove_pkt(struct virtio_vsock_pkt *pkt)
{
list_del(>list);
@@ -595,6 +639,46 @@ virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
}
EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue);

+int
+virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk,
+  struct msghdr *msg,
+  size_t len)
+{
+   int written = -1;
+
+   if (msg->msg_iter.iov_offset == 0) {
+   /* Send SEQBEGIN. */
+   if (virtio_transport_seqpacket_send_ctrl(vsk,
+   

Re: [RFC PATCH v7 13/22] virtio/vsock: add SEQPACKET receive logic

2021-03-25 Thread Stefano Garzarella

On Tue, Mar 23, 2021 at 04:13:13PM +0300, Arseny Krasnov wrote:

This modifies the current receive logic for SEQPACKET support:
1) Inserts 'SEQ_BEGIN' packets into the socket's rx queue.
2) Inserts 'RW' packets into the socket's rx queue, but without merging with
  the buffer of the last packet in the queue.
3) Performs a check for packet and socket types on receive (if they
  mismatch, the connection is reset).

Signed-off-by: Arseny Krasnov 
---
v6 -> v7:
In 'virtio_transport_recv_pkt()', 'sock_put()' is added when the type of
the received packet does not match the type of the socket.

net/vmw_vsock/virtio_transport_common.c | 64 +
1 file changed, 45 insertions(+), 19 deletions(-)



Reviewed-by: Stefano Garzarella 



Re: [RFC PATCH v7 12/22] virtio/vsock: fetch length for SEQPACKET record

2021-03-25 Thread Stefano Garzarella

On Tue, Mar 23, 2021 at 04:12:55PM +0300, Arseny Krasnov wrote:

This adds a transport callback which tries to fetch the record begin marker
from the socket's rx queue. It is called from af_vsock.c before reading the
data packets of a record.

Signed-off-by: Arseny Krasnov 
---
v6 -> v7:
1) 'virtio_transport_seqpacket_seq_get_len()' now returns 0 if the rx
   queue of the socket is empty. Otherwise it returns the length of the
   current message to handle.
2) If the dequeue callback is called but no message length to dequeue has
   been detected, EAGAIN is returned and the outer loop restarts
   receiving.

net/vmw_vsock/virtio_transport_common.c | 61 +
1 file changed, 61 insertions(+)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index a8f4326e45e8..41f05034593e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -399,6 +399,62 @@ static inline void virtio_transport_remove_pkt(struct 
virtio_vsock_pkt *pkt)
virtio_transport_free_pkt(pkt);
}

+static size_t virtio_transport_drop_until_seq_begin(struct virtio_vsock_sock *vvs)
+{
+   struct virtio_vsock_pkt *pkt, *n;
+   size_t bytes_dropped = 0;
+
+   list_for_each_entry_safe(pkt, n, &vvs->rx_queue, list) {
+   if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_SEQ_BEGIN)
+   break;
+
+   bytes_dropped += le32_to_cpu(pkt->hdr.len);
+   virtio_transport_dec_rx_pkt(vvs, pkt);
+   virtio_transport_remove_pkt(pkt);
+   }
+
+   return bytes_dropped;
+}
+
+static size_t virtio_transport_seqpacket_seq_get_len(struct vsock_sock *vsk)
+{
+   struct virtio_vsock_seq_hdr *seq_hdr;
+   struct virtio_vsock_sock *vvs;
+   struct virtio_vsock_pkt *pkt;
+   size_t bytes_dropped = 0;
+
+   vvs = vsk->trans;
+
+   spin_lock_bh(&vvs->rx_lock);
+
+   /* Have some record to process, return it's length. */
+   if (vvs->seq_state.user_read_seq_len)
+   goto out;
+
+   /* Fetch all orphaned 'RW' packets and send credit update. */
+   bytes_dropped = virtio_transport_drop_until_seq_begin(vvs);
+
+   if (list_empty(&vvs->rx_queue))
+   goto out;
+
+   pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list);
+
+   vvs->seq_state.user_read_copied = 0;
+
+   seq_hdr = (struct virtio_vsock_seq_hdr *)pkt->buf;
+   vvs->seq_state.user_read_seq_len = le32_to_cpu(seq_hdr->msg_len);
+   vvs->seq_state.curr_rx_msg_id = le32_to_cpu(seq_hdr->msg_id);
+   virtio_transport_dec_rx_pkt(vvs, pkt);
+   virtio_transport_remove_pkt(pkt);
+out:
+   spin_unlock_bh(&vvs->rx_lock);
+
+   if (bytes_dropped)
+   virtio_transport_send_credit_update(vsk);
+
+   return vvs->seq_state.user_read_seq_len;
+}
+
static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
 struct msghdr *msg,
 bool *msg_ready)
@@ -522,6 +578,11 @@ virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
if (flags & MSG_PEEK)
return -EOPNOTSUPP;

+   *msg_len = virtio_transport_seqpacket_seq_get_len(vsk);
+
+   if (*msg_len == 0)
+   return -EAGAIN;
+


Okay, I see now. I think you can move this patch before the previous one
or merge them into a single patch; that is better for review and bisection.


As mentioned, I think we can return msg_len if 
virtio_transport_seqpacket_do_dequeue() does not fail, otherwise the 
error.


I mean something like this:

static ssize_t virtio_transport_seqpacket_dequeue(...)
{
size_t msg_len;
ssize_t ret;

msg_len = virtio_transport_seqpacket_seq_get_len(vsk);
if (msg_len == 0)
return -EAGAIN;

ret = virtio_transport_seqpacket_do_dequeue(vsk, msg, msg_ready);
if (ret < 0)
return ret;

return msg_len;
}


return virtio_transport_seqpacket_do_dequeue(vsk, msg, msg_ready);
}
EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue);
-- 2.25.1





Re: [RFC PATCH v7 11/22] virtio/vsock: dequeue callback for SOCK_SEQPACKET

2021-03-25 Thread Stefano Garzarella

On Tue, Mar 23, 2021 at 04:12:41PM +0300, Arseny Krasnov wrote:

This adds the transport callback and its logic for SEQPACKET dequeue.
The callback fetches RW packets from the rx queue of the socket until the
whole record is copied (if the user's buffer is full, the user is not woken
up). This is done to avoid stalling the sender, because if we wake up the
user and it leaves the syscall, nobody will send a credit update for the
rest of the record, and the sender will wait for the next entry into the
read syscall on the receiver's side. So if the user buffer is full, we just
send a credit update and drop the data. If during the copy a SEQ_BEGIN was
found (and not all data was copied), copying is restarted by resetting the
user's iov iterator (the previous unfinished data is dropped).

Signed-off-by: Arseny Krasnov 
---
v6 -> v7:
1) 'struct virtio_vsock_seqpacket_state' now renamed to
   'struct virtio_vsock_seq_state'.
2) Field 'seqpacket_state' of 'struct virtio_vsock_sock' now
   renamed to 'seq_state'.
3) The current message length to process ('user_read_seq_len') is
   set to 0 on error or when message dequeue completes successfully.

include/linux/virtio_vsock.h|  14 +++
include/uapi/linux/virtio_vsock.h   |  16 
net/vmw_vsock/virtio_transport_common.c | 121 
3 files changed, 151 insertions(+)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index dc636b727179..0e3aa395c07c 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -18,6 +18,12 @@ enum {
VSOCK_VQ_MAX= 3,
};

+struct virtio_vsock_seq_state {
+   u32 user_read_seq_len;
+   u32 user_read_copied;
+   u32 curr_rx_msg_id;
+};
+
/* Per-socket state (accessed via vsk->trans) */
struct virtio_vsock_sock {
struct vsock_sock *vsk;
@@ -36,6 +42,8 @@ struct virtio_vsock_sock {
u32 rx_bytes;
u32 buf_alloc;
struct list_head rx_queue;
+
+   struct virtio_vsock_seq_state seq_state;
};

struct virtio_vsock_pkt {
@@ -80,6 +88,12 @@ virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
   struct msghdr *msg,
   size_t len, int flags);

+int
+virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
+  struct msghdr *msg,
+  int flags,
+  bool *msg_ready,
+  size_t *msg_len);
s64 virtio_transport_stream_has_data(struct vsock_sock *vsk);
s64 virtio_transport_stream_has_space(struct vsock_sock *vsk);

diff --git a/include/uapi/linux/virtio_vsock.h 
b/include/uapi/linux/virtio_vsock.h
index 1d57ed3d84d2..692f8078cced 100644
--- a/include/uapi/linux/virtio_vsock.h
+++ b/include/uapi/linux/virtio_vsock.h


Maybe we can move the following changes to this file (that should match 
the virtio-spec discussed in the separate thread) in a separate patch 
with a reference to the spec.


You can include this in the series before this patch.


@@ -63,8 +63,14 @@ struct virtio_vsock_hdr {
__le32  fwd_cnt;
} __attribute__((packed));

+struct virtio_vsock_seq_hdr {
+   __le32  msg_id;
+   __le32  msg_len;
+} __attribute__((packed));
+
enum virtio_vsock_type {
VIRTIO_VSOCK_TYPE_STREAM = 1,
+   VIRTIO_VSOCK_TYPE_SEQPACKET = 2,
};

enum virtio_vsock_op {
@@ -83,6 +89,11 @@ enum virtio_vsock_op {
VIRTIO_VSOCK_OP_CREDIT_UPDATE = 6,
/* Request the peer to send the credit info to us */
VIRTIO_VSOCK_OP_CREDIT_REQUEST = 7,
+
+   /* Record begin for SOCK_SEQPACKET */
+   VIRTIO_VSOCK_OP_SEQ_BEGIN = 8,
+   /* Record end for SOCK_SEQPACKET */
+   VIRTIO_VSOCK_OP_SEQ_END = 9,
};

/* VIRTIO_VSOCK_OP_SHUTDOWN flags values */
@@ -91,4 +102,9 @@ enum virtio_vsock_shutdown {
VIRTIO_VSOCK_SHUTDOWN_SEND = 2,
};

+/* VIRTIO_VSOCK_OP_RW flags values */
+enum virtio_vsock_rw {
+   VIRTIO_VSOCK_RW_EOR = 1,
+};
+
#endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 833104b71a1c..a8f4326e45e8 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -393,6 +393,114 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
return err;
}

+static inline void virtio_transport_remove_pkt(struct virtio_vsock_pkt *pkt)
+{
+   list_del(>list);
+   virtio_transport_free_pkt(pkt);
+}
+
+static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
+struct msghdr *msg,
+bool *msg_ready)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   struct virtio_vsock_pkt *pkt;
+   int err = 0;
+   size_t user_buf_len = msg->msg_iter.count;
+
+   *msg_ready = false;
+   spin_lock_bh(&vvs->rx_lock);
+
+   while (!*msg_ready && !list_empty(&vvs->rx_queue) && !err) {
+   pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list);

Re: [RFC PATCH v7 06/22] af_vsock: implement send logic for SEQPACKET

2021-03-25 Thread Stefano Garzarella

On Tue, Mar 23, 2021 at 04:10:42PM +0300, Arseny Krasnov wrote:

This adds some logic to the current stream enqueue function for SEQPACKET
support:
1) Use the transport's seqpacket enqueue callback.
2) The return value from the enqueue function is the whole record length or
  an error for SOCK_SEQPACKET.

Signed-off-by: Arseny Krasnov 
---
v6 -> v7:
The 'seqpacket_enqueue' callback interface changed: the 'flags' argument was
removed, because it duplicated the 'msg_flags' field of the 'msg' argument,
which already exists.

include/net/af_vsock.h   |  2 ++
net/vmw_vsock/af_vsock.c | 21 +++--
2 files changed, 17 insertions(+), 6 deletions(-)



Reviewed-by: Stefano Garzarella 



Re: [RFC PATCH v7 05/22] af_vsock: separate wait space loop

2021-03-25 Thread Stefano Garzarella

On Tue, Mar 23, 2021 at 04:10:23PM +0300, Arseny Krasnov wrote:

This moves the loop that waits for space on send into a separate function,
because it will be used for SEQ_BEGIN/SEQ_END sending before and
after data transmission. Waiting for SEQ_BEGIN/SEQ_END is needed
because such packets carry a SEQPACKET header that can't be
fragmented by the credit mechanism, so to avoid that, the sender waits
until enough space is available.

Signed-off-by: Arseny Krasnov 
---
include/net/af_vsock.h   |  2 +
net/vmw_vsock/af_vsock.c | 99 +---
2 files changed, 63 insertions(+), 38 deletions(-)


I had already reviewed this one as well and it doesn't seem to have 
changed :-)


Reviewed-by: Stefano Garzarella 



diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 74ac8a4c4168..7232f6c42a36 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -204,6 +204,8 @@ void vsock_remove_sock(struct vsock_sock *vsk);
void vsock_for_each_connected_socket(void (*fn)(struct sock *sk));
int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk);
bool vsock_find_cid(unsigned int cid);
+int vsock_wait_space(struct sock *sk, size_t space, int flags,
+struct vsock_transport_send_notify_data *send_data);

/**** TAP ****/

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index fa0c37f97330..617ffe42693d 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1692,6 +1692,65 @@ static int vsock_connectible_getsockopt(struct socket 
*sock,
return 0;
}

+int vsock_wait_space(struct sock *sk, size_t space, int flags,
+struct vsock_transport_send_notify_data *send_data)
+{
+   const struct vsock_transport *transport;
+   struct vsock_sock *vsk;
+   long timeout;
+   int err;
+
+   DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+   vsk = vsock_sk(sk);
+   transport = vsk->transport;
+   timeout = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+   err = 0;
+
+   add_wait_queue(sk_sleep(sk), &wait);
+
+   while (vsock_stream_has_space(vsk) < space &&
+  sk->sk_err == 0 &&
+  !(sk->sk_shutdown & SEND_SHUTDOWN) &&
+  !(vsk->peer_shutdown & RCV_SHUTDOWN)) {
+
+   /* Don't wait for non-blocking sockets. */
+   if (timeout == 0) {
+   err = -EAGAIN;
+   goto out_err;
+   }
+
+   if (send_data) {
+   err = transport->notify_send_pre_block(vsk, send_data);
+   if (err < 0)
+   goto out_err;
+   }
+
+   release_sock(sk);
+   timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
+   lock_sock(sk);
+   if (signal_pending(current)) {
+   err = sock_intr_errno(timeout);
+   goto out_err;
+   } else if (timeout == 0) {
+   err = -EAGAIN;
+   goto out_err;
+   }
+   }
+
+   if (sk->sk_err) {
+   err = -sk->sk_err;
+   } else if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
+  (vsk->peer_shutdown & RCV_SHUTDOWN)) {
+   err = -EPIPE;
+   }
+
+out_err:
+   remove_wait_queue(sk_sleep(sk), &wait);
+   return err;
+}
+EXPORT_SYMBOL_GPL(vsock_wait_space);
+
static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg,
 size_t len)
{
@@ -1699,10 +1758,8 @@ static int vsock_connectible_sendmsg(struct socket 
*sock, struct msghdr *msg,
struct vsock_sock *vsk;
const struct vsock_transport *transport;
ssize_t total_written;
-   long timeout;
int err;
struct vsock_transport_send_notify_data send_data;
-   DEFINE_WAIT_FUNC(wait, woken_wake_function);

sk = sock->sk;
vsk = vsock_sk(sk);
@@ -1740,9 +1797,6 @@ static int vsock_connectible_sendmsg(struct socket *sock, 
struct msghdr *msg,
goto out;
}

-   /* Wait for room in the produce queue to enqueue our user's data. */
-   timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
-
err = transport->notify_send_init(vsk, &send_data);
if (err < 0)
goto out;
@@ -1750,39 +1804,8 @@ static int vsock_connectible_sendmsg(struct socket 
*sock, struct msghdr *msg,
while (total_written < len) {
ssize_t written;

-   add_wait_queue(sk_sleep(sk), &wait);
-   while (vsock_stream_has_space(vsk) == 0 &&
-  sk->sk_err == 0 &&
-  !(sk->sk_shutdown & SEND_SHUTDOWN) &&
-  !(vsk->peer_shutdown & RCV_SHUTDOWN)) {
-
-   /* Don't wait for non-blocking sockets. */
-   if (timeout == 0) {
-   err = -EAGAIN;
-   

Re: [RFC PATCH v7 04/22] af_vsock: implement SEQPACKET receive loop

2021-03-25 Thread Stefano Garzarella

On Tue, Mar 23, 2021 at 04:10:03PM +0300, Arseny Krasnov wrote:

This adds the receive loop for SEQPACKET. It looks like the receive loop for
STREAM, but there are a few differences:
1) It doesn't call notify callbacks.
2) It doesn't care about the 'SO_SNDLOWAT' and 'SO_RCVLOWAT' values, because
  these values make no sense in the SEQPACKET case.
3) It waits until the whole record is received or an error is found during
  receiving.
4) It processes and sets the 'MSG_TRUNC' flag.

So to avoid extra conditions for two types of socket inside one loop, two
independent functions were created.

Signed-off-by: Arseny Krasnov 
---
v6 -> v7:
The 'seqpacket_get_len' callback is now removed; the length of the message is
returned by the 'seqpacket_dequeue' callback.

include/net/af_vsock.h   |  4 ++
net/vmw_vsock/af_vsock.c | 88 +++-
2 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index b1c717286993..74ac8a4c4168 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -135,6 +135,10 @@ struct vsock_transport {
bool (*stream_is_active)(struct vsock_sock *);
bool (*stream_allow)(u32 cid, u32 port);

+   /* SEQ_PACKET. */
+   int (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
+int flags, bool *msg_ready, size_t 
*record_len);
+


Why not use ssize_t as the return value and return the length or a
negative value in case of error?


That way we can remove the 'record_len' parameter.
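
Something of this shape (a sketch of the suggested signature only):

/* Return the record length on success, a negative errno on failure;
 * no separate out-parameter for the length. */
ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
			     int flags, bool *msg_ready);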


/* Notification. */
int (*notify_poll_in)(struct vsock_sock *, size_t, bool *);
int (*notify_poll_out)(struct vsock_sock *, size_t, bool *);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 0bc661e54262..fa0c37f97330 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1973,6 +1973,89 @@ static int __vsock_stream_recvmsg(struct sock 
*sk, struct msghdr *msg,

return err;
}

+static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
+size_t len, int flags)
+{
+   const struct vsock_transport *transport;
+   const struct iovec *orig_iov;
+   unsigned long orig_nr_segs;
+   bool msg_ready;
+   struct vsock_sock *vsk;
+   size_t record_len;
+   long timeout;
+   int err = 0;
+   DEFINE_WAIT(wait);
+
+   vsk = vsock_sk(sk);
+   transport = vsk->transport;
+
+   timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+   orig_nr_segs = msg->msg_iter.nr_segs;
+   orig_iov = msg->msg_iter.iov;
+   msg_ready = false;
+   record_len = 0;
+
+   while (1) {
+   err = vsock_wait_data(sk, &wait, timeout, NULL, 0);
+
+   if (err <= 0) {
+   /* In case of any loop break(timeout, signal
+* interrupt or shutdown), we report user that
+* nothing was copied.
+*/
+   err = 0;
+   break;
+   }
+
+   err = transport->seqpacket_dequeue(vsk, msg, flags, &msg_ready, &record_len);
+
+   if (err < 0) {
+   if (err == -EAGAIN) {
+   iov_iter_init(&msg->msg_iter, READ,
+ orig_iov, orig_nr_segs,
+ len);
+   /* Clear 'MSG_EOR' here, because dequeue
+* callback above set it again if it was
+* set by sender. This 'MSG_EOR' is from
+* dropped record.
+*/
+   msg->msg_flags &= ~MSG_EOR;
+   record_len = 0;
+   continue;
+   }
+
+   err = -ENOMEM;
+   break;
+   }
+
+   if (msg_ready)
+   break;
+   }
+
+   if (sk->sk_err)
+   err = -sk->sk_err;
+   else if (sk->sk_shutdown & RCV_SHUTDOWN)
+   err = 0;
+
+   if (msg_ready) {


If 'err' is not 0, should we skip this branch?
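
One possible shape for that (purely illustrative, not part of the posted
patch):

	if (!err && msg_ready) {
		if (flags & MSG_TRUNC)
			err = record_len;
		else
			err = len - msg->msg_iter.count;

		/* Always set MSG_TRUNC if the real record length is
		 * bigger than the user's buffer.
		 */
		if (record_len > len)
			msg->msg_flags |= MSG_TRUNC;
	}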


+   /* User sets MSG_TRUNC, so return real length of
+* packet.
+*/
+   if (flags & MSG_TRUNC)
+   err = record_len;
+   else
+   err = len - msg->msg_iter.count;
+
+   /* Always set MSG_TRUNC if real length of packet is
+* bigger than user's buffer.
+*/
+   if (record_len > len)
+   msg->msg_flags |= MSG_TRUNC;
+   }
+
+   return err;
+}
+
static int
vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
  int flags)
@@ -2028,7 +2111,10 @@ 

Re: [RFC PATCH v7 03/22] af_vsock: separate receive data loop

2021-03-25 Thread Stefano Garzarella

On Tue, Mar 23, 2021 at 04:09:36PM +0300, Arseny Krasnov wrote:

Move the STREAM-specific data receive logic to a dedicated
'__vsock_stream_recvmsg()' function, while the checks that are the same
for both STREAM and SEQPACKET sockets stay in the shared
'vsock_connectible_recvmsg()' function.

Signed-off-by: Arseny Krasnov 
---
net/vmw_vsock/af_vsock.c | 116 ++-
1 file changed, 67 insertions(+), 49 deletions(-)


I had already reviewed this in v5, and in v6 you carried the R-b tag.

Usually the tag is only dropped if you make changes to the patch or the
reviewer is no longer happy, but that doesn't seem to be the case here.

So please keep the tags between versions :-)


Reviewed-by: Stefano Garzarella 



diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 421c0303b26f..0bc661e54262 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1895,65 +1895,22 @@ static int vsock_wait_data(struct sock *sk, struct wait_queue_entry *wait,
return data;
}

-static int
-vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
- int flags)
+static int __vsock_stream_recvmsg(struct sock *sk, struct msghdr *msg,
+ size_t len, int flags)
{
-   struct sock *sk;
-   struct vsock_sock *vsk;
+   struct vsock_transport_recv_notify_data recv_data;
const struct vsock_transport *transport;
-   int err;
-   size_t target;
+   struct vsock_sock *vsk;
ssize_t copied;
+   size_t target;
long timeout;
-   struct vsock_transport_recv_notify_data recv_data;
+   int err;

DEFINE_WAIT(wait);

-   sk = sock->sk;
vsk = vsock_sk(sk);
-   err = 0;
-
-   lock_sock(sk);
-
transport = vsk->transport;

-   if (!transport || sk->sk_state != TCP_ESTABLISHED) {
-   /* Recvmsg is supposed to return 0 if a peer performs an
-* orderly shutdown. Differentiate between that case and when a
-* peer has not connected or a local shutdown occured with the
-* SOCK_DONE flag.
-*/
-   if (sock_flag(sk, SOCK_DONE))
-   err = 0;
-   else
-   err = -ENOTCONN;
-
-   goto out;
-   }
-
-   if (flags & MSG_OOB) {
-   err = -EOPNOTSUPP;
-   goto out;
-   }
-
-   /* We don't check peer_shutdown flag here since peer may actually shut
-* down, but there can be data in the queue that a local socket can
-* receive.
-*/
-   if (sk->sk_shutdown & RCV_SHUTDOWN) {
-   err = 0;
-   goto out;
-   }
-
-   /* It is valid on Linux to pass in a zero-length receive buffer.  This
-* is not an error.  We may as well bail out now.
-*/
-   if (!len) {
-   err = 0;
-   goto out;
-   }
-
/* We must not copy less than target bytes into the user's buffer
 * before returning successfully, so we wait for the consume queue to
 * have that much data to consume before dequeueing.  Note that this
@@ -2012,6 +1969,67 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (copied > 0)
err = copied;

+out:
+   return err;
+}
+
+static int
+vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+   struct sock *sk;
+   struct vsock_sock *vsk;
+   const struct vsock_transport *transport;
+   int err;
+
+   DEFINE_WAIT(wait);
+
+   sk = sock->sk;
+   vsk = vsock_sk(sk);
+   err = 0;
+
+   lock_sock(sk);
+
+   transport = vsk->transport;
+
+   if (!transport || sk->sk_state != TCP_ESTABLISHED) {
+   /* Recvmsg is supposed to return 0 if a peer performs an
+* orderly shutdown. Differentiate between that case and when a
+* peer has not connected or a local shutdown occurred with the
+* SOCK_DONE flag.
+*/
+   if (sock_flag(sk, SOCK_DONE))
+   err = 0;
+   else
+   err = -ENOTCONN;
+
+   goto out;
+   }
+
+   if (flags & MSG_OOB) {
+   err = -EOPNOTSUPP;
+   goto out;
+   }
+
+   /* We don't check peer_shutdown flag here since peer may actually shut
+* down, but there can be data in the queue that a local socket can
+* receive.
+*/
+   if (sk->sk_shutdown & RCV_SHUTDOWN) {
+   err = 0;
+   goto out;
+   }
+
+   /* It is valid on Linux to pass in a zero-length receive buffer.  This
+* is not an error.  We may as well bail out now.
+*/
+   if (!len) {
+   err = 0;
+   goto out;
+   }
+
+   

Re: [PATCH v5 01/11] file: Export __receive_fd() to modules

2021-03-25 Thread Christoph Hellwig
On Mon, Mar 15, 2021 at 05:46:43PM +0800, Yongji Xie wrote:
> On Mon, Mar 15, 2021 at 5:08 PM Christoph Hellwig  wrote:
> >
> > On Mon, Mar 15, 2021 at 01:37:11PM +0800, Xie Yongji wrote:
> > > Export __receive_fd() so that some modules can use
> > > it to pass file descriptors between processes.
> >
> > I really don't think any non-core code should do that, especially not
> > mere modular driver code.
> 
> Do you see any issue? Right now we're able to do that with the help
> of get_unused_fd_flags() and fd_install() in modules, but we may miss
> some security stuff that way. So I'm trying to export __receive_fd() and
> use it instead.

The real problem is not which helper to use, but rather that random
drivers should not just mess with the FD table like that.
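
For context, the open-coded pattern being discussed looks roughly like the
sketch below (kernel-side, illustrative only; 'file' stands for whatever
struct file the driver wants to hand over):

	int fd = get_unused_fd_flags(O_CLOEXEC);

	if (fd < 0)
		return fd;

	/* publish the file in the calling process's fd table */
	fd_install(fd, get_file(file));
	return fd;

whereas __receive_fd() additionally runs the security_file_receive() LSM
hook before installing the fd, which is presumably the "security stuff"
referred to above.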


Re: [PATCH 15/17] iommu: remove DOMAIN_ATTR_NESTING

2021-03-25 Thread Christoph Hellwig
On Thu, Mar 25, 2021 at 06:12:37AM +, Tian, Kevin wrote:
> Agree. The vSVA series is still undergoing a refactor according to Jason's
> comments and thus won't be ready in the short term. It's better to let
> this one go in first.

Would be great to get a few more reviews while we're at it :)


Re: [PATCH v5 09/11] vduse: Introduce VDUSE - vDPA Device in Userspace

2021-03-25 Thread Jason Wang


On 2021/3/24 4:55 PM, Yongji Xie wrote:

On Wed, Mar 24, 2021 at 12:43 PM Jason Wang  wrote:


On 2021/3/15 1:37 PM, Xie Yongji wrote:

This VDUSE driver enables implementing vDPA devices in userspace.
Both the control path and the data path of vDPA devices can be
handled in userspace.

In the control path, the VDUSE driver makes use of a message
mechanism to forward config operations from the vdpa bus driver
to userspace. Userspace can use read()/write() to receive/reply to
those control messages.

In the data path, userspace can use mmap() to access the vDPA device's
iova regions obtained through the VDUSE_IOTLB_GET_ENTRY ioctl. Besides,
userspace can use ioctl() to inject interrupts and use the eventfd
mechanism to receive virtqueue kicks.
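
Purely as an illustration of that flow (the device node path, message
buffer and everything else except the read()/write()/mmap()/eventfd scheme
described above are placeholders, not the uAPI added by this patch):

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		char msg[4096];                        /* placeholder message buffer */
		int dev = open("/dev/vduse0", O_RDWR); /* placeholder device node */

		if (dev < 0)
			return 1;

		/* Control path: read a forwarded config request, handle it,
		 * then write the reply back to the kernel.
		 */
		if (read(dev, msg, sizeof(msg)) > 0)
			write(dev, msg, sizeof(msg));

		/* Data path (not shown here): iova regions found with the
		 * VDUSE_IOTLB_GET_ENTRY ioctl are mmap()ed, virtqueue kicks
		 * arrive over an eventfd, and interrupts are injected back
		 * with an ioctl() on the device fd.
		 */
		close(dev);
		return 0;
	}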

Signed-off-by: Xie Yongji 
---
   Documentation/userspace-api/ioctl/ioctl-number.rst |1 +
   drivers/vdpa/Kconfig   |   10 +
   drivers/vdpa/Makefile  |1 +
   drivers/vdpa/vdpa_user/Makefile|5 +
   drivers/vdpa/vdpa_user/vduse_dev.c | 1281 

   include/uapi/linux/vduse.h |  153 +++
   6 files changed, 1451 insertions(+)
   create mode 100644 drivers/vdpa/vdpa_user/Makefile
   create mode 100644 drivers/vdpa/vdpa_user/vduse_dev.c
   create mode 100644 include/uapi/linux/vduse.h

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index a4c75a28c839..71722e6f8f23 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -300,6 +300,7 @@ Code  Seq#Include File  
 Comments
   'z'   10-4F  drivers/s390/crypto/zcrypt_api.h
conflict!
   '|'   00-7F  linux/media.h
   0x80  00-1F  linux/fb.h
+0x81  00-1F  linux/vduse.h
   0x89  00-06  arch/x86/include/asm/sockios.h
   0x89  0B-DF  linux/sockios.h
   0x89  E0-EF  linux/sockios.h 
SIOCPROTOPRIVATE range
diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
index a245809c99d0..77a1da522c21 100644
--- a/drivers/vdpa/Kconfig
+++ b/drivers/vdpa/Kconfig
@@ -25,6 +25,16 @@ config VDPA_SIM_NET
   help
 vDPA networking device simulator which loops TX traffic back to RX.

+config VDPA_USER
+ tristate "VDUSE (vDPA Device in Userspace) support"
+ depends on EVENTFD && MMU && HAS_DMA
+ select DMA_OPS
+ select VHOST_IOTLB
+ select IOMMU_IOVA
+ help
+   With VDUSE it is possible to emulate a vDPA Device
+   in a userspace program.
+
   config IFCVF
   tristate "Intel IFC VF vDPA driver"
   depends on PCI_MSI
diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
index 67fe7f3d6943..f02ebed33f19 100644
--- a/drivers/vdpa/Makefile
+++ b/drivers/vdpa/Makefile
@@ -1,6 +1,7 @@
   # SPDX-License-Identifier: GPL-2.0
   obj-$(CONFIG_VDPA) += vdpa.o
   obj-$(CONFIG_VDPA_SIM) += vdpa_sim/
+obj-$(CONFIG_VDPA_USER) += vdpa_user/
   obj-$(CONFIG_IFCVF)+= ifcvf/
   obj-$(CONFIG_MLX5_VDPA) += mlx5/
   obj-$(CONFIG_VP_VDPA)+= virtio_pci/
diff --git a/drivers/vdpa/vdpa_user/Makefile b/drivers/vdpa/vdpa_user/Makefile
new file mode 100644
index ..260e0b26af99
--- /dev/null
+++ b/drivers/vdpa/vdpa_user/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+vduse-y := vduse_dev.o iova_domain.o
+
+obj-$(CONFIG_VDPA_USER) += vduse.o
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
new file mode 100644
index ..07d0ae92d470
--- /dev/null
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -0,0 +1,1281 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VDUSE: vDPA Device in Userspace
+ *
+ * Copyright (C) 2020 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xie Yongji 
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "iova_domain.h"
+
+#define DRV_VERSION  "1.0"
+#define DRV_AUTHOR   "Yongji Xie "
+#define DRV_DESC "vDPA Device in Userspace"
+#define DRV_LICENSE  "GPL v2"
+
+#define VDUSE_DEV_MAX (1U << MINORBITS)
+
+struct vduse_virtqueue {
+ u16 index;
+ bool ready;
+ spinlock_t kick_lock;
+ spinlock_t irq_lock;
+ struct eventfd_ctx *kickfd;
+ struct vdpa_callback cb;
+ struct work_struct inject;
+};
+
+struct vduse_dev;
+
+struct vduse_vdpa {
+ struct vdpa_device vdpa;
+ struct vduse_dev *dev;
+};
+
+struct vduse_dev {
+ struct vduse_vdpa *vdev;
+ struct device dev;
+ struct cdev cdev;
+ struct vduse_virtqueue *vqs;
+ struct vduse_iova_domain *domain;
+ spinlock_t msg_lock;
+ atomic64_t msg_unique;
+ wait_queue_head_t waitq;
+ struct list_head send_list;
+ struct list_head recv_list;
+ struct list_head 

RE: [PATCH 15/17] iommu: remove DOMAIN_ATTR_NESTING

2021-03-25 Thread Tian, Kevin
> From: Auger Eric
> Sent: Monday, March 15, 2021 3:52 PM
> To: Christoph Hellwig 
> Cc: k...@vger.kernel.org; Will Deacon ; linuxppc-
> d...@lists.ozlabs.org; dri-de...@lists.freedesktop.org; Li Yang
> ; io...@lists.linux-foundation.org;
> 
> Hi Christoph,
> 
> On 3/14/21 4:58 PM, Christoph Hellwig wrote:
> > On Sun, Mar 14, 2021 at 11:44:52AM +0100, Auger Eric wrote:
> >> As mentionned by Robin, there are series planning to use
> >> DOMAIN_ATTR_NESTING to get info about the nested caps of the iommu
> (ARM
> >> and Intel):
> >>
> >> [Patch v8 00/10] vfio: expose virtual Shared Virtual Addressing to VMs
> >> patches 1, 2, 3
> >>
> >> Is the plan to introduce a new domain_get_nesting_info ops then?
> >
> > The plan as usual would be to add it the series adding that support.
> > Not sure what the merge plans are - if the series is ready to be
> > merged I could rebase on top of it, otherwise that series will need
> > to add the method.
> OK I think your series may be upstreamed first.
> 

Agree. The vSVA series is still undergoing a refactor according to Jason's
comments and thus won't be ready in the short term. It's better to let
this one go in first.

Thanks
Kevin