Re: [PATCH net 0/4] vsock: fix server prevents clients from reconnecting

2023-11-06 Thread Stefano Garzarella

On Fri, Nov 03, 2023 at 06:55:47PM +0100, f.storniol...@gmail.com wrote:

From: Filippo Storniolo 

This patch series introduces a fix and tests for the following vsock bug:
If the same remote peer, using the same port, tries to connect
to a server on a listening port more than once, the server will
reject the connection, causing a "connection reset by peer"
error on the remote peer. This is due to the presence of a
dangling socket from a previous connection in both the connected
and bound socket lists.
The inconsistency of the above lists only occurs when the remote
peer disconnects and the server remains active.
This bug does not occur when the server socket is closed.

More details on the first patch changelog.
The remaining patches are refactoring and test.


Thanks for the fix and the test!

I only left a small comment in patch 2 which I don't think justifies a
v2 by itself though. If for some other reason you have to send a v2,
then maybe I would fix it.

I reviewed the series and ran the tests. Everything seems to be fine.

Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net 4/4] test/vsock: add double bind connect test

2023-11-06 Thread Stefano Garzarella

On Fri, Nov 03, 2023 at 06:55:51PM +0100, f.storniol...@gmail.com wrote:

From: Filippo Storniolo 

This adds a bind connect test which creates a listening server socket
and tries to connect a client with a bound local port to it twice.

Co-developed-by: Luigi Leonardi 
Signed-off-by: Luigi Leonardi 
Signed-off-by: Filippo Storniolo 
---
tools/testing/vsock/util.c   | 47 ++
tools/testing/vsock/util.h   |  3 ++
tools/testing/vsock/vsock_test.c | 50 
3 files changed, 100 insertions(+)


LGTM!

Reviewed-by: Stefano Garzarella 



diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index 2fc96f29bdf2..ae2b33c21c45 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -85,6 +85,48 @@ void vsock_wait_remote_close(int fd)
close(epollfd);
}

+/* Bind to <bind_port>, connect to <cid, port> and return the file descriptor. */
+int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int 
bind_port, int type)
+{
+   struct sockaddr_vm sa_client = {
+   .svm_family = AF_VSOCK,
+   .svm_cid = VMADDR_CID_ANY,
+   .svm_port = bind_port,
+   };
+   struct sockaddr_vm sa_server = {
+   .svm_family = AF_VSOCK,
+   .svm_cid = cid,
+   .svm_port = port,
+   };
+
+   int client_fd, ret;
+
+   client_fd = socket(AF_VSOCK, type, 0);
+   if (client_fd < 0) {
+   perror("socket");
+   exit(EXIT_FAILURE);
+   }
+
+   if (bind(client_fd, (struct sockaddr *)&sa_client, sizeof(sa_client))) {
+   perror("bind");
+   exit(EXIT_FAILURE);
+   }
+
+   timeout_begin(TIMEOUT);
+   do {
+   ret = connect(client_fd, (struct sockaddr *)&sa_server, 
sizeof(sa_server));
+   timeout_check("connect");
+   } while (ret < 0 && errno == EINTR);
+   timeout_end();
+
+   if (ret < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   return client_fd;
+}
+
/* Connect to <cid, port> and return the file descriptor. */
static int vsock_connect(unsigned int cid, unsigned int port, int type)
{
@@ -223,6 +265,11 @@ int vsock_stream_accept(unsigned int cid, unsigned int 
port,
return vsock_accept(cid, port, clientaddrp, SOCK_STREAM);
}

+int vsock_stream_listen(unsigned int cid, unsigned int port)
+{
+   return vsock_listen(cid, port, SOCK_STREAM);
+}
+
int vsock_seqpacket_accept(unsigned int cid, unsigned int port,
   struct sockaddr_vm *clientaddrp)
{
diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index a77175d25864..03c88d0cb861 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -36,9 +36,12 @@ struct test_case {
void init_signals(void);
unsigned int parse_cid(const char *str);
int vsock_stream_connect(unsigned int cid, unsigned int port);
+int vsock_bind_connect(unsigned int cid, unsigned int port,
+  unsigned int bind_port, int type);
int vsock_seqpacket_connect(unsigned int cid, unsigned int port);
int vsock_stream_accept(unsigned int cid, unsigned int port,
struct sockaddr_vm *clientaddrp);
+int vsock_stream_listen(unsigned int cid, unsigned int port);
int vsock_seqpacket_accept(unsigned int cid, unsigned int port,
   struct sockaddr_vm *clientaddrp);
void vsock_wait_remote_close(int fd);
diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index c1f7bc9abd22..5b0e93f9996c 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1180,6 +1180,51 @@ static void test_stream_shutrd_server(const struct 
test_opts *opts)
close(fd);
}

+static void test_double_bind_connect_server(const struct test_opts *opts)
+{
+   int listen_fd, client_fd, i;
+   struct sockaddr_vm sa_client;
+   socklen_t socklen_client = sizeof(sa_client);
+
+   listen_fd = vsock_stream_listen(VMADDR_CID_ANY, 1234);
+
+   for (i = 0; i < 2; i++) {
+   control_writeln("LISTENING");
+
+   timeout_begin(TIMEOUT);
+   do {
+   client_fd = accept(listen_fd, (struct sockaddr 
*)&sa_client,
+  &socklen_client);
+   timeout_check("accept");
+   } while (client_fd < 0 && errno == EINTR);
+   timeout_end();
+
+   if (client_fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Waiting for remote peer to close connection */
+   vsock_wait_remote_close(client_fd);
+   }
+
+   close(listen_fd);
+}
+
+static void test_double_bind_connect_client(const struct test_opts *opts)
+{
+   int i, clie

Re: [PATCH net 3/4] test/vsock: refactor vsock_accept

2023-11-06 Thread Stefano Garzarella

On Fri, Nov 03, 2023 at 06:55:50PM +0100, f.storniol...@gmail.com wrote:

From: Filippo Storniolo 

This is a preliminary patch to introduce SOCK_STREAM bind connect test.
vsock_accept() is split into vsock_listen() and vsock_accept().

Co-developed-by: Luigi Leonardi 
Signed-off-by: Luigi Leonardi 
Signed-off-by: Filippo Storniolo 
---
tools/testing/vsock/util.c | 32 
1 file changed, 20 insertions(+), 12 deletions(-)


LGTM!

Reviewed-by: Stefano Garzarella 



diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index 698b0b44a2ee..2fc96f29bdf2 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -136,11 +136,8 @@ int vsock_seqpacket_connect(unsigned int cid, unsigned int 
port)
return vsock_connect(cid, port, SOCK_SEQPACKET);
}

-/* Listen on <cid, port> and return the first incoming connection.  The remote
- * address is stored to clientaddrp.  clientaddrp may be NULL.
- */
-static int vsock_accept(unsigned int cid, unsigned int port,
-   struct sockaddr_vm *clientaddrp, int type)
+/* Listen on <cid, port> and return the file descriptor. */
+static int vsock_listen(unsigned int cid, unsigned int port, int type)
{
union {
struct sockaddr sa;
@@ -152,14 +149,7 @@ static int vsock_accept(unsigned int cid, unsigned int 
port,
.svm_cid = cid,
},
};
-   union {
-   struct sockaddr sa;
-   struct sockaddr_vm svm;
-   } clientaddr;
-   socklen_t clientaddr_len = sizeof(clientaddr.svm);
int fd;
-   int client_fd;
-   int old_errno;

fd = socket(AF_VSOCK, type, 0);
if (fd < 0) {
@@ -177,6 +167,24 @@ static int vsock_accept(unsigned int cid, unsigned int 
port,
exit(EXIT_FAILURE);
}

+   return fd;
+}
+
+/* Listen on <cid, port> and return the first incoming connection.  The remote
+ * address is stored to clientaddrp.  clientaddrp may be NULL.
+ */
+static int vsock_accept(unsigned int cid, unsigned int port,
+   struct sockaddr_vm *clientaddrp, int type)
+{
+   union {
+   struct sockaddr sa;
+   struct sockaddr_vm svm;
+   } clientaddr;
+   socklen_t clientaddr_len = sizeof(clientaddr.svm);
+   int fd, client_fd, old_errno;
+
+   fd = vsock_listen(cid, port, type);
+
control_writeln("LISTENING");

timeout_begin(TIMEOUT);
--
2.41.0



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net 2/4] test/vsock fix: add missing check on socket creation

2023-11-06 Thread Stefano Garzarella

On Fri, Nov 03, 2023 at 06:55:49PM +0100, f.storniol...@gmail.com wrote:

From: Filippo Storniolo 

Add check on socket() return value in vsock_listen()
and vsock_connect()

Co-developed-by: Luigi Leonardi 
Signed-off-by: Luigi Leonardi 
Signed-off-by: Filippo Storniolo 
---
tools/testing/vsock/util.c | 8 
1 file changed, 8 insertions(+)


If you need to resend the entire series, maybe you can remove "fix"
from the commit title.

But it's a minor thing, so I would only change it if there's something
else that justifies sending a v2:

Reviewed-by: Stefano Garzarella 



diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index 92336721321a..698b0b44a2ee 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -104,6 +104,10 @@ static int vsock_connect(unsigned int cid, unsigned int 
port, int type)
control_expectln("LISTENING");

fd = socket(AF_VSOCK, type, 0);
+   if (fd < 0) {
+   perror("socket");
+   exit(EXIT_FAILURE);
+   }

timeout_begin(TIMEOUT);
do {
@@ -158,6 +162,10 @@ static int vsock_accept(unsigned int cid, unsigned int 
port,
int old_errno;

fd = socket(AF_VSOCK, type, 0);
+   if (fd < 0) {
+   perror("socket");
+   exit(EXIT_FAILURE);
+   }

if (bind(fd, &addr.sa, sizeof(addr.svm)) < 0) {
perror("bind");
--
2.41.0



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net 1/4] vsock/virtio: remove socket from connected/bound list on shutdown

2023-11-06 Thread Stefano Garzarella

On Fri, Nov 03, 2023 at 06:55:48PM +0100, f.storniol...@gmail.com wrote:

From: Filippo Storniolo 

If the same remote peer, using the same port, tries to connect
to a server on a listening port more than once, the server will
reject the connection, causing a "connection reset by peer"
error on the remote peer. This is due to the presence of a
dangling socket from a previous connection in both the connected
and bound socket lists.
The inconsistency of the above lists only occurs when the remote
peer disconnects and the server remains active.

This bug does not occur when the server socket is closed:
virtio_transport_release() will eventually schedule a call to
virtio_transport_do_close() and the latter will remove the socket
from the bound and connected socket lists and clear the sk_buff.

However, virtio_transport_do_close() will only perform the above
actions if it has been scheduled, and this will not happen
if the server is processing the shutdown message from a remote peer.

To fix this, introduce a call to vsock_remove_sock()
when the server is handling a client disconnect.
This is to remove the socket from the bound and connected socket
lists without clearing the sk_buff.

Fixes: 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko")
Reported-by: Daan De Meyer 
Tested-by: Daan De Meyer 
Co-developed-by: Luigi Leonardi 
Signed-off-by: Luigi Leonardi 
Signed-off-by: Filippo Storniolo 
---
net/vmw_vsock/virtio_transport_common.c | 16 +++-
1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index e22c81435ef7..4c595dd1fd64 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1369,11 +1369,17 @@ virtio_transport_recv_connected(struct sock *sk,
vsk->peer_shutdown |= RCV_SHUTDOWN;
if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_SEND)
vsk->peer_shutdown |= SEND_SHUTDOWN;
-   if (vsk->peer_shutdown == SHUTDOWN_MASK &&
-   vsock_stream_has_data(vsk) <= 0 &&
-   !sock_flag(sk, SOCK_DONE)) {
-   (void)virtio_transport_reset(vsk, NULL);
-   virtio_transport_do_close(vsk, true);
+   if (vsk->peer_shutdown == SHUTDOWN_MASK) {
+   if (vsock_stream_has_data(vsk) <= 0 && !sock_flag(sk, 
SOCK_DONE)) {
+   (void)virtio_transport_reset(vsk, NULL);
+   virtio_transport_do_close(vsk, true);
+   }
+   /* Remove this socket anyway because the remote peer 
sent
+* the shutdown. This way a new connection will succeed
+* if the remote peer uses the same source port,
+* even if the old socket is still unreleased, but now 
disconnected.
+*/
+   vsock_remove_sock(vsk);
}
if (le32_to_cpu(virtio_vsock_hdr(skb)->flags))
sk->sk_state_change(sk);
--
2.41.0



Thanks for fixing this issue! LGTM.

Just to inform other maintainers as well. Daan reported this issue to me
at DevConf.cz, I shared it with Filippo and Luigi who analyzed and
solved it.

Reviewed-by: Stefano Garzarella 


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net v2] virtio/vsock: Fix uninit-value in virtio_transport_recv_pkt()

2023-11-06 Thread Stefano Garzarella

On Sun, Nov 05, 2023 at 12:05:31AM +0900, Shigeru Yoshida wrote:

KMSAN reported the following uninit-value access issue:

=
BUG: KMSAN: uninit-value in virtio_transport_recv_pkt+0x1dfb/0x26a0 
net/vmw_vsock/virtio_transport_common.c:1421
virtio_transport_recv_pkt+0x1dfb/0x26a0 
net/vmw_vsock/virtio_transport_common.c:1421
vsock_loopback_work+0x3bb/0x5a0 net/vmw_vsock/vsock_loopback.c:120
process_one_work kernel/workqueue.c:2630 [inline]
process_scheduled_works+0xff6/0x1e60 kernel/workqueue.c:2703
worker_thread+0xeca/0x14d0 kernel/workqueue.c:2784
kthread+0x3cc/0x520 kernel/kthread.c:388
ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147
ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304

Uninit was stored to memory at:
virtio_transport_space_update net/vmw_vsock/virtio_transport_common.c:1274 
[inline]
virtio_transport_recv_pkt+0x1ee8/0x26a0 
net/vmw_vsock/virtio_transport_common.c:1415
vsock_loopback_work+0x3bb/0x5a0 net/vmw_vsock/vsock_loopback.c:120
process_one_work kernel/workqueue.c:2630 [inline]
process_scheduled_works+0xff6/0x1e60 kernel/workqueue.c:2703
worker_thread+0xeca/0x14d0 kernel/workqueue.c:2784
kthread+0x3cc/0x520 kernel/kthread.c:388
ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147
ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304

Uninit was created at:
slab_post_alloc_hook+0x105/0xad0 mm/slab.h:767
slab_alloc_node mm/slub.c:3478 [inline]
kmem_cache_alloc_node+0x5a2/0xaf0 mm/slub.c:3523
kmalloc_reserve+0x13c/0x4a0 net/core/skbuff.c:559
__alloc_skb+0x2fd/0x770 net/core/skbuff.c:650
alloc_skb include/linux/skbuff.h:1286 [inline]
virtio_vsock_alloc_skb include/linux/virtio_vsock.h:66 [inline]
virtio_transport_alloc_skb+0x90/0x11e0 
net/vmw_vsock/virtio_transport_common.c:58
virtio_transport_reset_no_sock net/vmw_vsock/virtio_transport_common.c:957 
[inline]
virtio_transport_recv_pkt+0x1279/0x26a0 
net/vmw_vsock/virtio_transport_common.c:1387
vsock_loopback_work+0x3bb/0x5a0 net/vmw_vsock/vsock_loopback.c:120
process_one_work kernel/workqueue.c:2630 [inline]
process_scheduled_works+0xff6/0x1e60 kernel/workqueue.c:2703
worker_thread+0xeca/0x14d0 kernel/workqueue.c:2784
kthread+0x3cc/0x520 kernel/kthread.c:388
ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147
ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304

CPU: 1 PID: 10664 Comm: kworker/1:5 Not tainted 6.6.0-rc3-00146-g9f3ebbef746f #3
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc38 
04/01/2014
Workqueue: vsock-loopback vsock_loopback_work
=

The following simple reproducer can cause the issue described above:

int main(void)
{
 int sock;
 struct sockaddr_vm addr = {
   .svm_family = AF_VSOCK,
   .svm_cid = VMADDR_CID_ANY,
   .svm_port = 1234,
 };

 sock = socket(AF_VSOCK, SOCK_STREAM, 0);
 connect(sock, (struct sockaddr *)&addr, sizeof(addr));
 return 0;
}

This issue occurs because the `buf_alloc` and `fwd_cnt` fields of the
`struct virtio_vsock_hdr` are not initialized when a new skb is allocated
in `virtio_transport_init_hdr()`. This patch resolves the issue by
initializing these fields during allocation.

Fixes: 71dc9ec9ac7d ("virtio/vsock: replace virtio_vsock_pkt with sk_buff")
Reported-and-tested-by: syzbot+0c8ce1da0ac31abba...@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=0c8ce1da0ac31abbadcd
Signed-off-by: Shigeru Yoshida 
---
v1->v2:
- Rebase on the latest net tree
https://lore.kernel.org/all/20231026150154.3536433-1-syosh...@redhat.com/
---
net/vmw_vsock/virtio_transport_common.c | 2 ++
1 file changed, 2 insertions(+)


The patch remained the same, so you could bring back my R-b ;-)
In any case:

Reviewed-by: Stefano Garzarella 

Thanks,
Stefano



diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index e22c81435ef7..dc65dd4d26df 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -130,6 +130,8 @@ static void virtio_transport_init_hdr(struct sk_buff *skb,
hdr->dst_port= cpu_to_le32(dst_port);
hdr->flags   = cpu_to_le32(info->flags);
hdr->len = cpu_to_le32(payload_len);
+   hdr->buf_alloc   = cpu_to_le32(0);
+   hdr->fwd_cnt = cpu_to_le32(0);
}

static void virtio_transport_copy_nonlinear_skb(const struct sk_buff *skb,
--
2.41.0



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v4] ALSA: virtio: use ack callback

2023-10-31 Thread Stefano Garzarella

On Fri, Oct 27, 2023 at 10:10:30AM -0400, Michael S. Tsirkin wrote:

On Fri, Oct 27, 2023 at 12:18:00PM +0200, Stefano Garzarella wrote:

On Fri, Oct 27, 2023 at 11:27:40AM +0200, Takashi Iwai wrote:
> On Wed, 25 Oct 2023 11:49:19 +0200,
> Matias Ezequiel Vara Larsen wrote:
> >
> > This commit uses the ack() callback to determine when a buffer has been
> > updated, then exposes it to guest.
> >
> > The current mechanism splits a dma buffer into descriptors that are
> > exposed to the device. This dma buffer is shared with the user
> > application. When the device consumes a buffer, the driver moves the
> > request from the used ring to available ring.
> >
> > The driver exposes the buffer to the device without knowing if the
> > content has been updated from the user. The section 2.8.21.1 of the
> > virtio spec states that: "The device MAY access the descriptor chains
> > the driver created and the memory they refer to immediately". If the
> > device picks up buffers from the available ring just after it is
> > notified, it happens that the content may be old.
> >
> > When the ack() callback is invoked, the driver exposes only the buffers
> > that have already been updated, i.e., enqueued in the available ring.
> > Thus, the device always picks up a buffer that is updated.
> >
> > For capturing, the driver starts by exposing all the available buffers
> > to device. After device updates the content of a buffer, it enqueues it
> > in the used ring. It is only after the ack() for capturing is issued
> > that the driver re-enqueues the buffer in the available ring.
> >
> > Co-developed-by: Anton Yakovlev 
> > Signed-off-by: Anton Yakovlev 
> > Signed-off-by: Matias Ezequiel Vara Larsen 
>
> Applied now to for-next branch.

Cool, thanks for that!

I just wonder if we should CC stable since we are fixing a virtio
specification violation.

@Michael what do you think?

Thanks,
Stefano



Acked-by: Michael S. Tsirkin 
Fixes: de3a9980d8c3 ("ALSA: virtio: add virtio sound driver")


The patch is too big for stable - more than 100 lines added. See:
Documentation/process/stable-kernel-rules.rst


Yeah, I see, thanks for sharing!

Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH] vdpa_sim_blk: allocate the buffer zeroed

2023-10-31 Thread Stefano Garzarella
Deleting and recreating a device can lead to having the same
content as the old device, so let's always allocate buffers
completely zeroed out.

Fixes: abebb16254b3 ("vdpa_sim_blk: support shared backend")
Suggested-by: Qing Wang 
Signed-off-by: Stefano Garzarella 
---
 drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c 
b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
index b3a3cb165795..b137f3679343 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
@@ -437,7 +437,7 @@ static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, 
const char *name,
if (blk->shared_backend) {
blk->buffer = shared_buffer;
} else {
-   blk->buffer = kvmalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT,
+   blk->buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT,
   GFP_KERNEL);
if (!blk->buffer) {
ret = -ENOMEM;
@@ -495,7 +495,7 @@ static int __init vdpasim_blk_init(void)
goto parent_err;
 
if (shared_backend) {
-   shared_buffer = kvmalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT,
+   shared_buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT,
 GFP_KERNEL);
if (!shared_buffer) {
ret = -ENOMEM;
-- 
2.41.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v4] ALSA: virtio: use ack callback

2023-10-27 Thread Stefano Garzarella

On Fri, Oct 27, 2023 at 11:27:40AM +0200, Takashi Iwai wrote:

On Wed, 25 Oct 2023 11:49:19 +0200,
Matias Ezequiel Vara Larsen wrote:


This commit uses the ack() callback to determine when a buffer has been
updated, then exposes it to guest.

The current mechanism splits a dma buffer into descriptors that are
exposed to the device. This dma buffer is shared with the user
application. When the device consumes a buffer, the driver moves the
request from the used ring to available ring.

The driver exposes the buffer to the device without knowing if the
content has been updated from the user. The section 2.8.21.1 of the
virtio spec states that: "The device MAY access the descriptor chains
the driver created and the memory they refer to immediately". If the
device picks up buffers from the available ring just after it is
notified, it happens that the content may be old.

When the ack() callback is invoked, the driver exposes only the buffers
that have already been updated, i.e., enqueued in the available ring.
Thus, the device always picks up a buffer that is updated.

For capturing, the driver starts by exposing all the available buffers
to device. After device updates the content of a buffer, it enqueues it
in the used ring. It is only after the ack() for capturing is issued
that the driver re-enqueues the buffer in the available ring.

Co-developed-by: Anton Yakovlev 
Signed-off-by: Anton Yakovlev 
Signed-off-by: Matias Ezequiel Vara Larsen 


Applied now to for-next branch.


Cool, thanks for that!

I just wonder if we should CC stable since we are fixing a virtio
specification violation.

@Michael what do you think?

Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [syzbot] [net?] KMSAN: uninit-value in virtio_transport_recv_pkt

2023-10-27 Thread Stefano Garzarella

On Fri, Oct 27, 2023 at 10:48:39AM +0200, Eric Dumazet wrote:

On Fri, Oct 27, 2023 at 10:25 AM Stefano Garzarella  wrote:


On Fri, Oct 27, 2023 at 01:11:24AM -0700, syzbot wrote:
>Hello,
>
>syzbot found the following issue on:
>
>HEAD commit:d90b0276af8f Merge tag 'hardening-v6.6-rc3' of git://git.k..
>git tree:   upstream
>console+strace: https://syzkaller.appspot.com/x/log.txt?x=102c8b2268
>kernel config:  https://syzkaller.appspot.com/x/.config?x=6f1a4029b69273f3
>dashboard link: https://syzkaller.appspot.com/bug?extid=0c8ce1da0ac31abbadcd
>compiler:   Debian clang version 15.0.6, GNU ld (GNU Binutils for Debian) 
2.40
>syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=101e58ec68
>C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=17f7adb668
>
>Downloadable assets:
>disk image: 
https://storage.googleapis.com/syzbot-assets/83ae10beee39/disk-d90b0276.raw.xz
>vmlinux: 
https://storage.googleapis.com/syzbot-assets/c231992300f6/vmlinux-d90b0276.xz
>kernel image: 
https://storage.googleapis.com/syzbot-assets/6377c9c2ea97/bzImage-d90b0276.xz
>
>IMPORTANT: if you fix the issue, please add the following tag to the commit:
>Reported-by: syzbot+0c8ce1da0ac31abba...@syzkaller.appspotmail.com
>
>=
>BUG: KMSAN: uninit-value in virtio_transport_recv_pkt+0x1c42/0x2580 
net/vmw_vsock/virtio_transport_common.c:1421
> virtio_transport_recv_pkt+0x1c42/0x2580 
net/vmw_vsock/virtio_transport_common.c:1421
> vsock_loopback_work+0x3e2/0x5d0 net/vmw_vsock/vsock_loopback.c:120
> process_one_work kernel/workqueue.c:2630 [inline]
> process_scheduled_works+0x104e/0x1e70 kernel/workqueue.c:2703
> worker_thread+0xf45/0x1490 kernel/workqueue.c:2784
> kthread+0x3e8/0x540 kernel/kthread.c:388
> ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147
> ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304
>
>Uninit was stored to memory at:
> virtio_transport_space_update net/vmw_vsock/virtio_transport_common.c:1274 
[inline]
> virtio_transport_recv_pkt+0x1ea4/0x2580 
net/vmw_vsock/virtio_transport_common.c:1415
> vsock_loopback_work+0x3e2/0x5d0 net/vmw_vsock/vsock_loopback.c:120
> process_one_work kernel/workqueue.c:2630 [inline]
> process_scheduled_works+0x104e/0x1e70 kernel/workqueue.c:2703
> worker_thread+0xf45/0x1490 kernel/workqueue.c:2784
> kthread+0x3e8/0x540 kernel/kthread.c:388
> ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147
> ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304
>
>Uninit was created at:
> slab_post_alloc_hook+0x12f/0xb70 mm/slab.h:767
> slab_alloc_node mm/slub.c:3478 [inline]
> kmem_cache_alloc_node+0x577/0xa80 mm/slub.c:3523
> kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:559
> __alloc_skb+0x318/0x740 net/core/skbuff.c:650
> alloc_skb include/linux/skbuff.h:1286 [inline]
> virtio_vsock_alloc_skb include/linux/virtio_vsock.h:66 [inline]
> virtio_transport_alloc_skb+0x8b/0x1170 
net/vmw_vsock/virtio_transport_common.c:58
> virtio_transport_reset_no_sock net/vmw_vsock/virtio_transport_common.c:957 
[inline]
> virtio_transport_recv_pkt+0x1531/0x2580 
net/vmw_vsock/virtio_transport_common.c:1387
> vsock_loopback_work+0x3e2/0x5d0 net/vmw_vsock/vsock_loopback.c:120
> process_one_work kernel/workqueue.c:2630 [inline]
> process_scheduled_works+0x104e/0x1e70 kernel/workqueue.c:2703
> worker_thread+0xf45/0x1490 kernel/workqueue.c:2784
> kthread+0x3e8/0x540 kernel/kthread.c:388
> ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147
> ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304
>
>CPU: 0 PID: 8 Comm: kworker/0:0 Not tainted 
6.6.0-rc2-syzkaller-00337-gd90b0276af8f #0
>Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
08/04/2023
>Workqueue: vsock-loopback vsock_loopback_work
>=
>

Shigeru Yoshida already posted a patch here:

https://lore.kernel.org/netdev/20231026150154.3536433-1-syosh...@redhat.com/


Sure thing, this is why I released this syzbot report from my queue.



Thanks for that ;-)

Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [syzbot] [net?] KMSAN: uninit-value in virtio_transport_recv_pkt

2023-10-27 Thread Stefano Garzarella

On Fri, Oct 27, 2023 at 01:11:24AM -0700, syzbot wrote:

Hello,

syzbot found the following issue on:

HEAD commit:d90b0276af8f Merge tag 'hardening-v6.6-rc3' of git://git.k..
git tree:   upstream
console+strace: https://syzkaller.appspot.com/x/log.txt?x=102c8b2268
kernel config:  https://syzkaller.appspot.com/x/.config?x=6f1a4029b69273f3
dashboard link: https://syzkaller.appspot.com/bug?extid=0c8ce1da0ac31abbadcd
compiler:   Debian clang version 15.0.6, GNU ld (GNU Binutils for Debian) 
2.40
syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=101e58ec68
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=17f7adb668

Downloadable assets:
disk image: 
https://storage.googleapis.com/syzbot-assets/83ae10beee39/disk-d90b0276.raw.xz
vmlinux: 
https://storage.googleapis.com/syzbot-assets/c231992300f6/vmlinux-d90b0276.xz
kernel image: 
https://storage.googleapis.com/syzbot-assets/6377c9c2ea97/bzImage-d90b0276.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+0c8ce1da0ac31abba...@syzkaller.appspotmail.com

=
BUG: KMSAN: uninit-value in virtio_transport_recv_pkt+0x1c42/0x2580 
net/vmw_vsock/virtio_transport_common.c:1421
virtio_transport_recv_pkt+0x1c42/0x2580 
net/vmw_vsock/virtio_transport_common.c:1421
vsock_loopback_work+0x3e2/0x5d0 net/vmw_vsock/vsock_loopback.c:120
process_one_work kernel/workqueue.c:2630 [inline]
process_scheduled_works+0x104e/0x1e70 kernel/workqueue.c:2703
worker_thread+0xf45/0x1490 kernel/workqueue.c:2784
kthread+0x3e8/0x540 kernel/kthread.c:388
ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147
ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304

Uninit was stored to memory at:
virtio_transport_space_update net/vmw_vsock/virtio_transport_common.c:1274 
[inline]
virtio_transport_recv_pkt+0x1ea4/0x2580 
net/vmw_vsock/virtio_transport_common.c:1415
vsock_loopback_work+0x3e2/0x5d0 net/vmw_vsock/vsock_loopback.c:120
process_one_work kernel/workqueue.c:2630 [inline]
process_scheduled_works+0x104e/0x1e70 kernel/workqueue.c:2703
worker_thread+0xf45/0x1490 kernel/workqueue.c:2784
kthread+0x3e8/0x540 kernel/kthread.c:388
ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147
ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304

Uninit was created at:
slab_post_alloc_hook+0x12f/0xb70 mm/slab.h:767
slab_alloc_node mm/slub.c:3478 [inline]
kmem_cache_alloc_node+0x577/0xa80 mm/slub.c:3523
kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:559
__alloc_skb+0x318/0x740 net/core/skbuff.c:650
alloc_skb include/linux/skbuff.h:1286 [inline]
virtio_vsock_alloc_skb include/linux/virtio_vsock.h:66 [inline]
virtio_transport_alloc_skb+0x8b/0x1170 
net/vmw_vsock/virtio_transport_common.c:58
virtio_transport_reset_no_sock net/vmw_vsock/virtio_transport_common.c:957 
[inline]
virtio_transport_recv_pkt+0x1531/0x2580 
net/vmw_vsock/virtio_transport_common.c:1387
vsock_loopback_work+0x3e2/0x5d0 net/vmw_vsock/vsock_loopback.c:120
process_one_work kernel/workqueue.c:2630 [inline]
process_scheduled_works+0x104e/0x1e70 kernel/workqueue.c:2703
worker_thread+0xf45/0x1490 kernel/workqueue.c:2784
kthread+0x3e8/0x540 kernel/kthread.c:388
ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147
ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304

CPU: 0 PID: 8 Comm: kworker/0:0 Not tainted 
6.6.0-rc2-syzkaller-00337-gd90b0276af8f #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
08/04/2023
Workqueue: vsock-loopback vsock_loopback_work
=



Shigeru Yoshida already posted a patch here:

https://lore.kernel.org/netdev/20231026150154.3536433-1-syosh...@redhat.com/

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net] virtio/vsock: Fix uninit-value in virtio_transport_recv_pkt()

2023-10-27 Thread Stefano Garzarella
On Fri, Oct 27, 2023 at 10:01 AM Stefano Garzarella  wrote:
>
> On Fri, Oct 27, 2023 at 12:01:54AM +0900, Shigeru Yoshida wrote:
> >KMSAN reported the following uninit-value access issue:
> >
> >=
> >BUG: KMSAN: uninit-value in virtio_transport_recv_pkt+0x1dfb/0x26a0 
> >net/vmw_vsock/virtio_transport_common.c:1421
> > virtio_transport_recv_pkt+0x1dfb/0x26a0 
> > net/vmw_vsock/virtio_transport_common.c:1421
> > vsock_loopback_work+0x3bb/0x5a0 net/vmw_vsock/vsock_loopback.c:120
> > process_one_work kernel/workqueue.c:2630 [inline]
> > process_scheduled_works+0xff6/0x1e60 kernel/workqueue.c:2703
> > worker_thread+0xeca/0x14d0 kernel/workqueue.c:2784
> > kthread+0x3cc/0x520 kernel/kthread.c:388
> > ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147
> > ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304
> >
> >Uninit was stored to memory at:
> > virtio_transport_space_update net/vmw_vsock/virtio_transport_common.c:1274 
> > [inline]
> > virtio_transport_recv_pkt+0x1ee8/0x26a0 
> > net/vmw_vsock/virtio_transport_common.c:1415
> > vsock_loopback_work+0x3bb/0x5a0 net/vmw_vsock/vsock_loopback.c:120
> > process_one_work kernel/workqueue.c:2630 [inline]
> > process_scheduled_works+0xff6/0x1e60 kernel/workqueue.c:2703
> > worker_thread+0xeca/0x14d0 kernel/workqueue.c:2784
> > kthread+0x3cc/0x520 kernel/kthread.c:388
> > ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147
> > ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304
> >
> >Uninit was created at:
> > slab_post_alloc_hook+0x105/0xad0 mm/slab.h:767
> > slab_alloc_node mm/slub.c:3478 [inline]
> > kmem_cache_alloc_node+0x5a2/0xaf0 mm/slub.c:3523
> > kmalloc_reserve+0x13c/0x4a0 net/core/skbuff.c:559
> > __alloc_skb+0x2fd/0x770 net/core/skbuff.c:650
> > alloc_skb include/linux/skbuff.h:1286 [inline]
> > virtio_vsock_alloc_skb include/linux/virtio_vsock.h:66 [inline]
> > virtio_transport_alloc_skb+0x90/0x11e0 
> > net/vmw_vsock/virtio_transport_common.c:58
> > virtio_transport_reset_no_sock net/vmw_vsock/virtio_transport_common.c:957 
> > [inline]
> > virtio_transport_recv_pkt+0x1279/0x26a0 
> > net/vmw_vsock/virtio_transport_common.c:1387
> > vsock_loopback_work+0x3bb/0x5a0 net/vmw_vsock/vsock_loopback.c:120
> > process_one_work kernel/workqueue.c:2630 [inline]
> > process_scheduled_works+0xff6/0x1e60 kernel/workqueue.c:2703
> > worker_thread+0xeca/0x14d0 kernel/workqueue.c:2784
> > kthread+0x3cc/0x520 kernel/kthread.c:388
> > ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147
> > ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304
> >
> >CPU: 1 PID: 10664 Comm: kworker/1:5 Not tainted 
> >6.6.0-rc3-00146-g9f3ebbef746f #3
> >Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc38 
> >04/01/2014
> >Workqueue: vsock-loopback vsock_loopback_work
> >=
> >
> >The following simple reproducer can cause the issue described above:
> >
> >int main(void)
> >{
> >  int sock;
> >  struct sockaddr_vm addr = {
> >.svm_family = AF_VSOCK,
> >.svm_cid = VMADDR_CID_ANY,
> >.svm_port = 1234,
> >  };
> >
> >  sock = socket(AF_VSOCK, SOCK_STREAM, 0);
> >  connect(sock, (struct sockaddr *)&addr, sizeof(addr));
> >  return 0;
> >}
> >
> >This issue occurs because the `buf_alloc` and `fwd_cnt` fields of the
> >`struct virtio_vsock_hdr` are not initialized when a new skb is allocated
> >in `virtio_transport_alloc_skb()`. This patch resolves the issue by
> >initializing these fields during allocation.
> >
> >Fixes: 71dc9ec9ac7d ("virtio/vsock: replace virtio_vsock_pkt with sk_buff")
>
> CCing Bobby, the original author, for any additional comments/checks.
>
> Yeah, I see, before that commit we used kzalloc() to allocate the
> header so we forgot to reset these 2 fields, and checking they are
> the only 2 missing.
>
> I was thinking of putting a memset(hdr, 0, sizeof(*hdr)) in
> virtio_vsock_alloc_skb() but I think it's just extra unnecessary work,
> since here we set all the fields (thanks to this fix), in vhost/vsock.c
> we copy all the header we receive from the guest and in
> virtio_transport.c we already set it all to 0 because we are
> preallocating the receive buffers.
>
> So I'm fine with this fix!
>
> >Signed-off-by: Shigeru Yoshida 
> >---
> > net/vmw_vsock/virtio_transport_common.c | 2 ++
> > 1 file changed, 2 insertions(+)
> >
> >diff --git a/net/vmw_vsock/virt

Re: [PATCH net] virtio/vsock: Fix uninit-value in virtio_transport_recv_pkt()

2023-10-27 Thread Stefano Garzarella

On Fri, Oct 27, 2023 at 12:01:54AM +0900, Shigeru Yoshida wrote:

KMSAN reported the following uninit-value access issue:

=
BUG: KMSAN: uninit-value in virtio_transport_recv_pkt+0x1dfb/0x26a0 
net/vmw_vsock/virtio_transport_common.c:1421
virtio_transport_recv_pkt+0x1dfb/0x26a0 
net/vmw_vsock/virtio_transport_common.c:1421
vsock_loopback_work+0x3bb/0x5a0 net/vmw_vsock/vsock_loopback.c:120
process_one_work kernel/workqueue.c:2630 [inline]
process_scheduled_works+0xff6/0x1e60 kernel/workqueue.c:2703
worker_thread+0xeca/0x14d0 kernel/workqueue.c:2784
kthread+0x3cc/0x520 kernel/kthread.c:388
ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147
ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304

Uninit was stored to memory at:
virtio_transport_space_update net/vmw_vsock/virtio_transport_common.c:1274 
[inline]
virtio_transport_recv_pkt+0x1ee8/0x26a0 
net/vmw_vsock/virtio_transport_common.c:1415
vsock_loopback_work+0x3bb/0x5a0 net/vmw_vsock/vsock_loopback.c:120
process_one_work kernel/workqueue.c:2630 [inline]
process_scheduled_works+0xff6/0x1e60 kernel/workqueue.c:2703
worker_thread+0xeca/0x14d0 kernel/workqueue.c:2784
kthread+0x3cc/0x520 kernel/kthread.c:388
ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147
ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304

Uninit was created at:
slab_post_alloc_hook+0x105/0xad0 mm/slab.h:767
slab_alloc_node mm/slub.c:3478 [inline]
kmem_cache_alloc_node+0x5a2/0xaf0 mm/slub.c:3523
kmalloc_reserve+0x13c/0x4a0 net/core/skbuff.c:559
__alloc_skb+0x2fd/0x770 net/core/skbuff.c:650
alloc_skb include/linux/skbuff.h:1286 [inline]
virtio_vsock_alloc_skb include/linux/virtio_vsock.h:66 [inline]
virtio_transport_alloc_skb+0x90/0x11e0 
net/vmw_vsock/virtio_transport_common.c:58
virtio_transport_reset_no_sock net/vmw_vsock/virtio_transport_common.c:957 
[inline]
virtio_transport_recv_pkt+0x1279/0x26a0 
net/vmw_vsock/virtio_transport_common.c:1387
vsock_loopback_work+0x3bb/0x5a0 net/vmw_vsock/vsock_loopback.c:120
process_one_work kernel/workqueue.c:2630 [inline]
process_scheduled_works+0xff6/0x1e60 kernel/workqueue.c:2703
worker_thread+0xeca/0x14d0 kernel/workqueue.c:2784
kthread+0x3cc/0x520 kernel/kthread.c:388
ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147
ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304

CPU: 1 PID: 10664 Comm: kworker/1:5 Not tainted 6.6.0-rc3-00146-g9f3ebbef746f #3
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc38 
04/01/2014
Workqueue: vsock-loopback vsock_loopback_work
=

The following simple reproducer can cause the issue described above:

int main(void)
{
 int sock;
 struct sockaddr_vm addr = {
   .svm_family = AF_VSOCK,
   .svm_cid = VMADDR_CID_ANY,
   .svm_port = 1234,
 };

 sock = socket(AF_VSOCK, SOCK_STREAM, 0);
 connect(sock, (struct sockaddr *)&addr, sizeof(addr));
 return 0;
}

This issue occurs because the `buf_alloc` and `fwd_cnt` fields of the
`struct virtio_vsock_hdr` are not initialized when a new skb is allocated
in `virtio_transport_alloc_skb()`. This patch resolves the issue by
initializing these fields during allocation.

Fixes: 71dc9ec9ac7d ("virtio/vsock: replace virtio_vsock_pkt with sk_buff")


CCing Bobby, the original author, for any additional comments/checks.

Yeah, I see, before that commit we used kzalloc() to allocate the
header so we forgot to reset these 2 fields, and checking they are
the only 2 missing.

I was thinking of putting a memset(hdr, 0, sizeof(*hdr)) in
virtio_vsock_alloc_skb() but I think it's just extra unnecessary work,
since here we set all the fields (thanks to this fix), in vhost/vsock.c
we copy all the header we receive from the guest and in
virtio_transport.c we already set it all to 0 because we are
preallocating the receive buffers.

So I'm fine with this fix!


Signed-off-by: Shigeru Yoshida 
---
net/vmw_vsock/virtio_transport_common.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 352d042b130b..102673bef189 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -68,6 +68,8 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
hdr->dst_port= cpu_to_le32(dst_port);
hdr->flags   = cpu_to_le32(info->flags);
hdr->len = cpu_to_le32(len);
+   hdr->buf_alloc   = cpu_to_le32(0);
+   hdr->fwd_cnt = cpu_to_le32(0);

if (info->msg && len > 0) {
payload = skb_put(skb, len);
--
2.41.0



Reviewed-by: Stefano Garzarella 

Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v4] vsock/virtio: initialize the_virtio_vsock before using VQs

2023-10-25 Thread Stefano Garzarella

On Tue, Oct 24, 2023 at 10:17:42PM +0300, Alexandru Matei wrote:

Once VQs are filled with empty buffers and we kick the host, it can send
connection requests. If the_virtio_vsock is not initialized before,
replies are silently dropped and do not reach the host.

virtio_transport_send_pkt() can queue packets once the_virtio_vsock is
set, but they won't be processed until vsock->tx_run is set to true. We
queue vsock->send_pkt_work when initialization finishes to send those
packets queued earlier.

Fixes: 0deab087b16a ("vsock/virtio: use RCU to avoid use-after-free on 
the_virtio_vsock")
Signed-off-by: Alexandru Matei 
---
v4:
- moved queue_work for send_pkt_work in vqs_start and added comment explaining 
why
v3:
- renamed vqs_fill to vqs_start and moved tx_run initialization to it
- queued send_pkt_work at the end of initialization to send packets queued 
earlier
v2:
- split virtio_vsock_vqs_init in vqs_init and vqs_fill and moved
 the_virtio_vsock initialization after vqs_init

net/vmw_vsock/virtio_transport.c | 18 +-
1 file changed, 17 insertions(+), 1 deletion(-)


LGTM!

Reviewed-by: Stefano Garzarella 

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v3] vsock/virtio: initialize the_virtio_vsock before using VQs

2023-10-24 Thread Stefano Garzarella

On Mon, Oct 23, 2023 at 10:22:07PM +0300, Alexandru Matei wrote:

Once VQs are filled with empty buffers and we kick the host, it can send
connection requests. If the_virtio_vsock is not initialized before,
replies are silently dropped and do not reach the host.

virtio_transport_send_pkt() can queue packets once the_virtio_vsock is
set, but they won't be processed until vsock->tx_run is set to true. We
queue vsock->send_pkt_work when initialization finishes to send those
packets queued earlier.

Fixes: 0deab087b16a ("vsock/virtio: use RCU to avoid use-after-free on 
the_virtio_vsock")
Signed-off-by: Alexandru Matei 
---
v3:
- renamed vqs_fill to vqs_start and moved tx_run initialization to it
- queued send_pkt_work at the end of initialization to send packets queued 
earlier
v2:
- split virtio_vsock_vqs_init in vqs_init and vqs_fill and moved
 the_virtio_vsock initialization after vqs_init

net/vmw_vsock/virtio_transport.c | 13 +++--
1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index e95df847176b..c0333f9a8002 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -555,6 +555,11 @@ static int virtio_vsock_vqs_init(struct virtio_vsock 
*vsock)

virtio_device_ready(vdev);

+   return 0;
+}
+
+static void virtio_vsock_vqs_start(struct virtio_vsock *vsock)
+{
mutex_lock(&vsock->tx_lock);
vsock->tx_run = true;
mutex_unlock(&vsock->tx_lock);
@@ -568,8 +573,6 @@ static int virtio_vsock_vqs_init(struct virtio_vsock *vsock)
virtio_vsock_event_fill(vsock);
vsock->event_run = true;
mutex_unlock(&vsock->event_lock);
-
-   return 0;
}

static void virtio_vsock_vqs_del(struct virtio_vsock *vsock)
@@ -664,6 +667,9 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
goto out;

rcu_assign_pointer(the_virtio_vsock, vsock);
+   virtio_vsock_vqs_start(vsock);
+
+   queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);


I would move this call in virtio_vsock_vqs_start() adding also a comment 
on top, bringing back what you wrote in the commit. Something like this:


/* virtio_transport_send_pkt() can queue packets once
 * the_virtio_vsock is set, but they won't be processed until
 * vsock->tx_run is set to true. We queue vsock->send_pkt_work
 * when initialization finishes to send those packets queued
 * earlier.
 */

Just as a consideration, we don't need to queue the other workers (rx, 
event) because as long as we don't fill the queues with empty buffers, 
the host can't send us any notification. (We could add it in the comment 
if you want).


The rest LGTM!

Thanks,
Stefano



mutex_unlock(&the_virtio_vsock_mutex);

@@ -736,6 +742,9 @@ static int virtio_vsock_restore(struct virtio_device *vdev)
goto out;

rcu_assign_pointer(the_virtio_vsock, vsock);
+   virtio_vsock_vqs_start(vsock);
+
+   queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);

out:
mutex_unlock(&the_virtio_vsock_mutex);
--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2] vsock/virtio: initialize the_virtio_vsock before using VQs

2023-10-23 Thread Stefano Garzarella

On Mon, Oct 23, 2023 at 06:36:21PM +0300, Alexandru Matei wrote:

On 10/23/2023 6:13 PM, Stefano Garzarella wrote:

On Mon, Oct 23, 2023 at 05:59:45PM +0300, Alexandru Matei wrote:

On 10/23/2023 5:52 PM, Alexandru Matei wrote:

On 10/23/2023 5:29 PM, Stefano Garzarella wrote:

On Mon, Oct 23, 2023 at 05:08:33PM +0300, Alexandru Matei wrote:

Once VQs are filled with empty buffers and we kick the host,
it can send connection requests.  If 'the_virtio_vsock' is not
initialized before, replies are silently dropped and do not reach the host.

Fixes: 0deab087b16a ("vsock/virtio: use RCU to avoid use-after-free on 
the_virtio_vsock")
Signed-off-by: Alexandru Matei 
---
v2:
- split virtio_vsock_vqs_init in vqs_init and vqs_fill and moved
 the_virtio_vsock initialization after vqs_init

net/vmw_vsock/virtio_transport.c | 9 +++--
1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index e95df847176b..92738d1697c1 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -559,6 +559,11 @@ static int virtio_vsock_vqs_init(struct virtio_vsock 
*vsock)
vsock->tx_run = true;
mutex_unlock(&vsock->tx_lock);

+    return 0;
+}
+
+static void virtio_vsock_vqs_fill(struct virtio_vsock *vsock)


What about renaming this function in virtio_vsock_vqs_start() and move also the 
setting of `tx_run` here?


It works but in this case we also need to move rcu_assign_pointer in 
virtio_vsock_vqs_start(),
the assignment needs to be right after setting tx_run to true and before 
filling the VQs.


Why?

If `rx_run` is false, we shouldn't need to send replies to the host IIUC.

If we need this instead, please add a comment in the code, but also in the 
commit, because it's not clear why.



We need rcu_assign_pointer after setting tx_run to true for connections 
that are initiated from the guest -> host.
virtio_transport_connect() calls virtio_transport_send_pkt().  Once 
'the_virtio_vsock' is initialized, virtio_transport_send_pkt() will 
queue the packet,

but virtio_transport_send_pkt_work() will exit if tx_run is false.


Okay, but in this case we could safely queue &vsock->send_pkt_work after 
finishing initialization to send those packets queued earlier.


In the meantime I'll try to see if we can leave the initialization of 
`the_virtio_vsock` as the ultimate step and maybe go out first in the 
workers if it's not set.


That way just queue all the workers after everything is done and we 
should be fine.








And if we move rcu_assign_pointer then there is no need to split the function 
in two,
We can move rcu_assign_pointer() directly inside virtio_vsock_vqs_init() after 
setting tx_run.


Yep, this could be another option, but we need to change the name of that 
function in this case.



OK, how does virtio_vsock_vqs_setup() sound?


Or virtio_vsock_start() (without vqs)

Stefano




Stefano





Thanks,
Stefano


+{
mutex_lock(&vsock->rx_lock);
virtio_vsock_rx_fill(vsock);
vsock->rx_run = true;
@@ -568,8 +573,6 @@ static int virtio_vsock_vqs_init(struct virtio_vsock *vsock)
virtio_vsock_event_fill(vsock);
vsock->event_run = true;
mutex_unlock(&vsock->event_lock);
-
-    return 0;
}

static void virtio_vsock_vqs_del(struct virtio_vsock *vsock)
@@ -664,6 +667,7 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
    goto out;

rcu_assign_pointer(the_virtio_vsock, vsock);
+    virtio_vsock_vqs_fill(vsock);

mutex_unlock(&the_virtio_vsock_mutex);

@@ -736,6 +740,7 @@ static int virtio_vsock_restore(struct virtio_device *vdev)
    goto out;

rcu_assign_pointer(the_virtio_vsock, vsock);
+    virtio_vsock_vqs_fill(vsock);

out:
mutex_unlock(&the_virtio_vsock_mutex);
-- 
2.34.1











___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2] vsock/virtio: initialize the_virtio_vsock before using VQs

2023-10-23 Thread Stefano Garzarella

On Mon, Oct 23, 2023 at 05:59:45PM +0300, Alexandru Matei wrote:

On 10/23/2023 5:52 PM, Alexandru Matei wrote:

On 10/23/2023 5:29 PM, Stefano Garzarella wrote:

On Mon, Oct 23, 2023 at 05:08:33PM +0300, Alexandru Matei wrote:

Once VQs are filled with empty buffers and we kick the host,
it can send connection requests.  If 'the_virtio_vsock' is not
initialized before, replies are silently dropped and do not reach the host.

Fixes: 0deab087b16a ("vsock/virtio: use RCU to avoid use-after-free on 
the_virtio_vsock")
Signed-off-by: Alexandru Matei 
---
v2:
- split virtio_vsock_vqs_init in vqs_init and vqs_fill and moved
 the_virtio_vsock initialization after vqs_init

net/vmw_vsock/virtio_transport.c | 9 +++--
1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index e95df847176b..92738d1697c1 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -559,6 +559,11 @@ static int virtio_vsock_vqs_init(struct virtio_vsock 
*vsock)
vsock->tx_run = true;
mutex_unlock(&vsock->tx_lock);

+    return 0;
+}
+
+static void virtio_vsock_vqs_fill(struct virtio_vsock *vsock)


What about renaming this function in virtio_vsock_vqs_start() and move also the 
setting of `tx_run` here?


It works but in this case we also need to move rcu_assign_pointer in 
virtio_vsock_vqs_start(),
the assignment needs to be right after setting tx_run to true and before 
filling the VQs.


Why?

If `rx_run` is false, we shouldn't need to send replies to the host 
IIUC.


If we need this instead, please add a comment in the code, but also in 
the commit, because it's not clear why.






And if we move rcu_assign_pointer then there is no need to split the function 
in two,
We can move rcu_assign_pointer() directly inside virtio_vsock_vqs_init() after 
setting tx_run.


Yep, this could be another option, but we need to change the name of 
that function in this case.


Stefano





Thanks,
Stefano


+{
mutex_lock(&vsock->rx_lock);
virtio_vsock_rx_fill(vsock);
vsock->rx_run = true;
@@ -568,8 +573,6 @@ static int virtio_vsock_vqs_init(struct virtio_vsock *vsock)
virtio_vsock_event_fill(vsock);
vsock->event_run = true;
mutex_unlock(&vsock->event_lock);
-
-    return 0;
}

static void virtio_vsock_vqs_del(struct virtio_vsock *vsock)
@@ -664,6 +667,7 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
    goto out;

rcu_assign_pointer(the_virtio_vsock, vsock);
+    virtio_vsock_vqs_fill(vsock);

mutex_unlock(&the_virtio_vsock_mutex);

@@ -736,6 +740,7 @@ static int virtio_vsock_restore(struct virtio_device *vdev)
    goto out;

rcu_assign_pointer(the_virtio_vsock, vsock);
+    virtio_vsock_vqs_fill(vsock);

out:
mutex_unlock(&the_virtio_vsock_mutex);
-- 
2.34.1







___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2] vsock/virtio: initialize the_virtio_vsock before using VQs

2023-10-23 Thread Stefano Garzarella

On Mon, Oct 23, 2023 at 05:08:33PM +0300, Alexandru Matei wrote:

Once VQs are filled with empty buffers and we kick the host,
it can send connection requests.  If 'the_virtio_vsock' is not
initialized before, replies are silently dropped and do not reach the host.

Fixes: 0deab087b16a ("vsock/virtio: use RCU to avoid use-after-free on 
the_virtio_vsock")
Signed-off-by: Alexandru Matei 
---
v2:
- split virtio_vsock_vqs_init in vqs_init and vqs_fill and moved
 the_virtio_vsock initialization after vqs_init

net/vmw_vsock/virtio_transport.c | 9 +++--
1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index e95df847176b..92738d1697c1 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -559,6 +559,11 @@ static int virtio_vsock_vqs_init(struct virtio_vsock 
*vsock)
vsock->tx_run = true;
mutex_unlock(&vsock->tx_lock);

+   return 0;
+}
+
+static void virtio_vsock_vqs_fill(struct virtio_vsock *vsock)


What about renaming this function in virtio_vsock_vqs_start() and move 
also the setting of `tx_run` here?


Thanks,
Stefano


+{
mutex_lock(&vsock->rx_lock);
virtio_vsock_rx_fill(vsock);
vsock->rx_run = true;
@@ -568,8 +573,6 @@ static int virtio_vsock_vqs_init(struct virtio_vsock *vsock)
virtio_vsock_event_fill(vsock);
vsock->event_run = true;
mutex_unlock(&vsock->event_lock);
-
-   return 0;
}

static void virtio_vsock_vqs_del(struct virtio_vsock *vsock)
@@ -664,6 +667,7 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
goto out;

rcu_assign_pointer(the_virtio_vsock, vsock);
+   virtio_vsock_vqs_fill(vsock);

mutex_unlock(&the_virtio_vsock_mutex);

@@ -736,6 +740,7 @@ static int virtio_vsock_restore(struct virtio_device *vdev)
goto out;

rcu_assign_pointer(the_virtio_vsock, vsock);
+   virtio_vsock_vqs_fill(vsock);

out:
mutex_unlock(&the_virtio_vsock_mutex);
--
2.34.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH] vsock: initialize the_virtio_vsock before using VQs

2023-10-20 Thread Stefano Garzarella

On Fri, Oct 20, 2023 at 12:12:04AM +0300, Alexandru Matei wrote:

On 10/19/2023 11:54 AM, Stefano Garzarella wrote:

On Wed, Oct 18, 2023 at 09:32:47PM +0300, Alexandru Matei wrote:

Once VQs are filled with empty buffers and we kick the host, it can send
connection requests. If 'the_virtio_vsock' is not initialized before,
replies are silently dropped and do not reach the host.


Are replies really dropped or we just miss the notification?

Could the reverse now happen, i.e., the guest wants to send a connection 
request, finds the pointer assigned but can't use virtqueues because they 
haven't been initialized yet?

Perhaps to avoid your problem, we could just queue vsock->rx_work at the bottom 
of the probe to see if anything was queued in the meantime.

Nit: please use "vsock/virtio" to point out that this problem is of the virtio 
transport.

Thanks,
Stefano


The replies are dropped; the scenario goes like this:

 Once rx_run is set to true and rx queue is filled with empty buffers, the host 
sends a connection request.


Oh, I see now, I thought virtio_transport_rx_work() returned early if 
'the_virtio_vsock' was not set.



 The request is processed in virtio_transport_recv_pkt(), and since there is no 
bound socket, it calls virtio_transport_reset_no_sock() which tries to send a 
reset packet.
 In virtio_transport_send_pkt() it checks 'the_virtio_vsock' and because it is 
null it exits with -ENODEV, basically dropping the packet.

I looked on your scenario and there is an issue from the moment we set 
the_virtio_vsock (in this patch) up until vsock->tx_run is set to TRUE.
virtio_transport_send_pkt() will queue the packet, but 
virtio_transport_send_pkt_work() will exit because tx_run is FALSE. This could 
be fixed by moving rcu_assign_pointer() after tx_run is set to TRUE.
virtio_transport_cancel_pkt() uses the rx virtqueue once the_virtio_vsock is 
set, so rcu_assign_pointer() should be moved after virtio_find_vqs() is called.

I think the way to go is to split virtio_vsock_vqs_init() in two: 
virtio_vsock_vqs_init() and virtio_vsock_vqs_fill(), as Vadim 
suggested. This should fix all the cases:


Yep, LGTM!

Thank you both for the fix, please send a v2 with this approach!

Stefano



---
net/vmw_vsock/virtio_transport.c | 9 +++--
1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index ad64f403536a..1f95f98ddd3f 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -594,6 +594,11 @@ static int virtio_vsock_vqs_init(struct virtio_vsock 
*vsock)
vsock->tx_run = true;
mutex_unlock(&vsock->tx_lock);

+   return 0;
+}
+
+static void virtio_vsock_vqs_fill(struct virtio_vsock *vsock)
+{
mutex_lock(&vsock->rx_lock);
virtio_vsock_rx_fill(vsock);
vsock->rx_run = true;
@@ -603,8 +608,6 @@ static int virtio_vsock_vqs_init(struct virtio_vsock *vsock)
virtio_vsock_event_fill(vsock);
vsock->event_run = true;
mutex_unlock(&vsock->event_lock);
-
-   return 0;
}

static void virtio_vsock_vqs_del(struct virtio_vsock *vsock)
@@ -707,6 +710,7 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
goto out;

rcu_assign_pointer(the_virtio_vsock, vsock);
+   virtio_vsock_vqs_fill(vsock);

mutex_unlock(&the_virtio_vsock_mutex);

@@ -779,6 +783,7 @@ static int virtio_vsock_restore(struct virtio_device *vdev)
goto out;

rcu_assign_pointer(the_virtio_vsock, vsock);
+   virtio_vsock_vqs_fill(vsock);

out:
mutex_unlock(&the_virtio_vsock_mutex);
--



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC v2 PATCH] vdpa_sim: implement .reset_map support

2023-10-19 Thread Stefano Garzarella

On Wed, Oct 18, 2023 at 04:47:48PM -0700, Si-Wei Liu wrote:



On 10/18/2023 1:05 AM, Stefano Garzarella wrote:

On Tue, Oct 17, 2023 at 10:11:33PM -0700, Si-Wei Liu wrote:

RFC only. Not tested on vdpa-sim-blk with user virtual address.
Works fine with vdpa-sim-net which uses physical address to map.

This patch is based on top of [1].

[1] 
https://lore.kernel.org/virtualization/1696928580-7520-1-git-send-email-si-wei@oracle.com/

Signed-off-by: Si-Wei Liu 

---
RFC v2:
 - initialize iotlb to passthrough mode in device add


I tested this version and I didn't see any issue ;-)

Great, thank you so much for your help on testing my patch, Stefano!


You're welcome :-)

Just for my own interest/curiosity, currently there's no vhost-vdpa 
backend client implemented for vdpa-sim-blk


Yep, we developed libblkio [1]. libblkio exposes common API to access 
block devices in userspace. It supports several drivers.
The one useful for this use case is `virtio-blk-vhost-vdpa`. Here [2] 
some examples on how to use the libblkio test suite with the 
vdpa-sim-blk.


Since QEMU 7.2, it supports libblkio drivers, so you can use the 
following options to attach a vdpa-blk device to a VM:


  -blockdev 
node-name=drive_src1,driver=virtio-blk-vhost-vdpa,path=/dev/vhost-vdpa-0,cache.direct=on
 \
  -device virtio-blk-pci,id=src1,bootindex=2,drive=drive_src1 \

For now only what we called slow-path [3][4] is supported, since the VQs 
are not directly exposed to the guest, but QEMU allocates other VQs 
(similar to shadow VQs for net) to support live-migration and QEMU 
storage features. Fast-path is on the agenda, but on pause for now.


or any vdpa block device in userspace as yet, correct? 


Do you mean with VDUSE?
In this case, yes, qemu-storage-daemon supports it, and can implement a 
virtio-blk in user space, exposing a disk image through VDUSE.


There is an example in libblkio as well [5] on how to start it.

So there was no test specific to vhost-vdpa that needs to be exercised, 
right?




I hope I answered above :-)
This reminded me that I need to write a blog post with all this 
information, I hope to do that soon!


Stefano

[1] https://gitlab.com/libblkio/libblkio
[2] 
https://gitlab.com/libblkio/libblkio/-/blob/main/tests/meson.build?ref_type=heads#L42
[3] 
https://kvmforum2022.sched.com/event/15jK5/qemu-storage-daemon-and-libblkio-exploring-new-shores-for-the-qemu-block-layer-kevin-wolf-stefano-garzarella-red-hat
[4] 
https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-software-offload-for-virtio-blk-stefano-garzarella-red-hat
[5] 
https://gitlab.com/libblkio/libblkio/-/blob/main/tests/meson.build?ref_type=heads#L58

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH] vsock: initialize the_virtio_vsock before using VQs

2023-10-19 Thread Stefano Garzarella

On Wed, Oct 18, 2023 at 09:32:47PM +0300, Alexandru Matei wrote:

Once VQs are filled with empty buffers and we kick the host, it can send
connection requests. If 'the_virtio_vsock' is not initialized before,
replies are silently dropped and do not reach the host.


Are replies really dropped or we just miss the notification?

Could the reverse now happen, i.e., the guest wants to send a connection 
request, finds the pointer assigned but can't use virtqueues because 
they haven't been initialized yet?


Perhaps to avoid your problem, we could just queue vsock->rx_work at the 
bottom of the probe to see if anything was queued in the meantime.


Nit: please use "vsock/virtio" to point out that this problem is of the 
virtio transport.


Thanks,
Stefano



Fixes: 0deab087b16a ("vsock/virtio: use RCU to avoid use-after-free on 
the_virtio_vsock")
Signed-off-by: Alexandru Matei 
---
net/vmw_vsock/virtio_transport.c | 7 ---
1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index e95df847176b..eae0867133f8 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -658,12 +658,13 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
vsock->seqpacket_allow = true;

vdev->priv = vsock;
+   rcu_assign_pointer(the_virtio_vsock, vsock);

ret = virtio_vsock_vqs_init(vsock);
-   if (ret < 0)
+   if (ret < 0) {
+   rcu_assign_pointer(the_virtio_vsock, NULL);
goto out;
-
-   rcu_assign_pointer(the_virtio_vsock, vsock);
+   }

mutex_unlock(&the_virtio_vsock_mutex);

--
2.34.1




___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC v2 PATCH] vdpa_sim: implement .reset_map support

2023-10-18 Thread Stefano Garzarella

On Tue, Oct 17, 2023 at 10:11:33PM -0700, Si-Wei Liu wrote:

RFC only. Not tested on vdpa-sim-blk with user virtual address.
Works fine with vdpa-sim-net which uses physical address to map.

This patch is based on top of [1].

[1] 
https://lore.kernel.org/virtualization/1696928580-7520-1-git-send-email-si-wei@oracle.com/

Signed-off-by: Si-Wei Liu 

---
RFC v2:
 - initialize iotlb to passthrough mode in device add


I tested this version and I didn't see any issue ;-)

Tested-by: Stefano Garzarella 


---
drivers/vdpa/vdpa_sim/vdpa_sim.c | 34 
1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 76d41058add9..2a0a6042d61d 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -151,13 +151,6 @@ static void vdpasim_do_reset(struct vdpasim *vdpasim)
 &vdpasim->iommu_lock);
}

-   for (i = 0; i < vdpasim->dev_attr.nas; i++) {
-   vhost_iotlb_reset(&vdpasim->iommu[i]);
-   vhost_iotlb_add_range(&vdpasim->iommu[i], 0, ULONG_MAX,
- 0, VHOST_MAP_RW);
-   vdpasim->iommu_pt[i] = true;
-   }
-
vdpasim->running = true;
spin_unlock(&vdpasim->iommu_lock);

@@ -259,8 +252,12 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr 
*dev_attr,
if (!vdpasim->iommu_pt)
goto err_iommu;

-   for (i = 0; i < vdpasim->dev_attr.nas; i++)
+   for (i = 0; i < vdpasim->dev_attr.nas; i++) {
vhost_iotlb_init(&vdpasim->iommu[i], max_iotlb_entries, 0);
+   vhost_iotlb_add_range(&vdpasim->iommu[i], 0, ULONG_MAX, 0,
+ VHOST_MAP_RW);
+   vdpasim->iommu_pt[i] = true;
+   }

for (i = 0; i < dev_attr->nvqs; i++)
vringh_set_iotlb(&vdpasim->vqs[i].vring, &vdpasim->iommu[0],
@@ -637,6 +634,25 @@ static int vdpasim_set_map(struct vdpa_device *vdpa, 
unsigned int asid,
return ret;
}

+static int vdpasim_reset_map(struct vdpa_device *vdpa, unsigned int asid)
+{
+   struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+   if (asid >= vdpasim->dev_attr.nas)
+   return -EINVAL;
+
+   spin_lock(&vdpasim->iommu_lock);
+   if (vdpasim->iommu_pt[asid])
+   goto out;
+   vhost_iotlb_reset(&vdpasim->iommu[asid]);
+   vhost_iotlb_add_range(&vdpasim->iommu[asid], 0, ULONG_MAX,
+ 0, VHOST_MAP_RW);
+   vdpasim->iommu_pt[asid] = true;
+out:
+   spin_unlock(&vdpasim->iommu_lock);
+   return 0;
+}
+
static int vdpasim_bind_mm(struct vdpa_device *vdpa, struct mm_struct *mm)
{
struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
@@ -759,6 +775,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = {
.set_group_asid = vdpasim_set_group_asid,
.dma_map= vdpasim_dma_map,
.dma_unmap  = vdpasim_dma_unmap,
+   .reset_map  = vdpasim_reset_map,
.bind_mm= vdpasim_bind_mm,
.unbind_mm  = vdpasim_unbind_mm,
.free   = vdpasim_free,
@@ -796,6 +813,7 @@ static const struct vdpa_config_ops 
vdpasim_batch_config_ops = {
.get_iova_range = vdpasim_get_iova_range,
.set_group_asid = vdpasim_set_group_asid,
.set_map= vdpasim_set_map,
+   .reset_map  = vdpasim_reset_map,
.bind_mm= vdpasim_bind_mm,
.unbind_mm  = vdpasim_unbind_mm,
.free   = vdpasim_free,
--
2.39.3



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH] vdpa_sim: implement .reset_map support

2023-10-17 Thread Stefano Garzarella

On Fri, Oct 13, 2023 at 10:29:26AM -0700, Si-Wei Liu wrote:

Hi Stefano,

On 10/13/2023 2:22 AM, Stefano Garzarella wrote:

Hi Si-Wei,

On Fri, Oct 13, 2023 at 01:23:40AM -0700, Si-Wei Liu wrote:

RFC only. Not tested on vdpa-sim-blk with user virtual address.


I can test it, but what should I stress?
Great, thank you! As you see, my patch moved vhost_iotlb_reset out of 
vdpasim_reset for the sake of decoupling mapping from vdpa device 
reset. For hardware devices this decoupling makes sense as platform 
IOMMU already did it. But I'm not sure if there's something in the 
software device (esp. with vdpa-blk and the userspace library stack) 
that may have to rely on the current .reset behavior that clears the 
vhost_iotlb. So perhaps you can try to exercise every possible case 
involving blk device reset, and see if anything (related to mapping) 
breaks?


I just tried these steps without using a VM and the host kernel hangs
after adding the device:

[root@f38-vm-build ~]# modprobe virtio-vdpa
[root@f38-vm-build ~]# modprobe vdpa-sim-blk
[root@f38-vm-build ~]# vdpa dev add mgmtdev vdpasim_blk name blk0
[   35.284575][  T563] virtio_blk virtio6: 1/0/0 default/read/poll queues
[   35.286372][  T563] virtio_blk virtio6: [vdb] 262144 512-byte logical blocks 
(134 MB/128 MiB)
[   35.295271][  T564] vringh:

Reverting this patch (so building "vdpa/mlx5: implement .reset_map 
driver op") worked here.







Works fine with vdpa-sim-net which uses physical address to map.


Can you share your tests? so I'll try to do the same with blk.
Basically everything involving virtio device reset in the guest, e.g.  
reboot the VM, remove/unbind then reprobe/bind the virtio-net 
module/driver, then see if device I/O (which needs mapping properly) is 
still flowing as expected. And then everything else that could trigger 
QEMU's vhost_dev_start/stop paths ending up as passive vhost-vdpa 
backend reset, for e.g. link status change, suspend/hibernate, SVQ 
switch and live migration. I am not sure if vdpa-blk supports live 
migration through SVQ or not, if not you don't need to worry about.






This patch is based on top of [1].

[1] 
https://lore.kernel.org/virtualization/1696928580-7520-1-git-send-email-si-wei@oracle.com/


The series does not apply well on master or vhost tree.
Where should I apply it?

Sent the link through another email offline.


Received thanks!

Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH] ALSA: virtio: use copy and fill_silence callbacks

2023-10-17 Thread Stefano Garzarella

On Thu, Oct 12, 2023 at 11:16:54AM -0400, Michael S. Tsirkin wrote:

On Thu, Oct 12, 2023 at 05:10:50PM +0200, Matias Ezequiel Vara Larsen wrote:

This commit replaces the mmap mechanism with the copy() and
fill_silence() callbacks for both capturing and playback for the
virtio-sound driver. This change is required to prevent the updating of
the content of a buffer that is already in the available ring.

The current mechanism splits a dma buffer into descriptors that are
exposed to the device. This dma buffer is shared with the user
application. When the device consumes a buffer, the driver moves the
request from the used ring to available ring.

The driver exposes the buffer to the device without knowing if the
content has been updated from the user. The section 2.8.21.1 of the
virtio spec states that: "The device MAY access the descriptor chains
the driver created and the memory they refer to immediately". If the
device picks up buffers from the available ring just after it is
notified, it happens that the content may be old.

By providing the copy() callback, the driver first updates the content
of the buffer, and then, exposes the buffer to the device by enqueuing
it in the available ring. Thus, device always picks up a buffer that is
updated.

For capturing, the driver starts by exposing all the available buffers
to device. After device updates the content of a buffer, it enqueues it
in the used ring. It is only after the copy() for capturing is issued
that the driver re-enqueues the buffer in the available ring.

Note that the copy() function assumes that user is always writing a
period. Testing shows that this is true but I may be wrong. This RFC
aims at clarifying this.

Signed-off-by: Matias Ezequiel Vara Larsen 



Thank you for working on this!


Yep, +1!

@Michael do you think we should cc stable and add a Fixes tag since
the driver is not following the virtio spec?

Or it is too risky?

IIUC snd_pcm_ops is changed a bit from previous versions, so we may have
to adapt the patch for stable branches.

Stefano




---
 sound/virtio/virtio_pcm.c | 11 ++--
 sound/virtio/virtio_pcm.h |  9 +++-
 sound/virtio/virtio_pcm_msg.c | 50 ---
 sound/virtio/virtio_pcm_ops.c | 94 +++
 4 files changed, 137 insertions(+), 27 deletions(-)

diff --git a/sound/virtio/virtio_pcm.c b/sound/virtio/virtio_pcm.c
index c10d91fff2fb..bfe982952303 100644
--- a/sound/virtio/virtio_pcm.c
+++ b/sound/virtio/virtio_pcm.c
@@ -104,8 +104,6 @@ static int virtsnd_pcm_build_hw(struct virtio_pcm_substream 
*vss,
 * only message-based transport.
 */
vss->hw.info =
-   SNDRV_PCM_INFO_MMAP |
-   SNDRV_PCM_INFO_MMAP_VALID |
SNDRV_PCM_INFO_BATCH |
SNDRV_PCM_INFO_BLOCK_TRANSFER |
SNDRV_PCM_INFO_INTERLEAVED |
@@ -471,12 +469,11 @@ int virtsnd_pcm_build_devs(struct virtio_snd *snd)
for (kss = ks->substream; kss; kss = kss->next)
vs->substreams[kss->number]->substream = kss;

-   snd_pcm_set_ops(vpcm->pcm, i, &virtsnd_pcm_ops);
+   if (i == SNDRV_PCM_STREAM_CAPTURE)
+   snd_pcm_set_ops(vpcm->pcm, i, 
&virtsnd_pcm_capture_ops);
+   else
+   snd_pcm_set_ops(vpcm->pcm, i, 
&virtsnd_pcm_playback_ops);
}
-
-   snd_pcm_set_managed_buffer_all(vpcm->pcm,
-  SNDRV_DMA_TYPE_VMALLOC, NULL,
-  0, 0);
}

return 0;
diff --git a/sound/virtio/virtio_pcm.h b/sound/virtio/virtio_pcm.h
index 062eb8e8f2cf..1c1106ec971f 100644
--- a/sound/virtio/virtio_pcm.h
+++ b/sound/virtio/virtio_pcm.h
@@ -50,6 +50,8 @@ struct virtio_pcm_substream {
struct work_struct elapsed_period;
spinlock_t lock;
size_t buffer_bytes;
+   u8 *buffer;
+   size_t buffer_sz;
size_t hw_ptr;
bool xfer_enabled;
bool xfer_xrun;
@@ -90,7 +92,8 @@ struct virtio_pcm {
struct virtio_pcm_stream streams[SNDRV_PCM_STREAM_LAST + 1];
 };

-extern const struct snd_pcm_ops virtsnd_pcm_ops;
+extern const struct snd_pcm_ops virtsnd_pcm_playback_ops;
+extern const struct snd_pcm_ops virtsnd_pcm_capture_ops;

 int virtsnd_pcm_validate(struct virtio_device *vdev);

@@ -117,7 +120,9 @@ int virtsnd_pcm_msg_alloc(struct virtio_pcm_substream *vss,

 void virtsnd_pcm_msg_free(struct virtio_pcm_substream *vss);

-int virtsnd_pcm_msg_send(struct virtio_pcm_substream *vss);
+int virtsnd_pcm_msg_send(struct virtio_pcm_substream *vss, bool single);
+
+int virtsnd_pcm_msg_send_locked(struct virtio_pcm_substream *vss, bool single);

 unsigned int virtsnd_pcm_msg_pending_num(struct virtio_pcm_substream *vss);

diff --git a/sound/virtio/virtio_pcm_msg.c b/sound/virtio/virtio_pcm_msg.c
index aca2dc1989ba..9a5f9814cb62 100644
--- 

Re: [RFC PATCH] vdpa_sim: implement .reset_map support

2023-10-13 Thread Stefano Garzarella

Hi Si-Wei,

On Fri, Oct 13, 2023 at 01:23:40AM -0700, Si-Wei Liu wrote:

RFC only. Not tested on vdpa-sim-blk with user virtual address.


I can test it, but what I should stress?


Works fine with vdpa-sim-net which uses physical address to map.


Can you share your tests? so I'll try to do the same with blk.



This patch is based on top of [1].

[1] 
https://lore.kernel.org/virtualization/1696928580-7520-1-git-send-email-si-wei@oracle.com/


The series does not apply well on master or vhost tree.
Where should I apply it?

If you have a tree with all of them applied, will be easy for me ;-)

Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v4 00/12] vsock/virtio: continue MSG_ZEROCOPY support

2023-10-11 Thread Stefano Garzarella

On Tue, Oct 10, 2023 at 10:15:12PM +0300, Arseniy Krasnov wrote:

Hello,

this patchset contains second and third parts of another big patchset
for MSG_ZEROCOPY flag support:
https://lore.kernel.org/netdev/20230701063947.3422088-1-avkras...@sberdevices.ru/

During review of this series, Stefano Garzarella 
suggested splitting it into three parts to simplify review and merging:

1) virtio and vhost updates (for fragged skbs) (merged to net-next, see
  link below)
2) AF_VSOCK updates (allows to enable MSG_ZEROCOPY mode and read
  tx completions) and update for Documentation/. <-- this patchset
3) Updates for tests and utils. <-- this patchset

Part 1) was merged:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=71b263e79370348349553ecdf46f4a69eb436dc7

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=19537e125cc7cf2da43a606f5bcebbe0c9aea4cc

Link to v1:
https://lore.kernel.org/netdev/20230922052428.4005676-1-avkras...@salutedevices.com/
Link to v2:
https://lore.kernel.org/netdev/20230930210308.2394919-1-avkras...@salutedevices.com/
Link to v3:
https://lore.kernel.org/netdev/20231007172139.1338644-1-avkras...@salutedevices.com/

Changelog:
v1 -> v2:
* Patchset rebased and tested on new HEAD of net-next (see hash above).
* See per-patch changelog after ---.
v2 -> v3:
* Patchset rebased and tested on new HEAD of net-next (see hash above).
* See per-patch changelog after ---.
v3 -> v4:
* Patchset rebased and tested on new HEAD of net-next (see hash above).
* See per-patch changelog after ---.


I think I fully reviewed the series ;-)

Tests are all passing here, including the new ones. I also added
vsock_perf and vsock_uring_test to my test suite!

So for vsock point of view everything looks fine.

Let's see if there is anything about net (MSG_ZEROCOPY flags, etc.)

Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v4 12/12] test/vsock: io_uring rx/tx tests

2023-10-11 Thread Stefano Garzarella

On Tue, Oct 10, 2023 at 10:15:24PM +0300, Arseniy Krasnov wrote:

This adds set of tests which use io_uring for rx/tx. This test suite is
implemented as separated util like 'vsock_test' and has the same set of
input arguments as 'vsock_test'. These tests only cover cases of data
transmission (no connect/bind/accept etc).

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Add 'LDLIBS = -luring' to the target 'vsock_uring_test'.
 * Add 'vsock_uring_test' to the target 'test'.
v2 -> v3:
 * Make 'struct vsock_test_data' private by placing it to the .c file.
   Rename it and add comments to this struct to clarify sense of its
   fields.
 * Add 'vsock_uring_test' to the '.gitignore'.
 * Add receive loop to the server side - this is needed to read entire
   data sent by client.
v3 -> v4:
 * Link with 'msg_zerocopy_common.o'.
 * Use '#ifndef' around '#define PAGE_SIZE 4096'.

tools/testing/vsock/.gitignore |   1 +
tools/testing/vsock/Makefile   |   7 +-
tools/testing/vsock/vsock_uring_test.c | 342 +
3 files changed, 348 insertions(+), 2 deletions(-)
create mode 100644 tools/testing/vsock/vsock_uring_test.c


Reviewed-by: Stefano Garzarella 



diff --git a/tools/testing/vsock/.gitignore b/tools/testing/vsock/.gitignore
index a8adcfdc292b..d9f798713cd7 100644
--- a/tools/testing/vsock/.gitignore
+++ b/tools/testing/vsock/.gitignore
@@ -3,3 +3,4 @@
vsock_test
vsock_diag_test
vsock_perf
+vsock_uring_test
diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index 228470ae33c2..a7f56a09ca9f 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -1,12 +1,15 @@
# SPDX-License-Identifier: GPL-2.0-only
all: test vsock_perf
-test: vsock_test vsock_diag_test
+test: vsock_test vsock_diag_test vsock_uring_test
vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o 
msg_zerocopy_common.o
vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
vsock_perf: vsock_perf.o msg_zerocopy_common.o

+vsock_uring_test: LDLIBS = -luring
+vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o 
msg_zerocopy_common.o
+
CFLAGS += -g -O2 -Werror -Wall -I. -I../../include -I../../../usr/include 
-Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD 
-U_FORTIFY_SOURCE -D_GNU_SOURCE
.PHONY: all test clean
clean:
-   ${RM} *.o *.d vsock_test vsock_diag_test vsock_perf
+   ${RM} *.o *.d vsock_test vsock_diag_test vsock_perf vsock_uring_test
-include *.d
diff --git a/tools/testing/vsock/vsock_uring_test.c 
b/tools/testing/vsock/vsock_uring_test.c
new file mode 100644
index ..d976d35f0ba9
--- /dev/null
+++ b/tools/testing/vsock/vsock_uring_test.c
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* io_uring tests for vsock
+ *
+ * Copyright (C) 2023 SberDevices.
+ *
+ * Author: Arseniy Krasnov 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "util.h"
+#include "control.h"
+#include "msg_zerocopy_common.h"
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE  4096
+#endif
+
+#define RING_ENTRIES_NUM   4
+
+#define VSOCK_TEST_DATA_MAX_IOV 3
+
+struct vsock_io_uring_test {
+   /* Number of valid elements in 'vecs'. */
+   int vecs_cnt;
+   struct iovec vecs[VSOCK_TEST_DATA_MAX_IOV];
+};
+
+static struct vsock_io_uring_test test_data_array[] = {
+   /* All elements have page aligned base and size. */
+   {
+   .vecs_cnt = 3,
+   {
+   { NULL, PAGE_SIZE },
+   { NULL, 2 * PAGE_SIZE },
+   { NULL, 3 * PAGE_SIZE },
+   }
+   },
+   /* Middle element has both non-page aligned base and size. */
+   {
+   .vecs_cnt = 3,
+   {
+   { NULL, PAGE_SIZE },
+   { (void *)1, 200  },
+   { NULL, 3 * PAGE_SIZE },
+   }
+   }
+};
+
+static void vsock_io_uring_client(const struct test_opts *opts,
+ const struct vsock_io_uring_test *test_data,
+ bool msg_zerocopy)
+{
+   struct io_uring_sqe *sqe;
+   struct io_uring_cqe *cqe;
+   struct io_uring ring;
+   struct iovec *iovec;
+   struct msghdr msg;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   if (msg_zerocopy)
+   enable_so_zerocopy(fd);
+
+   iovec = alloc_test_iovec(test_data->vecs, test_data->vecs_cnt);
+
+   if (io_uring_queue_init(RING_ENTRIES_NUM, &ring, 0))
+   error(1, errno, "io_uring_queue_init");
+
+   if (io_uring_register_buffers(&ring, iovec, test_data->vecs_cnt))
+   error(1, errno, "io_ur

Re: [PATCH net-next v4 10/12] test/vsock: MSG_ZEROCOPY flag tests

2023-10-11 Thread Stefano Garzarella

On Tue, Oct 10, 2023 at 10:15:22PM +0300, Arseniy Krasnov wrote:

This adds three tests for MSG_ZEROCOPY feature:
1) SOCK_STREAM tx with different buffers.
2) SOCK_SEQPACKET tx with different buffers.
3) SOCK_STREAM test to read empty error queue of the socket.

Patch also works as preparation for the next patches for tools in this
patchset: vsock_perf and vsock_uring_test:
1) Adds several new functions to util.c - they will be also used by
  vsock_uring_test.
2) Adds two new functions for MSG_ZEROCOPY handling to a new source
  file - such source will be shared between vsock_test, vsock_perf and
  vsock_uring_test, thus avoiding code copy-pasting.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Move 'SOL_VSOCK' and 'VSOCK_RECVERR' from 'util.c' to 'util.h'.
v2 -> v3:
 * Patch was reworked. Now it is also preparation patch (see commit
   message). Shared stuff for 'vsock_perf' and tests is placed to a
   new header file, while shared code between current test tool and
   future uring test is placed to the 'util.c'. I think, that making
   this patch as preparation allows to reduce number of changes in the
   next patches in this patchset.
 * Make 'struct vsock_test_data' private by placing it to the .c file.
   Also add comments to this struct to clarify sense of its fields.
v3 -> v4:
 * Move code from 'msg_zerocopy_common.h' to 'msg_zerocopy_common.c'
   to avoid warning about unused functions.
 * Rename 'iovec_from_test_data()' and 'free_iovec_test_data()' to
   'alloc_test_iovec()' and 'free_test_iovec(). Also add comments for
   both functions.
 * Use '#ifndef' around '#define PAGE_SIZE 4096'.

tools/testing/vsock/Makefile  |   2 +-
tools/testing/vsock/msg_zerocopy_common.c |  87 ++
tools/testing/vsock/msg_zerocopy_common.h |  18 ++
tools/testing/vsock/util.c| 133 
tools/testing/vsock/util.h|   5 +
tools/testing/vsock/vsock_test.c  |  16 +
tools/testing/vsock/vsock_test_zerocopy.c | 358 ++
tools/testing/vsock/vsock_test_zerocopy.h |  15 +
8 files changed, 633 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/vsock/msg_zerocopy_common.c
create mode 100644 tools/testing/vsock/msg_zerocopy_common.h
create mode 100644 tools/testing/vsock/vsock_test_zerocopy.c
create mode 100644 tools/testing/vsock/vsock_test_zerocopy.h


Reviewed-by: Stefano Garzarella 



diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index 21a98ba565ab..bb938e4790b5 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
all: test vsock_perf
test: vsock_test vsock_diag_test
-vsock_test: vsock_test.o timeout.o control.o util.o
+vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o 
msg_zerocopy_common.o
vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
vsock_perf: vsock_perf.o

diff --git a/tools/testing/vsock/msg_zerocopy_common.c 
b/tools/testing/vsock/msg_zerocopy_common.c
new file mode 100644
index ..5a4bdf7b5132
--- /dev/null
+++ b/tools/testing/vsock/msg_zerocopy_common.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Some common code for MSG_ZEROCOPY logic
+ *
+ * Copyright (C) 2023 SberDevices.
+ *
+ * Author: Arseniy Krasnov 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "msg_zerocopy_common.h"
+
+void enable_so_zerocopy(int fd)
+{
+   int val = 1;
+
+   if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &val, sizeof(val))) {
+   perror("setsockopt");
+   exit(EXIT_FAILURE);
+   }
+}
+
+void vsock_recv_completion(int fd, const bool *zerocopied)
+{
+   struct sock_extended_err *serr;
+   struct msghdr msg = { 0 };
+   char cmsg_data[128];
+   struct cmsghdr *cm;
+   ssize_t res;
+
+   msg.msg_control = cmsg_data;
+   msg.msg_controllen = sizeof(cmsg_data);
+
+   res = recvmsg(fd, &msg, MSG_ERRQUEUE);
+   if (res) {
+   fprintf(stderr, "failed to read error queue: %zi\n", res);
+   exit(EXIT_FAILURE);
+   }
+
+   cm = CMSG_FIRSTHDR(&msg);
+   if (!cm) {
+   fprintf(stderr, "cmsg: no cmsg\n");
+   exit(EXIT_FAILURE);
+   }
+
+   if (cm->cmsg_level != SOL_VSOCK) {
+   fprintf(stderr, "cmsg: unexpected 'cmsg_level'\n");
+   exit(EXIT_FAILURE);
+   }
+
+   if (cm->cmsg_type != VSOCK_RECVERR) {
+   fprintf(stderr, "cmsg: unexpected 'cmsg_type'\n");
+   exit(EXIT_FAILURE);
+   }
+
+   serr = (void *)CMSG_DATA(cm);
+   if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
+   fprintf(stderr, "serr: wrong origin: %u\n", serr->ee_origin);
+   exit(EXIT_FAILURE);
+   }
+
+   if (serr->ee_errno) {
+   fpri

Re: [PATCH net-next v4 02/12] vsock: read from socket's error queue

2023-10-11 Thread Stefano Garzarella

On Tue, Oct 10, 2023 at 10:15:14PM +0300, Arseniy Krasnov wrote:

This adds handling of MSG_ERRQUEUE input flag in receive call. This flag
is used to read socket's error queue instead of data queue. Possible
scenario of error queue usage is receiving completions for transmission
with MSG_ZEROCOPY flag. This patch also adds new defines: 'SOL_VSOCK'
and 'VSOCK_RECVERR'.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Place new defines for userspace to the existing file 'vm_sockets.h'
   instead of creating new one.
v2 -> v3:
 * Add comments to describe 'SOL_VSOCK' and 'VSOCK_RECVERR' in the file
   'vm_sockets.h'.
 * Reorder includes in 'af_vsock.c' in alphabetical order.
v3 -> v4:
 * Update comments for 'SOL_VSOCK' and 'VSOCK_RECVERR' by adding more
   details.

include/linux/socket.h  |  1 +
include/uapi/linux/vm_sockets.h | 17 +
net/vmw_vsock/af_vsock.c|  6 ++
3 files changed, 24 insertions(+)


Reviewed-by: Stefano Garzarella 

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v3 10/12] test/vsock: MSG_ZEROCOPY flag tests

2023-10-10 Thread Stefano Garzarella

On Mon, Oct 09, 2023 at 11:24:18PM +0300, Arseniy Krasnov wrote:



On 09.10.2023 18:17, Stefano Garzarella wrote:

On Sat, Oct 07, 2023 at 08:21:37PM +0300, Arseniy Krasnov wrote:

This adds three tests for MSG_ZEROCOPY feature:
1) SOCK_STREAM tx with different buffers.
2) SOCK_SEQPACKET tx with different buffers.
3) SOCK_STREAM test to read empty error queue of the socket.

Patch also works as preparation for the next patches for tools in this
patchset: vsock_perf and vsock_uring_test:
1) Adds several new functions to util.c - they will be also used by
  vsock_uring_test.
2) Adds two new functions for MSG_ZEROCOPY handling to a new header
  file - such header will be shared between vsock_test, vsock_perf and
  vsock_uring_test, thus avoiding code copy-pasting.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Move 'SOL_VSOCK' and 'VSOCK_RECVERR' from 'util.c' to 'util.h'.
v2 -> v3:
 * Patch was reworked. Now it is also preparation patch (see commit
   message). Shared stuff for 'vsock_perf' and tests is placed to a
   new header file, while shared code between current test tool and
   future uring test is placed to the 'util.c'. I think, that making
   this patch as preparation allows to reduce number of changes in the
   next patches in this patchset.
 * Make 'struct vsock_test_data' private by placing it to the .c file.
   Also add comments to this struct to clarify sense of its fields.

tools/testing/vsock/Makefile  |   2 +-
tools/testing/vsock/msg_zerocopy_common.h |  92 ++
tools/testing/vsock/util.c    | 110 +++
tools/testing/vsock/util.h    |   5 +
tools/testing/vsock/vsock_test.c  |  16 +
tools/testing/vsock/vsock_test_zerocopy.c | 367 ++
tools/testing/vsock/vsock_test_zerocopy.h |  15 +
7 files changed, 606 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/vsock/msg_zerocopy_common.h
create mode 100644 tools/testing/vsock/vsock_test_zerocopy.c
create mode 100644 tools/testing/vsock/vsock_test_zerocopy.h

diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index 21a98ba565ab..1a26f60a596c 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
all: test vsock_perf
test: vsock_test vsock_diag_test
-vsock_test: vsock_test.o timeout.o control.o util.o
+vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o
vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
vsock_perf: vsock_perf.o

diff --git a/tools/testing/vsock/msg_zerocopy_common.h 
b/tools/testing/vsock/msg_zerocopy_common.h
new file mode 100644
index ..ce89f1281584
--- /dev/null
+++ b/tools/testing/vsock/msg_zerocopy_common.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef MSG_ZEROCOPY_COMMON_H
+#define MSG_ZEROCOPY_COMMON_H
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifndef SOL_VSOCK
+#define SOL_VSOCK    287
+#endif
+
+#ifndef VSOCK_RECVERR
+#define VSOCK_RECVERR    1
+#endif
+
+static void enable_so_zerocopy(int fd)
+{
+    int val = 1;
+
+    if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, , sizeof(val))) {
+    perror("setsockopt");
+    exit(EXIT_FAILURE);
+    }
+}
+
+static void vsock_recv_completion(int fd, const bool *zerocopied) 
__maybe_unused;


To avoid this, maybe we can implement those functions in .c file and
link the object.

WDYT?

Ah, here (cc (GCC) 13.2.1 20230728 (Red Hat 13.2.1-1)) the build is
failing:

In file included from vsock_perf.c:23:
msg_zerocopy_common.h: In function ‘vsock_recv_completion’:
msg_zerocopy_common.h:29:67: error: expected declaration specifiers before 
‘__maybe_unused’
   29 | static void vsock_recv_completion(int fd, const bool *zerocopied) 
__maybe_unused;
  |   
^~
msg_zerocopy_common.h:31:1: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or 
‘__attribute__’ before ‘{’ token
   31 | {
  | ^


+static void vsock_recv_completion(int fd, const bool *zerocopied)
+{
+    struct sock_extended_err *serr;
+    struct msghdr msg = { 0 };
+    char cmsg_data[128];
+    struct cmsghdr *cm;
+    ssize_t res;
+
+    msg.msg_control = cmsg_data;
+    msg.msg_controllen = sizeof(cmsg_data);
+
+    res = recvmsg(fd, , MSG_ERRQUEUE);
+    if (res) {
+    fprintf(stderr, "failed to read error queue: %zi\n", res);
+    exit(EXIT_FAILURE);
+    }
+
+    cm = CMSG_FIRSTHDR();
+    if (!cm) {
+    fprintf(stderr, "cmsg: no cmsg\n");
+    exit(EXIT_FAILURE);
+    }
+
+    if (cm->cmsg_level != SOL_VSOCK) {
+    fprintf(stderr, "cmsg: unexpected 'cmsg_level'\n");
+    exit(EXIT_FAILURE);
+    }
+
+    if (cm->cmsg_type != VSOCK_RECVERR) {
+    fprintf(stderr, "cmsg: unexpected 'cmsg_type'\n");
+    exit(EXIT_FAILURE);
+    }
+
+    serr = (void *)CMSG_DATA(cm);
+    if (serr-

Re: [PATCH net-next v3 11/12] test/vsock: MSG_ZEROCOPY support for vsock_perf

2023-10-09 Thread Stefano Garzarella

On Sat, Oct 07, 2023 at 08:21:38PM +0300, Arseniy Krasnov wrote:

To use this option pass '--zerocopy' parameter:

./vsock_perf --zerocopy --sender  ...

With this option MSG_ZEROCOPY flag will be passed to the 'send()' call.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Move 'SOL_VSOCK' and 'VSOCK_RECVERR' from 'util.c' to 'util.h'.
v2 -> v3:
 * Use 'msg_zerocopy_common.h' for MSG_ZEROCOPY related things.
 * Rename '--zc' option to '--zerocopy'.
 * Add detail in help that zerocopy mode is for sender mode only.

tools/testing/vsock/vsock_perf.c | 80 
1 file changed, 71 insertions(+), 9 deletions(-)


Reviewed-by: Stefano Garzarella 

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v3 10/12] test/vsock: MSG_ZEROCOPY flag tests

2023-10-09 Thread Stefano Garzarella

On Sat, Oct 07, 2023 at 08:21:37PM +0300, Arseniy Krasnov wrote:

This adds three tests for MSG_ZEROCOPY feature:
1) SOCK_STREAM tx with different buffers.
2) SOCK_SEQPACKET tx with different buffers.
3) SOCK_STREAM test to read empty error queue of the socket.

Patch also works as preparation for the next patches for tools in this
patchset: vsock_perf and vsock_uring_test:
1) Adds several new functions to util.c - they will be also used by
  vsock_uring_test.
2) Adds two new functions for MSG_ZEROCOPY handling to a new header
  file - such header will be shared between vsock_test, vsock_perf and
  vsock_uring_test, thus avoiding code copy-pasting.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Move 'SOL_VSOCK' and 'VSOCK_RECVERR' from 'util.c' to 'util.h'.
v2 -> v3:
 * Patch was reworked. Now it is also preparation patch (see commit
   message). Shared stuff for 'vsock_perf' and tests is placed to a
   new header file, while shared code between current test tool and
   future uring test is placed to the 'util.c'. I think, that making
   this patch as preparation allows to reduce number of changes in the
   next patches in this patchset.
 * Make 'struct vsock_test_data' private by placing it to the .c file.
   Also add comments to this struct to clarify sense of its fields.

tools/testing/vsock/Makefile  |   2 +-
tools/testing/vsock/msg_zerocopy_common.h |  92 ++
tools/testing/vsock/util.c| 110 +++
tools/testing/vsock/util.h|   5 +
tools/testing/vsock/vsock_test.c  |  16 +
tools/testing/vsock/vsock_test_zerocopy.c | 367 ++
tools/testing/vsock/vsock_test_zerocopy.h |  15 +
7 files changed, 606 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/vsock/msg_zerocopy_common.h
create mode 100644 tools/testing/vsock/vsock_test_zerocopy.c
create mode 100644 tools/testing/vsock/vsock_test_zerocopy.h

diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index 21a98ba565ab..1a26f60a596c 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
all: test vsock_perf
test: vsock_test vsock_diag_test
-vsock_test: vsock_test.o timeout.o control.o util.o
+vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o
vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
vsock_perf: vsock_perf.o

diff --git a/tools/testing/vsock/msg_zerocopy_common.h 
b/tools/testing/vsock/msg_zerocopy_common.h
new file mode 100644
index ..ce89f1281584
--- /dev/null
+++ b/tools/testing/vsock/msg_zerocopy_common.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef MSG_ZEROCOPY_COMMON_H
+#define MSG_ZEROCOPY_COMMON_H
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifndef SOL_VSOCK
+#define SOL_VSOCK  287
+#endif
+
+#ifndef VSOCK_RECVERR
+#define VSOCK_RECVERR  1
+#endif
+
+static void enable_so_zerocopy(int fd)
+{
+   int val = 1;
+
+   if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &val, sizeof(val))) {
+   perror("setsockopt");
+   exit(EXIT_FAILURE);
+   }
+}
+
+static void vsock_recv_completion(int fd, const bool *zerocopied) 
__maybe_unused;


To avoid this, maybe we can implement those functions in .c file and
link the object.

WDYT?

Ah, here (cc (GCC) 13.2.1 20230728 (Red Hat 13.2.1-1)) the build is
failing:

In file included from vsock_perf.c:23:
msg_zerocopy_common.h: In function ‘vsock_recv_completion’:
msg_zerocopy_common.h:29:67: error: expected declaration specifiers before 
‘__maybe_unused’
   29 | static void vsock_recv_completion(int fd, const bool *zerocopied) 
__maybe_unused;
  |   
^~
msg_zerocopy_common.h:31:1: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or 
‘__attribute__’ before ‘{’ token
   31 | {
  | ^


+static void vsock_recv_completion(int fd, const bool *zerocopied)
+{
+   struct sock_extended_err *serr;
+   struct msghdr msg = { 0 };
+   char cmsg_data[128];
+   struct cmsghdr *cm;
+   ssize_t res;
+
+   msg.msg_control = cmsg_data;
+   msg.msg_controllen = sizeof(cmsg_data);
+
+   res = recvmsg(fd, &msg, MSG_ERRQUEUE);
+   if (res) {
+   fprintf(stderr, "failed to read error queue: %zi\n", res);
+   exit(EXIT_FAILURE);
+   }
+
+   cm = CMSG_FIRSTHDR(&msg);
+   if (!cm) {
+   fprintf(stderr, "cmsg: no cmsg\n");
+   exit(EXIT_FAILURE);
+   }
+
+   if (cm->cmsg_level != SOL_VSOCK) {
+   fprintf(stderr, "cmsg: unexpected 'cmsg_level'\n");
+   exit(EXIT_FAILURE);
+   }
+
+   if (cm->cmsg_type != VSOCK_RECVERR) {
+   fprintf(stderr, "cmsg: unexpected 'cmsg_type'\n");
+   exit(EXIT_FAILURE);
+   }
+
+   serr = (void *)CMSG_DATA(cm);
+   if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
+  

Re: [PATCH net-next v3 02/12] vsock: read from socket's error queue

2023-10-09 Thread Stefano Garzarella

On Sat, Oct 07, 2023 at 08:21:29PM +0300, Arseniy Krasnov wrote:

This adds handling of MSG_ERRQUEUE input flag in receive call. This flag
is used to read socket's error queue instead of data queue. Possible
scenario of error queue usage is receiving completions for transmission
with MSG_ZEROCOPY flag. This patch also adds new defines: 'SOL_VSOCK'
and 'VSOCK_RECVERR'.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Place new defines for userspace to the existing file 'vm_sockets.h'
   instead of creating new one.
v2 -> v3:
 * Add comments to describe 'SOL_VSOCK' and 'VSOCK_RECVERR' in the file
   'vm_sockets.h'.
 * Reorder includes in 'af_vsock.c' in alphabetical order.

include/linux/socket.h  |  1 +
include/uapi/linux/vm_sockets.h | 12 
net/vmw_vsock/af_vsock.c|  6 ++
3 files changed, 19 insertions(+)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 39b74d83c7c4..cfcb7e2c3813 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -383,6 +383,7 @@ struct ucred {
#define SOL_MPTCP   284
#define SOL_MCTP 285
#define SOL_SMC 286
+#define SOL_VSOCK  287

/* IPX options */
#define IPX_TYPE 1
diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h
index c60ca33eac59..d9d703b2d45a 100644
--- a/include/uapi/linux/vm_sockets.h
+++ b/include/uapi/linux/vm_sockets.h
@@ -191,4 +191,16 @@ struct sockaddr_vm {

#define IOCTL_VM_SOCKETS_GET_LOCAL_CID  _IO(7, 0xb9)

+/* For reading completion in case of MSG_ZEROCOPY flag transmission.
+ * This is value of 'cmsg_level' field of the 'struct cmsghdr'.
+ */
+
+#define SOL_VSOCK  287
+
+/* For reading completion in case of MSG_ZEROCOPY flag transmission.
+ * This is value of 'cmsg_type' field of the 'struct cmsghdr'.
+ */
+
+#define VSOCK_RECVERR  1


I would suggest a bit more context here, something like this:

diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h
index d9d703b2d45a..ed07181d4eff 100644
--- a/include/uapi/linux/vm_sockets.h
+++ b/include/uapi/linux/vm_sockets.h
@@ -191,14 +191,19 @@ struct sockaddr_vm {

 #define IOCTL_VM_SOCKETS_GET_LOCAL_CID _IO(7, 0xb9)

-/* For reading completion in case of MSG_ZEROCOPY flag transmission.
- * This is value of 'cmsg_level' field of the 'struct cmsghdr'.
+/* MSG_ZEROCOPY notifications are encoded in the standard error format,
+ * sock_extended_err. See Documentation/networking/msg_zerocopy.rst in
+ * kernel source tree for more details.
+ */
+
+/* 'cmsg_level' field value of 'struct cmsghdr' for notification parsing
+ * when MSG_ZEROCOPY flag is used on transmissions.
  */

 #define SOL_VSOCK  287

-/* For reading completion in case of MSG_ZEROCOPY flag transmission.
- * This is value of 'cmsg_type' field of the 'struct cmsghdr'.
+/* 'cmsg_type' field value of 'struct cmsghdr' for notification parsing
+ * when MSG_ZEROCOPY flag is used on transmissions.
  */

 #define VSOCK_RECVERR  1

The rest LGTM.

Stefano


+
#endif /* _UAPI_VM_SOCKETS_H */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index d841f4de33b0..38486efd3d05 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -89,6 +89,7 @@
#include 
#include 
#include 
+#include 
#include 
#include 
#include 
@@ -110,6 +111,7 @@
#include 
#include 
#include 
+#include 

static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
static void vsock_sk_destruct(struct sock *sk);
@@ -2137,6 +2139,10 @@ vsock_connectible_recvmsg(struct socket *sock, struct 
msghdr *msg, size_t len,
int err;

sk = sock->sk;
+
+   if (unlikely(flags & MSG_ERRQUEUE))
+   return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 
VSOCK_RECVERR);
+
vsk = vsock_sk(sk);
err = 0;

--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v3 12/12] test/vsock: io_uring rx/tx tests

2023-10-09 Thread Stefano Garzarella

On Sat, Oct 07, 2023 at 08:21:39PM +0300, Arseniy Krasnov wrote:

This adds set of tests which use io_uring for rx/tx. This test suite is
implemented as separated util like 'vsock_test' and has the same set of
input arguments as 'vsock_test'. These tests only cover cases of data
transmission (no connect/bind/accept etc).

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Add 'LDLIBS = -luring' to the target 'vsock_uring_test'.
 * Add 'vsock_uring_test' to the target 'test'.
v2 -> v3:
 * Make 'struct vsock_test_data' private by placing it to the .c file.
   Rename it and add comments to this struct to clarify sense of its
   fields.
 * Add 'vsock_uring_test' to the '.gitignore'.
 * Add receive loop to the server side - this is needed to read entire
   data sent by client.

tools/testing/vsock/.gitignore |   1 +
tools/testing/vsock/Makefile   |   7 +-
tools/testing/vsock/vsock_uring_test.c | 350 +
3 files changed, 356 insertions(+), 2 deletions(-)
create mode 100644 tools/testing/vsock/vsock_uring_test.c

diff --git a/tools/testing/vsock/.gitignore b/tools/testing/vsock/.gitignore
index a8adcfdc292b..d9f798713cd7 100644
--- a/tools/testing/vsock/.gitignore
+++ b/tools/testing/vsock/.gitignore
@@ -3,3 +3,4 @@
vsock_test
vsock_diag_test
vsock_perf
+vsock_uring_test
diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index 1a26f60a596c..b80e7c7def1e 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -1,12 +1,15 @@
# SPDX-License-Identifier: GPL-2.0-only
all: test vsock_perf
-test: vsock_test vsock_diag_test
+test: vsock_test vsock_diag_test vsock_uring_test
vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o
vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
vsock_perf: vsock_perf.o

+vsock_uring_test: LDLIBS = -luring
+vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o
+
CFLAGS += -g -O2 -Werror -Wall -I. -I../../include -I../../../usr/include 
-Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD 
-U_FORTIFY_SOURCE -D_GNU_SOURCE
.PHONY: all test clean
clean:
-   ${RM} *.o *.d vsock_test vsock_diag_test vsock_perf
+   ${RM} *.o *.d vsock_test vsock_diag_test vsock_perf vsock_uring_test
-include *.d
diff --git a/tools/testing/vsock/vsock_uring_test.c 
b/tools/testing/vsock/vsock_uring_test.c
new file mode 100644
index ..889887cf3989
--- /dev/null
+++ b/tools/testing/vsock/vsock_uring_test.c
@@ -0,0 +1,350 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* io_uring tests for vsock
+ *
+ * Copyright (C) 2023 SberDevices.
+ *
+ * Author: Arseniy Krasnov 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "util.h"
+#include "control.h"
+#include "msg_zerocopy_common.h"
+
+#define PAGE_SIZE  4096


Ditto.


+#define RING_ENTRIES_NUM   4
+
+#define VSOCK_TEST_DATA_MAX_IOV 3
+
+struct vsock_io_uring_test {
+   /* Number of valid elements in 'vecs'. */
+   int vecs_cnt;
+   /* Array how to allocate buffers for test.
+* 'iov_base' == NULL -> valid buf: mmap('iov_len').
+*
+* 'iov_base' == MAP_FAILED -> invalid buf:
+*   mmap('iov_len'), then munmap('iov_len').
+*   'iov_base' still contains result of
+*   mmap().
+*
+* 'iov_base' == number -> unaligned valid buf:
+*   mmap('iov_len') + number.
+*/
+   struct iovec vecs[VSOCK_TEST_DATA_MAX_IOV];
+};
+
+static struct vsock_io_uring_test test_data_array[] = {
+   /* All elements have page aligned base and size. */
+   {
+   .vecs_cnt = 3,
+   {
+   { NULL, PAGE_SIZE },
+   { NULL, 2 * PAGE_SIZE },
+   { NULL, 3 * PAGE_SIZE },
+   }
+   },
+   /* Middle element has both non-page aligned base and size. */
+   {
+   .vecs_cnt = 3,
+   {
+   { NULL, PAGE_SIZE },
+   { (void *)1, 200  },
+   { NULL, 3 * PAGE_SIZE },
+   }
+   }
+};
+
+static void vsock_io_uring_client(const struct test_opts *opts,
+ const struct vsock_io_uring_test *test_data,
+ bool msg_zerocopy)
+{
+   struct io_uring_sqe *sqe;
+   struct io_uring_cqe *cqe;
+   struct io_uring ring;
+   struct iovec *iovec;
+   struct msghdr msg;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   if (msg_zerocopy)
+   enable_so_zerocopy(fd);
+
+   iovec = iovec_from_test_data(test_data->vecs, test_data->vecs_cnt);


Ah, I see this is used also here, so now I get why in 

Re: [PATCH net-next v2 00/12] vsock/virtio: continue MSG_ZEROCOPY support

2023-10-04 Thread Stefano Garzarella

On Wed, Oct 04, 2023 at 07:22:04PM +0300, Arseniy Krasnov wrote:



On 04.10.2023 08:25, Arseniy Krasnov wrote:



On 03.10.2023 19:26, Stefano Garzarella wrote:

Hi Arseniy,

On Sun, Oct 01, 2023 at 12:02:56AM +0300, Arseniy Krasnov wrote:

Hello,

this patchset contains second and third parts of another big patchset
for MSG_ZEROCOPY flag support:
https://lore.kernel.org/netdev/20230701063947.3422088-1-avkras...@sberdevices.ru/

During review of this series, Stefano Garzarella
suggested splitting it into three parts to simplify review and merging:

1) virtio and vhost updates (for fragged skbs) (merged to net-next, see
  link below)
2) AF_VSOCK updates (allows to enable MSG_ZEROCOPY mode and read
  tx completions) and update for Documentation/. <-- this patchset
3) Updates for tests and utils. <-- this patchset

Part 1) was merged:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=71b263e79370348349553ecdf46f4a69eb436dc7

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=236f3873b517acfaf949c23bb2d5dec13bfd2da2

Link to v1:
https://lore.kernel.org/netdev/20230922052428.4005676-1-avkras...@salutedevices.com/

Changelog:
v1 -> v2:
* Patchset rebased and tested on new HEAD of net-next (see hash above).
* See per-patch changelog after ---.


Thanks for this new version.
I started to include vsock_uring_test in my test suite and tests are
going well.

I reviewed code patches, I still need to review the tests.
I'll do that by the end of the week, but they look good!


Thanks for review! Ok, I'll wait for tests review, and then send next
version.


Got your comments from review. I'll update patches by:
1) Trying to avoid touching util.c/util.h


I mean, we can touch it ;-) but for this case it looks like we don't
need most of those functions to be there.

At least for now. If we need them to be used in more places, then it
makes sense.


2) Add new header with functions shared between util vsock_perf and
tests


We can do this also later in another PR as cleanup if you prefer.

Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v2 12/12] test/vsock: io_uring rx/tx tests

2023-10-04 Thread Stefano Garzarella

On Sun, Oct 01, 2023 at 12:03:08AM +0300, Arseniy Krasnov wrote:

This adds set of tests which use io_uring for rx/tx. This test suite is
implemented as separated util like 'vsock_test' and has the same set of
input arguments as 'vsock_test'. These tests only cover cases of data
transmission (no connect/bind/accept etc).

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Add 'LDLIBS = -luring' to the target 'vsock_uring_test'.
 * Add 'vsock_uring_test' to the target 'test'.

tools/testing/vsock/Makefile   |   7 +-
tools/testing/vsock/vsock_uring_test.c | 321 +
2 files changed, 326 insertions(+), 2 deletions(-)
create mode 100644 tools/testing/vsock/vsock_uring_test.c

diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index 1a26f60a596c..b80e7c7def1e 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -1,12 +1,15 @@
# SPDX-License-Identifier: GPL-2.0-only
all: test vsock_perf
-test: vsock_test vsock_diag_test
+test: vsock_test vsock_diag_test vsock_uring_test
vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o
vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
vsock_perf: vsock_perf.o

+vsock_uring_test: LDLIBS = -luring
+vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o
+
CFLAGS += -g -O2 -Werror -Wall -I. -I../../include -I../../../usr/include 
-Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD 
-U_FORTIFY_SOURCE -D_GNU_SOURCE
.PHONY: all test clean
clean:
-   ${RM} *.o *.d vsock_test vsock_diag_test vsock_perf
+   ${RM} *.o *.d vsock_test vsock_diag_test vsock_perf vsock_uring_test
-include *.d
diff --git a/tools/testing/vsock/vsock_uring_test.c 
b/tools/testing/vsock/vsock_uring_test.c
new file mode 100644
index ..725895350697
--- /dev/null
+++ b/tools/testing/vsock/vsock_uring_test.c
@@ -0,0 +1,321 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* io_uring tests for vsock
+ *
+ * Copyright (C) 2023 SberDevices.
+ *
+ * Author: Arseniy Krasnov 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "util.h"
+#include "control.h"
+
+#define PAGE_SIZE  4096
+#define RING_ENTRIES_NUM   4
+
+static struct vsock_test_data test_data_array[] = {


Ah, I see vsock_test_data is used here, but we are using a subset
of fields that are not exposed outside of this file.

So, let's define a custom struct in this file for this
(e.g. struct vsock_io_uring_tests)

The rest LGTM!


+   /* All elements have page aligned base and size. */
+   {
+   .vecs_cnt = 3,
+   {
+   { NULL, PAGE_SIZE },
+   { NULL, 2 * PAGE_SIZE },
+   { NULL, 3 * PAGE_SIZE },
+   }
+   },
+   /* Middle element has both non-page aligned base and size. */
+   {
+   .vecs_cnt = 3,
+   {
+   { NULL, PAGE_SIZE },
+   { (void *)1, 200  },
+   { NULL, 3 * PAGE_SIZE },
+   }
+   }
+};
+
+static void vsock_io_uring_client(const struct test_opts *opts,
+ const struct vsock_test_data *test_data,
+ bool msg_zerocopy)
+{
+   struct io_uring_sqe *sqe;
+   struct io_uring_cqe *cqe;
+   struct io_uring ring;
+   struct iovec *iovec;
+   struct msghdr msg;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   if (msg_zerocopy)
+   enable_so_zerocopy(fd);
+
+   iovec = iovec_from_test_data(test_data);
+
+   if (io_uring_queue_init(RING_ENTRIES_NUM, &ring, 0))
+   error(1, errno, "io_uring_queue_init");
+
+   if (io_uring_register_buffers(&ring, iovec, test_data->vecs_cnt))
+   error(1, errno, "io_uring_register_buffers");
+
+   memset(&msg, 0, sizeof(msg));
+   msg.msg_iov = iovec;
+   msg.msg_iovlen = test_data->vecs_cnt;
+   sqe = io_uring_get_sqe(&ring);
+
+   if (msg_zerocopy)
+   io_uring_prep_sendmsg_zc(sqe, fd, &msg, 0);
+   else
+   io_uring_prep_sendmsg(sqe, fd, &msg, 0);
+
+   if (io_uring_submit(&ring) != 1)
+   error(1, errno, "io_uring_submit");
+
+   if (io_uring_wait_cqe(&ring, &cqe))
+   error(1, errno, "io_uring_wait_cqe");
+
+   io_uring_cqe_seen(&ring, cqe);
+
+   control_writeulong(iovec_hash_djb2(iovec, test_data->vecs_cnt));
+
+   control_writeln("DONE");
+   io_uring_queue_exit(&ring);
+   free_iovec_test_data(test_data, iovec);
+   close(fd);
+}
+
+static void vsock_io_uring_server(const struct test_opts *opts,
+ const struct vsock_test_data *test_data)
+{
+   unsigned long remote_hash;
+   unsigned long local_hash;
+ 

Re: [PATCH net-next v2 11/12] test/vsock: MSG_ZEROCOPY support for vsock_perf

2023-10-04 Thread Stefano Garzarella

On Sun, Oct 01, 2023 at 12:03:07AM +0300, Arseniy Krasnov wrote:

To use this option pass '--zc' parameter:


--zerocopy would be better IMHO



./vsock_perf --zc --sender <cid> --port <port> --bytes <bytes to send>

With this option MSG_ZEROCOPY flag will be passed to the 'send()' call.

Signed-off-by: Arseniy Krasnov 
---
tools/testing/vsock/vsock_perf.c | 143 +--
1 file changed, 134 insertions(+), 9 deletions(-)

diff --git a/tools/testing/vsock/vsock_perf.c b/tools/testing/vsock/vsock_perf.c
index a72520338f84..f0f183f3f9e8 100644
--- a/tools/testing/vsock/vsock_perf.c
+++ b/tools/testing/vsock/vsock_perf.c
@@ -18,6 +18,8 @@
#include 
#include 
#include 
+#include 
+#include 

#define DEFAULT_BUF_SIZE_BYTES  (128 * 1024)
#define DEFAULT_TO_SEND_BYTES   (64 * 1024)
@@ -28,9 +30,18 @@
#define BYTES_PER_GB (1024 * 1024 * 1024ULL)
#define NSEC_PER_SEC (1000000000ULL)

+#ifndef SOL_VSOCK
+#define SOL_VSOCK  287
+#endif
+
+#ifndef VSOCK_RECVERR
+#define VSOCK_RECVERR  1
+#endif
+
static unsigned int port = DEFAULT_PORT;
static unsigned long buf_size_bytes = DEFAULT_BUF_SIZE_BYTES;
static unsigned long vsock_buf_bytes = DEFAULT_VSOCK_BUF_BYTES;
+static bool zerocopy;

static void error(const char *s)
{
@@ -247,15 +258,76 @@ static void run_receiver(unsigned long rcvlowat_bytes)
close(fd);
}

+static void recv_completion(int fd)
+{
+   struct sock_extended_err *serr;
+   char cmsg_data[128];
+   struct cmsghdr *cm;
+   struct msghdr msg = { 0 };
+   ssize_t ret;
+
+   msg.msg_control = cmsg_data;
+   msg.msg_controllen = sizeof(cmsg_data);
+
+   ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
+   if (ret) {
+   fprintf(stderr, "recvmsg: failed to read err: %zi\n", ret);
+   return;
+   }
+
+   cm = CMSG_FIRSTHDR(&msg);
+   if (!cm) {
+   fprintf(stderr, "cmsg: no cmsg\n");
+   return;
+   }
+
+   if (cm->cmsg_level != SOL_VSOCK) {
+   fprintf(stderr, "cmsg: unexpected 'cmsg_level'\n");
+   return;
+   }
+
+   if (cm->cmsg_type != VSOCK_RECVERR) {
+   fprintf(stderr, "cmsg: unexpected 'cmsg_type'\n");
+   return;
+   }
+
+   serr = (void *)CMSG_DATA(cm);
+   if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
+   fprintf(stderr, "serr: wrong origin\n");
+   return;
+   }
+
+   if (serr->ee_errno) {
+   fprintf(stderr, "serr: wrong error code\n");
+   return;
+   }
+
+   if (zerocopy && (serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED))
+   fprintf(stderr, "warning: copy instead of zerocopy\n");
+}
+
+static void enable_so_zerocopy(int fd)
+{
+   int val = 1;
+
+   if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &val, sizeof(val)))
+   error("setsockopt(SO_ZEROCOPY)");
+}


We use enable_so_zerocopy() in a single place, maybe we can put this
code there.

Anyway, it seems we are copying & pasting some code from util, etc.

Would it make sense to create a new header to use in both tests and perf?



+
static void run_sender(int peer_cid, unsigned long to_send_bytes)
{
time_t tx_begin_ns;
time_t tx_total_ns;
size_t total_send;
+   time_t time_in_send;
void *data;
int fd;

-   printf("Run as sender\n");
+   if (zerocopy)
+   printf("Run as sender MSG_ZEROCOPY\n");
+   else
+   printf("Run as sender\n");
+
printf("Connect to %i:%u\n", peer_cid, port);
printf("Send %lu bytes\n", to_send_bytes);
printf("TX buffer %lu bytes\n", buf_size_bytes);
@@ -265,38 +337,82 @@ static void run_sender(int peer_cid, unsigned long 
to_send_bytes)
if (fd < 0)
exit(EXIT_FAILURE);

-   data = malloc(buf_size_bytes);
+   if (zerocopy) {
+   enable_so_zerocopy(fd);

-   if (!data) {
-   fprintf(stderr, "'malloc()' failed\n");
-   exit(EXIT_FAILURE);
+   data = mmap(NULL, buf_size_bytes, PROT_READ | PROT_WRITE,
+   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+   if (data == MAP_FAILED) {
+   perror("mmap");
+   exit(EXIT_FAILURE);
+   }
+   } else {
+   data = malloc(buf_size_bytes);
+
+   if (!data) {
+   fprintf(stderr, "'malloc()' failed\n");
+   exit(EXIT_FAILURE);
+   }
}

memset(data, 0, buf_size_bytes);
total_send = 0;
+   time_in_send = 0;
tx_begin_ns = current_nsec();

while (total_send < to_send_bytes) {
ssize_t sent;
+   size_t rest_bytes;
+   time_t before;

-   sent = write(fd, data, buf_size_bytes);
+   rest_bytes = to_send_bytes - total_send;
+
+   before = current_nsec();
+   sent = send(fd, data, 

Re: [PATCH net-next v2 10/12] test/vsock: MSG_ZEROCOPY flag tests

2023-10-04 Thread Stefano Garzarella

On Sun, Oct 01, 2023 at 12:03:06AM +0300, Arseniy Krasnov wrote:

This adds three tests for MSG_ZEROCOPY feature:
1) SOCK_STREAM tx with different buffers.
2) SOCK_SEQPACKET tx with different buffers.
3) SOCK_STREAM test to read empty error queue of the socket.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Move 'SOL_VSOCK' and 'VSOCK_RECVERR' from 'util.c' to 'util.h'.

tools/testing/vsock/Makefile  |   2 +-
tools/testing/vsock/util.c| 214 +++
tools/testing/vsock/util.h|  27 ++
tools/testing/vsock/vsock_test.c  |  16 ++
tools/testing/vsock/vsock_test_zerocopy.c | 314 ++
tools/testing/vsock/vsock_test_zerocopy.h |  15 ++
6 files changed, 587 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/vsock/vsock_test_zerocopy.c
create mode 100644 tools/testing/vsock/vsock_test_zerocopy.h

diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index 21a98ba565ab..1a26f60a596c 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
all: test vsock_perf
test: vsock_test vsock_diag_test
-vsock_test: vsock_test.o timeout.o control.o util.o
+vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o
vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
vsock_perf: vsock_perf.o

diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index 6779d5008b27..2a641ab38f08 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -11,10 +11,14 @@
#include 
#include 
#include 
+#include 
#include 
#include 
#include 
#include 
+#include 
+#include 
+#include 

#include "timeout.h"
#include "control.h"
@@ -444,3 +448,213 @@ unsigned long hash_djb2(const void *data, size_t len)

return hash;
}
+
+void enable_so_zerocopy(int fd)
+{
+   int val = 1;
+
+   if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &val, sizeof(val))) {
+   perror("setsockopt");
+   exit(EXIT_FAILURE);
+   }
+}
+
+static void *mmap_no_fail(size_t bytes)
+{
+   void *res;
+
+   res = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
+  MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
+   if (res == MAP_FAILED) {
+   perror("mmap");
+   exit(EXIT_FAILURE);
+   }
+
+   return res;
+}
+
+size_t iovec_bytes(const struct iovec *iov, size_t iovnum)
+{
+   size_t bytes;
+   int i;
+
+   for (bytes = 0, i = 0; i < iovnum; i++)
+   bytes += iov[i].iov_len;
+
+   return bytes;
+}
+
+static void iovec_random_init(struct iovec *iov,
+ const struct vsock_test_data *test_data)
+{
+   int i;
+
+   for (i = 0; i < test_data->vecs_cnt; i++) {
+   int j;
+
+   if (test_data->vecs[i].iov_base == MAP_FAILED)
+   continue;
+
+   for (j = 0; j < iov[i].iov_len; j++)
+   ((uint8_t *)iov[i].iov_base)[j] = rand() & 0xff;
+   }
+}
+
+unsigned long iovec_hash_djb2(struct iovec *iov, size_t iovnum)
+{
+   unsigned long hash;
+   size_t iov_bytes;
+   size_t offs;
+   void *tmp;
+   int i;
+
+   iov_bytes = iovec_bytes(iov, iovnum);
+
+   tmp = malloc(iov_bytes);
+   if (!tmp) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   for (offs = 0, i = 0; i < iovnum; i++) {
+   memcpy(tmp + offs, iov[i].iov_base, iov[i].iov_len);
+   offs += iov[i].iov_len;
+   }
+
+   hash = hash_djb2(tmp, iov_bytes);
+   free(tmp);
+
+   return hash;
+}
+
+struct iovec *iovec_from_test_data(const struct vsock_test_data *test_data)
+{
+   const struct iovec *test_iovec;
+   struct iovec *iovec;
+   int i;
+
+   iovec = malloc(sizeof(*iovec) * test_data->vecs_cnt);
+   if (!iovec) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   test_iovec = test_data->vecs;
+
+   for (i = 0; i < test_data->vecs_cnt; i++) {
+   iovec[i].iov_len = test_iovec[i].iov_len;
+   iovec[i].iov_base = mmap_no_fail(test_iovec[i].iov_len);
+
+   if (test_iovec[i].iov_base != MAP_FAILED &&
+   test_iovec[i].iov_base)
+   iovec[i].iov_base += (uintptr_t)test_iovec[i].iov_base;
+   }
+
+   /* Unmap "invalid" elements. */
+   for (i = 0; i < test_data->vecs_cnt; i++) {
+   if (test_iovec[i].iov_base == MAP_FAILED) {
+   if (munmap(iovec[i].iov_base, iovec[i].iov_len)) {
+   perror("munmap");
+   exit(EXIT_FAILURE);
+   }
+   }
+   }
+
+   iovec_random_init(iovec, test_data);
+
+   return iovec;
+}
+
+void free_iovec_test_data(const struct vsock_test_data *test_data,
+   

Re: [PATCH net-next v2 00/12] vsock/virtio: continue MSG_ZEROCOPY support

2023-10-03 Thread Stefano Garzarella

Hi Arseniy,

On Sun, Oct 01, 2023 at 12:02:56AM +0300, Arseniy Krasnov wrote:

Hello,

this patchset contains second and third parts of another big patchset
for MSG_ZEROCOPY flag support:
https://lore.kernel.org/netdev/20230701063947.3422088-1-avkras...@sberdevices.ru/

During review of this series, Stefano Garzarella
suggested splitting it into three parts to simplify review and merging:

1) virtio and vhost updates (for fragged skbs) (merged to net-next, see
  link below)
2) AF_VSOCK updates (allows to enable MSG_ZEROCOPY mode and read
  tx completions) and update for Documentation/. <-- this patchset
3) Updates for tests and utils. <-- this patchset

Part 1) was merged:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=71b263e79370348349553ecdf46f4a69eb436dc7

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=236f3873b517acfaf949c23bb2d5dec13bfd2da2

Link to v1:
https://lore.kernel.org/netdev/20230922052428.4005676-1-avkras...@salutedevices.com/

Changelog:
v1 -> v2:
* Patchset rebased and tested on new HEAD of net-next (see hash above).
* See per-patch changelog after ---.


Thanks for this new version.
I started to include vsock_uring_test in my test suite and tests are
going well.

I reviewed code patches, I still need to review the tests.
I'll do that by the end of the week, but they look good!

Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v2 10/12] test/vsock: MSG_ZEROCOPY flag tests

2023-10-03 Thread Stefano Garzarella

On Sun, Oct 01, 2023 at 12:03:06AM +0300, Arseniy Krasnov wrote:

This adds three tests for MSG_ZEROCOPY feature:
1) SOCK_STREAM tx with different buffers.
2) SOCK_SEQPACKET tx with different buffers.
3) SOCK_STREAM test to read empty error queue of the socket.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Move 'SOL_VSOCK' and 'VSOCK_RECVERR' from 'util.c' to 'util.h'.

tools/testing/vsock/Makefile  |   2 +-
tools/testing/vsock/util.c| 214 +++
tools/testing/vsock/util.h|  27 ++
tools/testing/vsock/vsock_test.c  |  16 ++
tools/testing/vsock/vsock_test_zerocopy.c | 314 ++
tools/testing/vsock/vsock_test_zerocopy.h |  15 ++
6 files changed, 587 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/vsock/vsock_test_zerocopy.c
create mode 100644 tools/testing/vsock/vsock_test_zerocopy.h

diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index 21a98ba565ab..1a26f60a596c 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
all: test vsock_perf
test: vsock_test vsock_diag_test
-vsock_test: vsock_test.o timeout.o control.o util.o
+vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o
vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
vsock_perf: vsock_perf.o

diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index 6779d5008b27..2a641ab38f08 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -11,10 +11,14 @@
#include 
#include 
#include 
+#include 
#include 
#include 
#include 
#include 
+#include 
+#include 
+#include 

#include "timeout.h"
#include "control.h"
@@ -444,3 +448,213 @@ unsigned long hash_djb2(const void *data, size_t len)

return hash;
}
+
+void enable_so_zerocopy(int fd)
+{
+   int val = 1;
+
+   if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &val, sizeof(val))) {
+   perror("setsockopt");
+   exit(EXIT_FAILURE);
+   }
+}
+
+static void *mmap_no_fail(size_t bytes)
+{
+   void *res;
+
+   res = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
+  MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
+   if (res == MAP_FAILED) {
+   perror("mmap");
+   exit(EXIT_FAILURE);
+   }
+
+   return res;
+}
+
+size_t iovec_bytes(const struct iovec *iov, size_t iovnum)
+{
+   size_t bytes;
+   int i;
+
+   for (bytes = 0, i = 0; i < iovnum; i++)
+   bytes += iov[i].iov_len;
+
+   return bytes;
+}
+
+static void iovec_random_init(struct iovec *iov,
+ const struct vsock_test_data *test_data)
+{
+   int i;
+
+   for (i = 0; i < test_data->vecs_cnt; i++) {
+   int j;
+
+   if (test_data->vecs[i].iov_base == MAP_FAILED)
+   continue;
+
+   for (j = 0; j < iov[i].iov_len; j++)
+   ((uint8_t *)iov[i].iov_base)[j] = rand() & 0xff;
+   }
+}
+
+unsigned long iovec_hash_djb2(struct iovec *iov, size_t iovnum)
+{
+   unsigned long hash;
+   size_t iov_bytes;
+   size_t offs;
+   void *tmp;
+   int i;
+
+   iov_bytes = iovec_bytes(iov, iovnum);
+
+   tmp = malloc(iov_bytes);
+   if (!tmp) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   for (offs = 0, i = 0; i < iovnum; i++) {
+   memcpy(tmp + offs, iov[i].iov_base, iov[i].iov_len);
+   offs += iov[i].iov_len;
+   }
+
+   hash = hash_djb2(tmp, iov_bytes);
+   free(tmp);
+
+   return hash;
+}
+
+struct iovec *iovec_from_test_data(const struct vsock_test_data *test_data)
+{
+   const struct iovec *test_iovec;
+   struct iovec *iovec;
+   int i;
+
+   iovec = malloc(sizeof(*iovec) * test_data->vecs_cnt);
+   if (!iovec) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   test_iovec = test_data->vecs;
+
+   for (i = 0; i < test_data->vecs_cnt; i++) {
+   iovec[i].iov_len = test_iovec[i].iov_len;
+   iovec[i].iov_base = mmap_no_fail(test_iovec[i].iov_len);
+
+   if (test_iovec[i].iov_base != MAP_FAILED &&
+   test_iovec[i].iov_base)
+   iovec[i].iov_base += (uintptr_t)test_iovec[i].iov_base;
+   }
+
+   /* Unmap "invalid" elements. */
+   for (i = 0; i < test_data->vecs_cnt; i++) {
+   if (test_iovec[i].iov_base == MAP_FAILED) {
+   if (munmap(iovec[i].iov_base, iovec[i].iov_len)) {
+   perror("munmap");
+   exit(EXIT_FAILURE);
+   }
+   }
+   }
+
+   iovec_random_init(iovec, test_data);
+
+   return iovec;
+}
+
+void free_iovec_test_data(const struct vsock_test_data *test_data,
+   

Re: [PATCH net-next v2 09/12] docs: net: description of MSG_ZEROCOPY for AF_VSOCK

2023-10-03 Thread Stefano Garzarella

On Sun, Oct 01, 2023 at 12:03:05AM +0300, Arseniy Krasnov wrote:

This adds description of MSG_ZEROCOPY flag support for AF_VSOCK type of
socket.

Signed-off-by: Arseniy Krasnov 
---
Documentation/networking/msg_zerocopy.rst | 13 +++--
1 file changed, 11 insertions(+), 2 deletions(-)


Reviewed-by: Stefano Garzarella 



diff --git a/Documentation/networking/msg_zerocopy.rst 
b/Documentation/networking/msg_zerocopy.rst
index b3ea96af9b49..78fb70e748b7 100644
--- a/Documentation/networking/msg_zerocopy.rst
+++ b/Documentation/networking/msg_zerocopy.rst
@@ -7,7 +7,8 @@ Intro
=

The MSG_ZEROCOPY flag enables copy avoidance for socket send calls.
-The feature is currently implemented for TCP and UDP sockets.
+The feature is currently implemented for TCP, UDP and VSOCK (with
+virtio transport) sockets.


Opportunity and Caveats
@@ -174,7 +175,9 @@ read_notification() call in the previous snippet. A 
notification
is encoded in the standard error format, sock_extended_err.

The level and type fields in the control data are protocol family
-specific, IP_RECVERR or IPV6_RECVERR.
+specific, IP_RECVERR or IPV6_RECVERR (for TCP or UDP socket).
+For VSOCK socket, cmsg_level will be SOL_VSOCK and cmsg_type will be
+VSOCK_RECVERR.

Error origin is the new type SO_EE_ORIGIN_ZEROCOPY. ee_errno is zero,
as explained before, to avoid blocking read and write system calls on
@@ -235,12 +238,15 @@ Implementation
Loopback


+For TCP and UDP:
Data sent to local sockets can be queued indefinitely if the receive
process does not read its socket. Unbound notification latency is not
acceptable. For this reason all packets generated with MSG_ZEROCOPY
that are looped to a local socket will incur a deferred copy. This
includes looping onto packet sockets (e.g., tcpdump) and tun devices.

+For VSOCK:
+Data path sent to local sockets is the same as for non-local sockets.

Testing
===
@@ -254,3 +260,6 @@ instance when run with msg_zerocopy.sh between a veth pair 
across
namespaces, the test will not show any improvement. For testing, the
loopback restriction can be temporarily relaxed by making
skb_orphan_frags_rx identical to skb_orphan_frags.
+
+For VSOCK type of socket example can be found in
+tools/testing/vsock/vsock_test_zerocopy.c.
--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v2 08/12] vsock: enable setting SO_ZEROCOPY

2023-10-03 Thread Stefano Garzarella

On Sun, Oct 01, 2023 at 12:03:04AM +0300, Arseniy Krasnov wrote:

For AF_VSOCK, zerocopy tx mode depends on transport, so this option must
be set in AF_VSOCK implementation where transport is accessible (if
transport is not set during setting SO_ZEROCOPY: for example socket is
not connected, then SO_ZEROCOPY will be enabled, but once transport will
be assigned, support of this type of transmission will be checked).

To handle SO_ZEROCOPY, AF_VSOCK implementation uses SOCK_CUSTOM_SOCKOPT
bit, thus handling SOL_SOCKET option operations, but all of them except
SO_ZEROCOPY will be forwarded to the generic handler by calling
'sock_setsockopt()'.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Place 'sock_valbool_flag()' in a single line.

net/vmw_vsock/af_vsock.c | 45 ++--
1 file changed, 43 insertions(+), 2 deletions(-)


Reviewed-by: Stefano Garzarella 



diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index ff44bab05191..a84f242466cf 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1406,8 +1406,16 @@ static int vsock_connect(struct socket *sock, struct 
sockaddr *addr,
goto out;
}

-   if (vsock_msgzerocopy_allow(transport))
+   if (vsock_msgzerocopy_allow(transport)) {
set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+   } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
+   /* If this option was set before 'connect()',
+* when transport was unknown, check that this
+* feature is supported here.
+*/
+   err = -EOPNOTSUPP;
+   goto out;
+   }

err = vsock_auto_bind(vsk);
if (err)
@@ -1643,7 +1651,7 @@ static int vsock_connectible_setsockopt(struct socket 
*sock,
const struct vsock_transport *transport;
u64 val;

-   if (level != AF_VSOCK)
+   if (level != AF_VSOCK && level != SOL_SOCKET)
return -ENOPROTOOPT;

#define COPY_IN(_v)   \
@@ -1666,6 +1674,33 @@ static int vsock_connectible_setsockopt(struct socket 
*sock,

transport = vsk->transport;

+   if (level == SOL_SOCKET) {
+   int zerocopy;
+
+   if (optname != SO_ZEROCOPY) {
+   release_sock(sk);
+   return sock_setsockopt(sock, level, optname, optval, 
optlen);
+   }
+
+   /* Use 'int' type here, because variable to
+* set this option usually has this type.
+*/
+   COPY_IN(zerocopy);
+
+   if (zerocopy < 0 || zerocopy > 1) {
+   err = -EINVAL;
+   goto exit;
+   }
+
+   if (transport && !vsock_msgzerocopy_allow(transport)) {
+   err = -EOPNOTSUPP;
+   goto exit;
+   }
+
+   sock_valbool_flag(sk, SOCK_ZEROCOPY, zerocopy);
+   goto exit;
+   }
+
switch (optname) {
case SO_VM_SOCKETS_BUFFER_SIZE:
COPY_IN(val);
@@ -2322,6 +2357,12 @@ static int vsock_create(struct net *net, struct socket 
*sock,
}
}

+   /* SOCK_DGRAM doesn't have 'setsockopt' callback set in its
+* proto_ops, so there is no handler for custom logic.
+*/
+   if (sock_type_connectible(sock->type))
+   set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
+
vsock_insert_unbound(vsk);

return 0;
--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v2 02/12] vsock: read from socket's error queue

2023-10-03 Thread Stefano Garzarella

On Sun, Oct 01, 2023 at 12:02:58AM +0300, Arseniy Krasnov wrote:

This adds handling of MSG_ERRQUEUE input flag in receive call. This flag
is used to read socket's error queue instead of data queue. Possible
scenario of error queue usage is receiving completions for transmission
with MSG_ZEROCOPY flag. This patch also adds new defines: 'SOL_VSOCK'
and 'VSOCK_RECVERR'.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Place new defines for userspace to the existing file 'vm_sockets.h'
   instead of creating new one.

include/linux/socket.h  | 1 +
include/uapi/linux/vm_sockets.h | 4 
net/vmw_vsock/af_vsock.c| 6 ++
3 files changed, 11 insertions(+)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 39b74d83c7c4..cfcb7e2c3813 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -383,6 +383,7 @@ struct ucred {
#define SOL_MPTCP   284
#define SOL_MCTP285
#define SOL_SMC 286
+#define SOL_VSOCK  287

/* IPX options */
#define IPX_TYPE1
diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h
index c60ca33eac59..b1a66c1a7054 100644
--- a/include/uapi/linux/vm_sockets.h
+++ b/include/uapi/linux/vm_sockets.h
@@ -191,4 +191,8 @@ struct sockaddr_vm {

#define IOCTL_VM_SOCKETS_GET_LOCAL_CID  _IO(7, 0xb9)

+#define SOL_VSOCK  287
+
+#define VSOCK_RECVERR  1


Please add good documentation for both of them. This is a header
exposed to user space.


+
#endif /* _UAPI_VM_SOCKETS_H */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index d841f4de33b0..0365382beab6 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -110,6 +110,8 @@
#include 
#include 
#include 
+#include 
+#include 


Let's keep the alphabetic order as it was before this change.

`net/af_vsock.h` already includes the `uapi/linux/vm_sockets.h`,
and we also use several defines from it in this file, so you can also
skip it.

On the other hand, it would be better to directly include the headers that
we use, so it's also okay to keep it. As you prefer.



static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
static void vsock_sk_destruct(struct sock *sk);
@@ -2137,6 +2139,10 @@ vsock_connectible_recvmsg(struct socket *sock, struct 
msghdr *msg, size_t len,
int err;

sk = sock->sk;
+
+   if (unlikely(flags & MSG_ERRQUEUE))
+   return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 
VSOCK_RECVERR);
+
vsk = vsock_sk(sk);
err = 0;

--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v2 01/12] vsock: set EPOLLERR on non-empty error queue

2023-10-03 Thread Stefano Garzarella

On Sun, Oct 01, 2023 at 12:02:57AM +0300, Arseniy Krasnov wrote:

If socket's error queue is not empty, EPOLLERR must be set. Otherwise,
reader of error queue won't detect data in it using EPOLLERR bit.
Currently for AF_VSOCK this is actual only with MSG_ZEROCOPY, as this
feature is the only user of an error queue of the socket.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Update commit message by removing 'fix' word.


Reviewed-by: Stefano Garzarella 



net/vmw_vsock/af_vsock.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 013b65241b65..d841f4de33b0 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1030,7 +1030,7 @@ static __poll_t vsock_poll(struct file *file, struct 
socket *sock,
poll_wait(file, sk_sleep(sk), wait);
mask = 0;

-   if (sk->sk_err)
+   if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
/* Signify that there has been an error on this socket. */
mask |= EPOLLERR;

--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v1 12/12] test/vsock: io_uring rx/tx tests

2023-09-27 Thread Stefano Garzarella

On Tue, Sep 26, 2023 at 11:00:19PM +0300, Arseniy Krasnov wrote:



On 26.09.2023 16:04, Stefano Garzarella wrote:

On Fri, Sep 22, 2023 at 08:24:28AM +0300, Arseniy Krasnov wrote:

This adds set of tests which use io_uring for rx/tx. This test suite is
implemented as separated util like 'vsock_test' and has the same set of
input arguments as 'vsock_test'. These tests only cover cases of data
transmission (no connect/bind/accept etc).

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v5(big patchset) -> v1:
 * Use LDLIBS instead of LDFLAGS.

tools/testing/vsock/Makefile   |   7 +-
tools/testing/vsock/vsock_uring_test.c | 321 +
2 files changed, 327 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/vsock/vsock_uring_test.c

diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index 1a26f60a596c..c84380bfc18d 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -1,12 +1,17 @@
# SPDX-License-Identifier: GPL-2.0-only
+ifeq ($(MAKECMDGOALS),vsock_uring_test)
+LDLIBS = -luring
+endif
+


This will fail if, for example, we call make with multiple targets,
e.g. `make vsock_test vsock_uring_test`.

I'd suggest to use something like this:

--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -1,13 +1,11 @@
 # SPDX-License-Identifier: GPL-2.0-only
-ifeq ($(MAKECMDGOALS),vsock_uring_test)
-LDLIBS = -luring
-endif
-
 all: test vsock_perf
 test: vsock_test vsock_diag_test
 vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o
 vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
 vsock_perf: vsock_perf.o
+
+vsock_uring_test: LDLIBS = -luring
 vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o

 CFLAGS += -g -O2 -Werror -Wall -I. -I../../include -I../../../usr/include 
-Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD 
-U_FORTIFY_SOURCE -D_GNU_SOURCE


all: test vsock_perf
test: vsock_test vsock_diag_test
vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o
vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
vsock_perf: vsock_perf.o
+vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o


Should we add this new test to the "test" target as well?


Ok, but in this case, this target will always depend on liburing.


I think it's fine.

If they want to run all the tests, they need liburing. If they don't
want to build io_uring tests, they can just do `make vsock_test`.

Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v1 08/12] vsock: enable setting SO_ZEROCOPY

2023-09-27 Thread Stefano Garzarella

On Tue, Sep 26, 2023 at 10:38:06PM +0300, Arseniy Krasnov wrote:



On 26.09.2023 15:56, Stefano Garzarella wrote:

On Fri, Sep 22, 2023 at 08:24:24AM +0300, Arseniy Krasnov wrote:

For AF_VSOCK, zerocopy tx mode depends on transport, so this option must
be set in AF_VSOCK implementation where transport is accessible (if
transport is not set during setting SO_ZEROCOPY: for example socket is
not connected, then SO_ZEROCOPY will be enabled, but once transport will
be assigned, support of this type of transmission will be checked).

To handle SO_ZEROCOPY, AF_VSOCK implementation uses SOCK_CUSTOM_SOCKOPT
bit, thus handling SOL_SOCKET option operations, but all of them except
SO_ZEROCOPY will be forwarded to the generic handler by calling
'sock_setsockopt()'.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v5(big patchset) -> v1:
 * Compact 'if' conditions.
 * Rename 'zc_val' to 'zerocopy'.
 * Use 'zerocopy' value directly in 'sock_valbool_flag()', without
   ?: operator.
 * Set 'SOCK_CUSTOM_SOCKOPT' bit for connectible sockets only, as
   suggested by Bobby Eshleman .

net/vmw_vsock/af_vsock.c | 46 ++--
1 file changed, 44 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 482300eb88e0..c05a42e02a17 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1406,8 +1406,16 @@ static int vsock_connect(struct socket *sock, struct 
sockaddr *addr,
    goto out;
    }

-    if (vsock_msgzerocopy_allow(transport))
+    if (vsock_msgzerocopy_allow(transport)) {
    set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+    } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
+    /* If this option was set before 'connect()',
+ * when transport was unknown, check that this
+ * feature is supported here.
+ */
+    err = -EOPNOTSUPP;
+    goto out;
+    }

    err = vsock_auto_bind(vsk);
    if (err)
@@ -1643,7 +1651,7 @@ static int vsock_connectible_setsockopt(struct socket 
*sock,
const struct vsock_transport *transport;
u64 val;

-    if (level != AF_VSOCK)
+    if (level != AF_VSOCK && level != SOL_SOCKET)
    return -ENOPROTOOPT;

#define COPY_IN(_v)   \
@@ -1666,6 +1674,34 @@ static int vsock_connectible_setsockopt(struct socket 
*sock,

transport = vsk->transport;

+    if (level == SOL_SOCKET) {
+    int zerocopy;
+
+    if (optname != SO_ZEROCOPY) {
+    release_sock(sk);
+    return sock_setsockopt(sock, level, optname, optval, optlen);
+    }
+
+    /* Use 'int' type here, because variable to
+ * set this option usually has this type.
+ */
+    COPY_IN(zerocopy);
+
+    if (zerocopy < 0 || zerocopy > 1) {
+    err = -EINVAL;
+    goto exit;
+    }
+
+    if (transport && !vsock_msgzerocopy_allow(transport)) {
+    err = -EOPNOTSUPP;
+    goto exit;
+    }
+
+    sock_valbool_flag(sk, SOCK_ZEROCOPY,
+  zerocopy);


it's not necessary to wrap this call.


Sorry, what do you mean ?


I mean that can be on the same line:

sock_valbool_flag(sk, SOCK_ZEROCOPY, zerocopy);

Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v1 02/12] vsock: read from socket's error queue

2023-09-27 Thread Stefano Garzarella

On Tue, Sep 26, 2023 at 10:36:58PM +0300, Arseniy Krasnov wrote:



On 26.09.2023 15:55, Stefano Garzarella wrote:

On Fri, Sep 22, 2023 at 08:24:18AM +0300, Arseniy Krasnov wrote:

This adds handling of MSG_ERRQUEUE input flag in receive call. This flag
is used to read socket's error queue instead of data queue. Possible
scenario of error queue usage is receiving completions for transmission
with MSG_ZEROCOPY flag. This patch also adds new defines: 'SOL_VSOCK'
and 'VSOCK_RECVERR'.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v5(big patchset) -> v1:
 * R-b tag removed, due to added defines to 'include/uapi/linux/vsock.h'.
   Both 'SOL_VSOCK' and 'VSOCK_RECVERR' are needed by userspace, so
   they were placed to 'include/uapi/linux/vsock.h'. At the same time,
   the same define for 'SOL_VSOCK' was placed to 'include/linux/socket.h'.
   This is needed because this file contains SOL_XXX defines for different
   types of socket, so it prevents situation when another new SOL_XXX
   will use constant 287.

include/linux/socket.h | 1 +
include/uapi/linux/vsock.h | 9 +
net/vmw_vsock/af_vsock.c   | 6 ++
3 files changed, 16 insertions(+)
create mode 100644 include/uapi/linux/vsock.h

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 39b74d83c7c4..cfcb7e2c3813 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -383,6 +383,7 @@ struct ucred {
#define SOL_MPTCP    284
#define SOL_MCTP    285
#define SOL_SMC    286
+#define SOL_VSOCK    287

/* IPX options */
#define IPX_TYPE    1
diff --git a/include/uapi/linux/vsock.h b/include/uapi/linux/vsock.h
new file mode 100644
index ..b25c1347a3b8
--- /dev/null
+++ b/include/uapi/linux/vsock.h


We already have include/uapi/linux/vm_sockets.h

Should we include these changes there instead of creating a new header?


@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_VSOCK_H
+#define _UAPI_LINUX_VSOCK_H
+
+#define SOL_VSOCK    287


Why do we need to re-define this here as well?


The reason for this re-define is that SOL_VSOCK must be exported to userspace, so
I place it in include/uapi/XXX. At the same time, include/linux/socket.h contains
the constants for SOL_XXX, and they go sequentially in this file (i.e. one by one,
each new value is +1 to the previous). So if I add SOL_VSOCK to include/uapi/XXX
only, it is possible that someone will add a new SOL_VERY_NEW_SOCKET == 287 to
include/linux/socket.h in the future. I think it is not good for two SOL_XXX to
have the same value.

For example, SOL_RDS and SOL_TIPC use the same approach - there are two
identical defines:
one in include/uapi/ and another in include/linux/socket.h


Okay, I was confused, I thought socket.h was the uapi one.
If others do the same, it's fine.

But why add a new vsock.h instead of reusing vm_sockets.h?





In that case, should we protect with some guards to avoid double
defines?


May be:

in include/linux/socket.h

#ifndef SOL_VSOCK
#define SOL_VSOCK 287
#endif

But not sure...


Nope, let's follow others definition.

Sorry for the confusion ;-)






+
+#define VSOCK_RECVERR    1
+
+#endif /* _UAPI_LINUX_VSOCK_H */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index d841f4de33b0..4fd11bf34bc7 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -110,6 +110,8 @@
#include 
#include 
#include 
+#include 
+#include 

static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
static void vsock_sk_destruct(struct sock *sk);
@@ -2137,6 +2139,10 @@ vsock_connectible_recvmsg(struct socket *sock, struct 
msghdr *msg, size_t len,
int err;

sk = sock->sk;
+
+    if (unlikely(flags & MSG_ERRQUEUE))
+    return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 
VSOCK_RECVERR);

+
vsk = vsock_sk(sk);
err = 0;

-- 
2.25.1







___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v1 00/12] vsock/virtio: continue MSG_ZEROCOPY support

2023-09-26 Thread Stefano Garzarella

Hi Arseniy,

On Fri, Sep 22, 2023 at 08:24:16AM +0300, Arseniy Krasnov wrote:

Hello,

this patchset contains second and third parts of another big patchset
for MSG_ZEROCOPY flag support:
https://lore.kernel.org/netdev/20230701063947.3422088-1-avkras...@sberdevices.ru/

During review of this series, Stefano Garzarella 
suggested to split it for three parts to simplify review and merging:

1) virtio and vhost updates (for fragged skbs) (merged to net-next, see
  link below)
2) AF_VSOCK updates (allows to enable MSG_ZEROCOPY mode and read
  tx completions) and update for Documentation/. <-- this patchset
3) Updates for tests and utils. <-- this patchset

Part 1) was merged:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=71b263e79370348349553ecdf46f4a69eb436dc7

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=71b263e79370348349553ecdf46f4a69eb436dc7


Thanks for the series.
I did a quick review highlighting some things that need to be changed.

Overall, the series seems to be in good shape. The tests went well.

In the next few days I'll see if I can get a better look at the larger 
patches like the tests, or I'll check in the next version.


Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v1 12/12] test/vsock: io_uring rx/tx tests

2023-09-26 Thread Stefano Garzarella

On Fri, Sep 22, 2023 at 08:24:28AM +0300, Arseniy Krasnov wrote:

This adds set of tests which use io_uring for rx/tx. This test suite is
implemented as separated util like 'vsock_test' and has the same set of
input arguments as 'vsock_test'. These tests only cover cases of data
transmission (no connect/bind/accept etc).

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v5(big patchset) -> v1:
 * Use LDLIBS instead of LDFLAGS.

tools/testing/vsock/Makefile   |   7 +-
tools/testing/vsock/vsock_uring_test.c | 321 +
2 files changed, 327 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/vsock/vsock_uring_test.c

diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index 1a26f60a596c..c84380bfc18d 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -1,12 +1,17 @@
# SPDX-License-Identifier: GPL-2.0-only
+ifeq ($(MAKECMDGOALS),vsock_uring_test)
+LDLIBS = -luring
+endif
+


This will fail if, for example, we call make with multiple targets,
e.g. `make vsock_test vsock_uring_test`.

I'd suggest to use something like this:

--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -1,13 +1,11 @@
 # SPDX-License-Identifier: GPL-2.0-only
-ifeq ($(MAKECMDGOALS),vsock_uring_test)
-LDLIBS = -luring
-endif
-
 all: test vsock_perf
 test: vsock_test vsock_diag_test
 vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o
 vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
 vsock_perf: vsock_perf.o
+
+vsock_uring_test: LDLIBS = -luring
 vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o

 CFLAGS += -g -O2 -Werror -Wall -I. -I../../include -I../../../usr/include 
-Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD 
-U_FORTIFY_SOURCE -D_GNU_SOURCE


all: test vsock_perf
test: vsock_test vsock_diag_test
vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o
vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
vsock_perf: vsock_perf.o
+vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o


Should we add this new test to the "test" target as well?

Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v1 10/12] test/vsock: MSG_ZEROCOPY flag tests

2023-09-26 Thread Stefano Garzarella

On Fri, Sep 22, 2023 at 08:24:26AM +0300, Arseniy Krasnov wrote:

This adds three tests for MSG_ZEROCOPY feature:
1) SOCK_STREAM tx with different buffers.
2) SOCK_SEQPACKET tx with different buffers.
3) SOCK_STREAM test to read empty error queue of the socket.

Signed-off-by: Arseniy Krasnov 
---
tools/testing/vsock/Makefile  |   2 +-
tools/testing/vsock/util.c| 222 +++
tools/testing/vsock/util.h|  19 ++
tools/testing/vsock/vsock_test.c  |  16 ++
tools/testing/vsock/vsock_test_zerocopy.c | 314 ++
tools/testing/vsock/vsock_test_zerocopy.h |  15 ++
6 files changed, 587 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/vsock/vsock_test_zerocopy.c
create mode 100644 tools/testing/vsock/vsock_test_zerocopy.h

diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index 21a98ba565ab..1a26f60a596c 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
all: test vsock_perf
test: vsock_test vsock_diag_test
-vsock_test: vsock_test.o timeout.o control.o util.o
+vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o
vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
vsock_perf: vsock_perf.o

diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index 6779d5008b27..d531dbbfa8ff 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -11,15 +11,27 @@
#include 
#include 
#include 
+#include 
#include 
#include 
#include 
#include 
+#include 
+#include 
+#include 

#include "timeout.h"
#include "control.h"
#include "util.h"

+#ifndef SOL_VSOCK
+#define SOL_VSOCK  287
+#endif
+
+#ifndef VSOCK_RECVERR
+#define VSOCK_RECVERR  1
+#endif


Maybe better to re-define them in util.h where we include vm_sockets.h


+
/* Install signal handlers */
void init_signals(void)
{
@@ -444,3 +456,213 @@ unsigned long hash_djb2(const void *data, size_t len)

return hash;
}
+
+void enable_so_zerocopy(int fd)
+{
+   int val = 1;
+
+   if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &val, sizeof(val))) {
+   perror("setsockopt");
+   exit(EXIT_FAILURE);
+   }
+}
+
+static void *mmap_no_fail(size_t bytes)
+{
+   void *res;
+
+   res = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
+  MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
+   if (res == MAP_FAILED) {
+   perror("mmap");
+   exit(EXIT_FAILURE);
+   }
+
+   return res;
+}
+
+size_t iovec_bytes(const struct iovec *iov, size_t iovnum)
+{
+   size_t bytes;
+   int i;
+
+   for (bytes = 0, i = 0; i < iovnum; i++)
+   bytes += iov[i].iov_len;
+
+   return bytes;
+}
+
+static void iovec_random_init(struct iovec *iov,
+ const struct vsock_test_data *test_data)
+{
+   int i;
+
+   for (i = 0; i < test_data->vecs_cnt; i++) {
+   int j;
+
+   if (test_data->vecs[i].iov_base == MAP_FAILED)
+   continue;
+
+   for (j = 0; j < iov[i].iov_len; j++)
+   ((uint8_t *)iov[i].iov_base)[j] = rand() & 0xff;
+   }
+}
+
+unsigned long iovec_hash_djb2(struct iovec *iov, size_t iovnum)
+{
+   unsigned long hash;
+   size_t iov_bytes;
+   size_t offs;
+   void *tmp;
+   int i;
+
+   iov_bytes = iovec_bytes(iov, iovnum);
+
+   tmp = malloc(iov_bytes);
+   if (!tmp) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   for (offs = 0, i = 0; i < iovnum; i++) {
+   memcpy(tmp + offs, iov[i].iov_base, iov[i].iov_len);
+   offs += iov[i].iov_len;
+   }
+
+   hash = hash_djb2(tmp, iov_bytes);
+   free(tmp);
+
+   return hash;
+}
+
+struct iovec *iovec_from_test_data(const struct vsock_test_data *test_data)
+{
+   const struct iovec *test_iovec;
+   struct iovec *iovec;
+   int i;
+
+   iovec = malloc(sizeof(*iovec) * test_data->vecs_cnt);
+   if (!iovec) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   test_iovec = test_data->vecs;
+
+   for (i = 0; i < test_data->vecs_cnt; i++) {
+   iovec[i].iov_len = test_iovec[i].iov_len;
+   iovec[i].iov_base = mmap_no_fail(test_iovec[i].iov_len);
+
+   if (test_iovec[i].iov_base != MAP_FAILED &&
+   test_iovec[i].iov_base)
+   iovec[i].iov_base += (uintptr_t)test_iovec[i].iov_base;
+   }
+
+   /* Unmap "invalid" elements. */
+   for (i = 0; i < test_data->vecs_cnt; i++) {
+   if (test_iovec[i].iov_base == MAP_FAILED) {
+   if (munmap(iovec[i].iov_base, iovec[i].iov_len)) {
+   perror("munmap");
+   exit(EXIT_FAILURE);
+   }
+  

Re: [PATCH net-next v1 08/12] vsock: enable setting SO_ZEROCOPY

2023-09-26 Thread Stefano Garzarella

On Fri, Sep 22, 2023 at 08:24:24AM +0300, Arseniy Krasnov wrote:

For AF_VSOCK, zerocopy tx mode depends on transport, so this option must
be set in AF_VSOCK implementation where transport is accessible (if
transport is not set during setting SO_ZEROCOPY: for example socket is
not connected, then SO_ZEROCOPY will be enabled, but once transport will
be assigned, support of this type of transmission will be checked).

To handle SO_ZEROCOPY, AF_VSOCK implementation uses SOCK_CUSTOM_SOCKOPT
bit, thus handling SOL_SOCKET option operations, but all of them except
SO_ZEROCOPY will be forwarded to the generic handler by calling
'sock_setsockopt()'.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v5(big patchset) -> v1:
 * Compact 'if' conditions.
 * Rename 'zc_val' to 'zerocopy'.
 * Use 'zerocopy' value directly in 'sock_valbool_flag()', without
   ?: operator.
 * Set 'SOCK_CUSTOM_SOCKOPT' bit for connectible sockets only, as
   suggested by Bobby Eshleman .

net/vmw_vsock/af_vsock.c | 46 ++--
1 file changed, 44 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 482300eb88e0..c05a42e02a17 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1406,8 +1406,16 @@ static int vsock_connect(struct socket *sock, struct 
sockaddr *addr,
goto out;
}

-   if (vsock_msgzerocopy_allow(transport))
+   if (vsock_msgzerocopy_allow(transport)) {
set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+   } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
+   /* If this option was set before 'connect()',
+* when transport was unknown, check that this
+* feature is supported here.
+*/
+   err = -EOPNOTSUPP;
+   goto out;
+   }

err = vsock_auto_bind(vsk);
if (err)
@@ -1643,7 +1651,7 @@ static int vsock_connectible_setsockopt(struct socket 
*sock,
const struct vsock_transport *transport;
u64 val;

-   if (level != AF_VSOCK)
+   if (level != AF_VSOCK && level != SOL_SOCKET)
return -ENOPROTOOPT;

#define COPY_IN(_v)   \
@@ -1666,6 +1674,34 @@ static int vsock_connectible_setsockopt(struct socket 
*sock,

transport = vsk->transport;

+   if (level == SOL_SOCKET) {
+   int zerocopy;
+
+   if (optname != SO_ZEROCOPY) {
+   release_sock(sk);
+   return sock_setsockopt(sock, level, optname, optval, 
optlen);
+   }
+
+   /* Use 'int' type here, because variable to
+* set this option usually has this type.
+*/
+   COPY_IN(zerocopy);
+
+   if (zerocopy < 0 || zerocopy > 1) {
+   err = -EINVAL;
+   goto exit;
+   }
+
+   if (transport && !vsock_msgzerocopy_allow(transport)) {
+   err = -EOPNOTSUPP;
+   goto exit;
+   }
+
+   sock_valbool_flag(sk, SOCK_ZEROCOPY,
+ zerocopy);


it's not necessary to wrap this call.


+   goto exit;
+   }
+
switch (optname) {
case SO_VM_SOCKETS_BUFFER_SIZE:
COPY_IN(val);
@@ -2322,6 +2358,12 @@ static int vsock_create(struct net *net, struct socket 
*sock,
}
}

+   /* SOCK_DGRAM doesn't have 'setsockopt' callback set in its
+* proto_ops, so there is no handler for custom logic.
+*/
+   if (sock_type_connectible(sock->type))
+   set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
+
vsock_insert_unbound(vsk);

return 0;
--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v1 02/12] vsock: read from socket's error queue

2023-09-26 Thread Stefano Garzarella

On Fri, Sep 22, 2023 at 08:24:18AM +0300, Arseniy Krasnov wrote:

This adds handling of MSG_ERRQUEUE input flag in receive call. This flag
is used to read socket's error queue instead of data queue. Possible
scenario of error queue usage is receiving completions for transmission
with MSG_ZEROCOPY flag. This patch also adds new defines: 'SOL_VSOCK'
and 'VSOCK_RECVERR'.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v5(big patchset) -> v1:
 * R-b tag removed, due to added defines to 'include/uapi/linux/vsock.h'.
   Both 'SOL_VSOCK' and 'VSOCK_RECVERR' are needed by userspace, so
   they were placed to 'include/uapi/linux/vsock.h'. At the same time,
   the same define for 'SOL_VSOCK' was placed to 'include/linux/socket.h'.
   This is needed because this file contains SOL_XXX defines for different
   types of socket, so it prevents situation when another new SOL_XXX
   will use constant 287.

include/linux/socket.h | 1 +
include/uapi/linux/vsock.h | 9 +
net/vmw_vsock/af_vsock.c   | 6 ++
3 files changed, 16 insertions(+)
create mode 100644 include/uapi/linux/vsock.h

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 39b74d83c7c4..cfcb7e2c3813 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -383,6 +383,7 @@ struct ucred {
#define SOL_MPTCP   284
#define SOL_MCTP285
#define SOL_SMC 286
+#define SOL_VSOCK  287

/* IPX options */
#define IPX_TYPE1
diff --git a/include/uapi/linux/vsock.h b/include/uapi/linux/vsock.h
new file mode 100644
index ..b25c1347a3b8
--- /dev/null
+++ b/include/uapi/linux/vsock.h


We already have include/uapi/linux/vm_sockets.h

Should we include these changes there instead of creating a new header?


@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_VSOCK_H
+#define _UAPI_LINUX_VSOCK_H
+
+#define SOL_VSOCK  287


Why do we need to re-define this here as well?

In that case, should we protect with some guards to avoid double
defines?


+
+#define VSOCK_RECVERR  1
+
+#endif /* _UAPI_LINUX_VSOCK_H */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index d841f4de33b0..4fd11bf34bc7 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -110,6 +110,8 @@
#include 
#include 
#include 
+#include 
+#include 

static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
static void vsock_sk_destruct(struct sock *sk);
@@ -2137,6 +2139,10 @@ vsock_connectible_recvmsg(struct socket *sock, struct 
msghdr *msg, size_t len,
int err;

sk = sock->sk;
+
+   if (unlikely(flags & MSG_ERRQUEUE))
+   return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 
VSOCK_RECVERR);
+
vsk = vsock_sk(sk);
err = 0;

--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v1 01/12] vsock: fix EPOLLERR set on non-empty error queue

2023-09-26 Thread Stefano Garzarella

On Fri, Sep 22, 2023 at 08:24:17AM +0300, Arseniy Krasnov wrote:

If socket's error queue is not empty, EPOLLERR must be set. Otherwise,
reader of error queue won't detect data in it using EPOLLERR bit.
Currently for AF_VSOCK this is reproducible only with MSG_ZEROCOPY, as
this feature is the only user of an error queue of the socket.


So this is not really a fix. I'd use a different title to avoid
confusion about whether or not to backport this to stable branches.

Maybe just "vsock: set EPOLLERR on non-empty error queue"

The change LGTM.

Stefano



Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/af_vsock.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 013b65241b65..d841f4de33b0 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1030,7 +1030,7 @@ static __poll_t vsock_poll(struct file *file, struct 
socket *sock,
poll_wait(file, sk_sleep(sk), wait);
mask = 0;

-   if (sk->sk_err)
+   if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
/* Signify that there has been an error on this socket. */
mask |= EPOLLERR;

--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH] vringh: don't use vringh_kiov_advance() in vringh_iov_xfer()

2023-09-25 Thread Stefano Garzarella
In the while loop of vringh_iov_xfer(), `partlen` could be 0 if one of
the `iov` has 0 length.
In this case, we should skip the iov and go to the next one.
But calling vringh_kiov_advance() with 0 length does not cause the
advancement, since it returns immediately if asked to advance by 0 bytes.

Let's restore the code that was there before commit b8c06ad4d67d
("vringh: implement vringh_kiov_advance()"), avoiding using
vringh_kiov_advance().

Fixes: b8c06ad4d67d ("vringh: implement vringh_kiov_advance()")
Cc: sta...@vger.kernel.org
Reported-by: Jason Wang 
Signed-off-by: Stefano Garzarella 
---
 drivers/vhost/vringh.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index 955d938eb663..7b8fd977f71c 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -123,8 +123,18 @@ static inline ssize_t vringh_iov_xfer(struct vringh *vrh,
done += partlen;
len -= partlen;
ptr += partlen;
+   iov->consumed += partlen;
+   iov->iov[iov->i].iov_len -= partlen;
+   iov->iov[iov->i].iov_base += partlen;
 
-   vringh_kiov_advance(iov, partlen);
+   if (!iov->iov[iov->i].iov_len) {
+   /* Fix up old iov element then increment. */
+   iov->iov[iov->i].iov_len = iov->consumed;
+   iov->iov[iov->i].iov_base -= iov->consumed;
+
+   iov->consumed = 0;
+   iov->i++;
+   }
}
return done;
 }
-- 
2.41.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v9 0/4] vsock/virtio/vhost: MSG_ZEROCOPY preparations

2023-09-19 Thread Stefano Garzarella

On Tue, Sep 19, 2023 at 03:19:54PM +0200, Paolo Abeni wrote:

On Tue, 2023-09-19 at 09:54 +0200, Stefano Garzarella wrote:

On Mon, Sep 18, 2023 at 07:56:00PM +0300, Arseniy Krasnov wrote:
> Hi Stefano,
>
> thanks for review! So when this patchset will be merged to net-next,
> I'll start sending next part of MSG_ZEROCOPY patchset, e.g. AF_VSOCK +
> Documentation/ patches.

Ack, if it is not a very big series, it may be better to also include
the tests so we can run them before merging the feature.


I understand that at least 2 follow-up series are waiting for this, one
of them targeting net-next and the bigger one targeting the virtio
tree. Am I correct?


IIUC the next series will touch only the vsock core
(net/vmw_vsock/af_vsock.c), tests, and documentation.

The virtio part should be fully covered by this series.

@Arseniy feel free to correct me!



DaveM suggests this should go via the virtio tree, too. Any different
opinion?


For this series it should be fine; I'm not sure about the next series.
If we merge this through the virtio tree, does that force us to do the
same for the follow-up as well?

In theory followup is more on the core, so better with net-next, but
it's also true that for now only virtio transports support it, so it
might be okay to continue with virtio.

@Michael WDYT?

Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v9 0/4] vsock/virtio/vhost: MSG_ZEROCOPY preparations

2023-09-19 Thread Stefano Garzarella

On Mon, Sep 18, 2023 at 07:56:00PM +0300, Arseniy Krasnov wrote:

Hi Stefano,

thanks for review! So when this patchset will be merged to net-next,
I'll start sending next part of MSG_ZEROCOPY patchset, e.g. AF_VSOCK +
Documentation/ patches.


Ack, if it is not a very big series, it may be better to also include
the tests so we can run them before merging the feature.

WDYT?

Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v9 4/4] vsock/virtio: MSG_ZEROCOPY flag support

2023-09-18 Thread Stefano Garzarella

On Sat, Sep 16, 2023 at 04:09:18PM +0300, Arseniy Krasnov wrote:

This adds handling of MSG_ZEROCOPY flag on transmission path:

1) If this flag is set and zerocopy transmission is possible (enabled
  in socket options and transport allows zerocopy), then non-linear
  skb will be created and filled with the pages of user's buffer.
  Pages of user's buffer are locked in memory by 'get_user_pages()'.
2) Replaces way of skb owning: instead of 'skb_set_owner_sk_safe()' it
  calls 'skb_set_owner_w()'. Reason of this change is that
  '__zerocopy_sg_from_iter()' increments 'sk_wmem_alloc' of socket, so
  to decrease this field correctly, proper skb destructor is needed:
  'sock_wfree()'. This destructor is set by 'skb_set_owner_w()'.
3) Adds new callback to 'struct virtio_transport': 'can_msgzerocopy'.
  If this callback is set, then transport needs extra check to be able
  to send provided number of buffers in zerocopy mode. Currently, the
  only transport that needs this callback set is virtio, because this
  transport adds new buffers to the virtio queue and we need to check,
  that number of these buffers is less than size of the queue (it is
  required by virtio spec). vhost and loopback transports don't need
  this check.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v5(big patchset) -> v1:
 * Refactorings of 'if' conditions.
 * Remove extra blank line.
 * Remove 'frag_off' field unneeded init.
 * Add function 'virtio_transport_fill_skb()' which fills both linear
   and non-linear skb with provided data.
v1 -> v2:
 * Use original order of last four arguments in 'virtio_transport_alloc_skb()'.
v2 -> v3:
 * Add new transport callback: 'msgzerocopy_check_iov'. It checks that
   provided 'iov_iter' with data could be sent in a zerocopy mode.
   If this callback is not set in transport - transport allows to send
   any 'iov_iter' in zerocopy mode. Otherwise - if callback returns 'true'
   then zerocopy is allowed. Reason of this callback is that in case of
   G2H transmission we insert whole skb to the tx virtio queue and such
   skb must fit to the size of the virtio queue to be sent in a single
   iteration (may be tx logic in 'virtio_transport.c' could be reworked
   as in vhost to support partial send of current skb). This callback
   will be enabled only for G2H path. For details pls see comment
   'Check that tx queue...' below.
v3 -> v4:
 * 'msgzerocopy_check_iov' moved from 'struct vsock_transport' to
   'struct virtio_transport' as it is virtio specific callback and
   never needed in other transports.
v4 -> v5:
 * 'msgzerocopy_check_iov' renamed to 'can_msgzerocopy' and now it
   uses number of buffers to send as input argument. I think there is
   no need to pass iov to this callback (at least today, it is used only
   by guest side of virtio transport), because the only thing that this
   callback does is comparison of number of buffers to be inserted to
   the tx queue and size of this queue.
 * Remove any checks for type of current 'iov_iter' with payload (is it
   'iovec' or 'ubuf'). These checks left from the earlier versions where I
   didn't use already implemented kernel API which handles every type of
   'iov_iter'.
v5 -> v6:
 * Refactor 'virtio_transport_fill_skb()'.
 * Add 'WARN_ON_ONCE()' and comment on invalid combination of destination
   socket and payload in 'virtio_transport_alloc_skb()'.
v7 -> v8:
 * Move '+1' addition from 'can_msgzerocopy' callback body to the caller.
   This addition means packet header.
 * In 'virtio_transport_can_zcopy()' rename 'max_to_send' argument to
   'pkt_len'.
 * Update commit message by adding details about new 'can_msgzerocopy'
   callback.
 * In 'virtio_transport_init_hdr()' move 'len' argument directly after
   'info'.
 * Add comment about processing last skb in tx loop.
 * Update comment for 'can_msgzerocopy' callback for more details.
v8 -> v9:
 * Return and update comment for 'virtio_transport_alloc_skb()'.
 * Pass pointer to transport ops to 'virtio_transport_can_zcopy()',
   this allows to use it directly without calling virtio_transport_get_ops()'.
 * Remove redundant call for 'msg_data_left()' in 'virtio_transport_fill_skb()'.
 * Do not pass 'struct vsock_sock*' to 'virtio_transport_alloc_skb()',
   use same pointer from already passed 'struct virtio_vsock_pkt_info*'.
 * Fix setting 'end of message' bit for SOCK_SEQPACKET (add call for
   'msg_data_left()' == 0).
 * Add 'zcopy' parameter to packet allocation trace event.


Thanks for addressing the comments!


include/linux/virtio_vsock.h  |   9 +
.../events/vsock_virtio_transport_common.h|  12 +-
net/vmw_vsock/virtio_transport.c  |  32 +++
net/vmw_vsock/virtio_transport_common.c   | 250 ++
4 files changed, 241 insertions(+), 62 deletions(-)


LGTM!

Reviewed-by: Stefano Garzarella 

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

[PATCH net-next 5/5] vsock/test: track bytes in MSG_PEEK test for SOCK_SEQPACKET

2023-09-15 Thread Stefano Garzarella
The test was a bit complicated to read.
Added variables to keep track of the bytes read and to be read
in each step. Also some comments.

The test is unchanged.

Signed-off-by: Stefano Garzarella 
---
 tools/testing/vsock/vsock_test.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index b18acbaf92e2..5743dcae2350 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1002,6 +1002,7 @@ static void test_stream_virtio_skb_merge_client(const 
struct test_opts *opts)
 
 static void test_stream_virtio_skb_merge_server(const struct test_opts *opts)
 {
+   size_t read = 0, to_read;
unsigned char buf[64];
int fd;
 
@@ -1014,14 +1015,21 @@ static void test_stream_virtio_skb_merge_server(const 
struct test_opts *opts)
control_expectln("SEND0");
 
/* Read skbuff partially. */
-   recv_buf(fd, buf, 2, 0, 2);
+   to_read = 2;
+   recv_buf(fd, buf + read, to_read, 0, to_read);
+   read += to_read;
 
control_writeln("REPLY0");
control_expectln("SEND1");
 
-   recv_buf(fd, buf + 2, 8, 0, 8);
+   /* Read the rest of both buffers */
+   to_read = strlen(HELLO_STR WORLD_STR) - read;
+   recv_buf(fd, buf + read, to_read, 0, to_read);
+   read += to_read;
 
-   recv_buf(fd, buf, sizeof(buf) - 8 - 2, MSG_DONTWAIT, -EAGAIN);
+   /* No more bytes should be there */
+   to_read = sizeof(buf) - read;
+   recv_buf(fd, buf + read, to_read, MSG_DONTWAIT, -EAGAIN);
 
if (memcmp(buf, HELLO_STR WORLD_STR, strlen(HELLO_STR WORLD_STR))) {
fprintf(stderr, "pattern mismatch\n");
-- 
2.41.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH net-next 1/5] vsock/test: add recv_buf() utility function

2023-09-15 Thread Stefano Garzarella
Move the code of recv_byte() out in a new utility function that
can be used to receive a generic buffer.

This new function can be used when we need to receive a custom
buffer and not just a single 'A' byte.

Signed-off-by: Stefano Garzarella 
---
 tools/testing/vsock/util.h |  1 +
 tools/testing/vsock/util.c | 88 +++---
 2 files changed, 54 insertions(+), 35 deletions(-)

diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index fb99208a95ea..fe31f267e67e 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -42,6 +42,7 @@ int vsock_stream_accept(unsigned int cid, unsigned int port,
 int vsock_seqpacket_accept(unsigned int cid, unsigned int port,
   struct sockaddr_vm *clientaddrp);
 void vsock_wait_remote_close(int fd);
+void recv_buf(int fd, void *buf, size_t len, int flags, ssize_t expected_ret);
 void send_byte(int fd, int expected_ret, int flags);
 void recv_byte(int fd, int expected_ret, int flags);
 void run_tests(const struct test_case *test_cases,
diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index 01b636d3039a..2826902706e8 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -211,6 +211,58 @@ int vsock_seqpacket_accept(unsigned int cid, unsigned int 
port,
return vsock_accept(cid, port, clientaddrp, SOCK_SEQPACKET);
 }
 
+/* Receive bytes in a buffer and check the return value.
+ *
+ * expected_ret:
+ *  <0 Negative errno (for testing errors)
+ *   0 End-of-file
+ *  >0 Success (bytes successfully read)
+ */
+void recv_buf(int fd, void *buf, size_t len, int flags, ssize_t expected_ret)
+{
+   ssize_t nread = 0;
+   ssize_t ret;
+
+   timeout_begin(TIMEOUT);
+   do {
+   ret = recv(fd, buf + nread, len - nread, flags);
+   timeout_check("recv");
+
+   if (ret == 0 || (ret < 0 && errno != EINTR))
+   break;
+
+   nread += ret;
+   } while (nread < len);
+   timeout_end();
+
+   if (expected_ret < 0) {
+   if (ret != -1) {
+   fprintf(stderr, "bogus recv(2) return value %zd 
(expected %zd)\n",
+   ret, expected_ret);
+   exit(EXIT_FAILURE);
+   }
+   if (errno != -expected_ret) {
+   perror("recv");
+   exit(EXIT_FAILURE);
+   }
+   return;
+   }
+
+   if (ret < 0) {
+   perror("recv");
+   exit(EXIT_FAILURE);
+   }
+
+   if (nread != expected_ret) {
+   if (ret == 0)
+   fprintf(stderr, "unexpected EOF while receiving 
bytes\n");
+
+   fprintf(stderr, "bogus recv(2) bytes read %zd (expected %zd)\n",
+   nread, expected_ret);
+   exit(EXIT_FAILURE);
+   }
+}
+
 /* Transmit one byte and check the return value.
  *
  * expected_ret:
@@ -270,43 +322,9 @@ void send_byte(int fd, int expected_ret, int flags)
 void recv_byte(int fd, int expected_ret, int flags)
 {
uint8_t byte;
-   ssize_t nread;
-
-   timeout_begin(TIMEOUT);
-   do {
-   nread = recv(fd, &byte, sizeof(byte), flags);
-   timeout_check("read");
-   } while (nread < 0 && errno == EINTR);
-   timeout_end();
-
-   if (expected_ret < 0) {
-   if (nread != -1) {
-   fprintf(stderr, "bogus recv(2) return value %zd\n",
-   nread);
-   exit(EXIT_FAILURE);
-   }
-   if (errno != -expected_ret) {
-   perror("read");
-   exit(EXIT_FAILURE);
-   }
-   return;
-   }
 
-   if (nread < 0) {
-   perror("read");
-   exit(EXIT_FAILURE);
-   }
-   if (nread == 0) {
-   if (expected_ret == 0)
-   return;
+   recv_buf(fd, &byte, sizeof(byte), flags, expected_ret);
 
-   fprintf(stderr, "unexpected EOF while receiving byte\n");
-   exit(EXIT_FAILURE);
-   }
-   if (nread != sizeof(byte)) {
-   fprintf(stderr, "bogus recv(2) return value %zd\n", nread);
-   exit(EXIT_FAILURE);
-   }
if (byte != 'A') {
fprintf(stderr, "unexpected byte read %c\n", byte);
exit(EXIT_FAILURE);
-- 
2.41.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH net-next 4/5] vsock/test: use send_buf() in vsock_test.c

2023-09-15 Thread Stefano Garzarella
We have a very common pattern used in vsock_test that we can
now replace with the new send_buf().

This allows us to reuse the code we already had to check the
actual return value and wait for all the bytes to be sent with
an appropriate timeout.

Signed-off-by: Stefano Garzarella 
---
 tools/testing/vsock/vsock_test.c | 75 
 1 file changed, 9 insertions(+), 66 deletions(-)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index d1dcbaeb477a..b18acbaf92e2 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -261,7 +261,6 @@ static void test_msg_peek_client(const struct test_opts 
*opts,
 bool seqpacket)
 {
unsigned char buf[MSG_PEEK_BUF_LEN];
-   ssize_t send_size;
int fd;
int i;
 
@@ -280,17 +279,7 @@ static void test_msg_peek_client(const struct test_opts 
*opts,
 
control_expectln("SRVREADY");
 
-   send_size = send(fd, buf, sizeof(buf), 0);
-
-   if (send_size < 0) {
-   perror("send");
-   exit(EXIT_FAILURE);
-   }
-
-   if (send_size != sizeof(buf)) {
-   fprintf(stderr, "Invalid send size %zi\n", send_size);
-   exit(EXIT_FAILURE);
-   }
+   send_buf(fd, buf, sizeof(buf), 0, sizeof(buf));
 
close(fd);
 }
@@ -385,7 +374,6 @@ static void test_seqpacket_msg_bounds_client(const struct 
test_opts *opts)
msg_count = SOCK_BUF_SIZE / MAX_MSG_SIZE;
 
for (int i = 0; i < msg_count; i++) {
-   ssize_t send_size;
size_t buf_size;
int flags;
void *buf;
@@ -413,17 +401,7 @@ static void test_seqpacket_msg_bounds_client(const struct 
test_opts *opts)
flags = 0;
}
 
-   send_size = send(fd, buf, buf_size, flags);
-
-   if (send_size < 0) {
-   perror("send");
-   exit(EXIT_FAILURE);
-   }
-
-   if (send_size != buf_size) {
-   fprintf(stderr, "Invalid send size\n");
-   exit(EXIT_FAILURE);
-   }
+   send_buf(fd, buf, buf_size, flags, buf_size);
 
/*
 * Hash sum is computed at both client and server in
@@ -524,10 +502,7 @@ static void test_seqpacket_msg_trunc_client(const struct 
test_opts *opts)
exit(EXIT_FAILURE);
}
 
-   if (send(fd, buf, sizeof(buf), 0) != sizeof(buf)) {
-   perror("send failed");
-   exit(EXIT_FAILURE);
-   }
+   send_buf(fd, buf, sizeof(buf), 0, sizeof(buf));
 
control_writeln("SENDDONE");
close(fd);
@@ -649,7 +624,6 @@ static void test_seqpacket_timeout_server(const struct 
test_opts *opts)
 static void test_seqpacket_bigmsg_client(const struct test_opts *opts)
 {
unsigned long sock_buf_size;
-   ssize_t send_size;
socklen_t len;
void *data;
int fd;
@@ -676,18 +650,7 @@ static void test_seqpacket_bigmsg_client(const struct 
test_opts *opts)
exit(EXIT_FAILURE);
}
 
-   send_size = send(fd, data, sock_buf_size, 0);
-   if (send_size != -1) {
-   fprintf(stderr, "expected 'send(2)' failure, got %zi\n",
-   send_size);
-   exit(EXIT_FAILURE);
-   }
-
-   if (errno != EMSGSIZE) {
-   fprintf(stderr, "expected EMSGSIZE in 'errno', got %i\n",
-   errno);
-   exit(EXIT_FAILURE);
-   }
+   send_buf(fd, data, sock_buf_size, 0, -EMSGSIZE);
 
control_writeln("CLISENT");
 
@@ -741,15 +704,9 @@ static void test_seqpacket_invalid_rec_buffer_client(const 
struct test_opts *opt
memset(buf1, BUF_PATTERN_1, buf_size);
memset(buf2, BUF_PATTERN_2, buf_size);
 
-   if (send(fd, buf1, buf_size, 0) != buf_size) {
-   perror("send failed");
-   exit(EXIT_FAILURE);
-   }
+   send_buf(fd, buf1, buf_size, 0, buf_size);
 
-   if (send(fd, buf2, buf_size, 0) != buf_size) {
-   perror("send failed");
-   exit(EXIT_FAILURE);
-   }
+   send_buf(fd, buf2, buf_size, 0, buf_size);
 
close(fd);
 }
@@ -972,7 +929,6 @@ static void test_inv_buf_client(const struct test_opts 
*opts, bool stream)
 static void test_inv_buf_server(const struct test_opts *opts, bool stream)
 {
unsigned char data[INV_BUF_TEST_DATA_LEN] = {0};
-   ssize_t res;
int fd;
 
if (stream)
@@ -985,11 +941,7 @@ static void test_inv_buf_server(const struct test_opts 
*opts, bool stream)
exit(EXIT_FAILURE);
}
 
-   res = send(fd, data, sizeof(data), 0);
-   if (res != sizeof(data)) {
-  

[PATCH net-next 3/5] vsock/test: add send_buf() utility function

2023-09-15 Thread Stefano Garzarella
Move the code of send_byte() out in a new utility function that
can be used to send a generic buffer.

This new function can be used when we need to send a custom
buffer and not just a single 'A' byte.

Signed-off-by: Stefano Garzarella 
---
 tools/testing/vsock/util.h |  2 +
 tools/testing/vsock/util.c | 90 +++---
 2 files changed, 56 insertions(+), 36 deletions(-)

diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index fe31f267e67e..e5407677ce05 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -42,6 +42,8 @@ int vsock_stream_accept(unsigned int cid, unsigned int port,
 int vsock_seqpacket_accept(unsigned int cid, unsigned int port,
   struct sockaddr_vm *clientaddrp);
 void vsock_wait_remote_close(int fd);
+void send_buf(int fd, const void *buf, size_t len, int flags,
+ ssize_t expected_ret);
 void recv_buf(int fd, void *buf, size_t len, int flags, ssize_t expected_ret);
 void send_byte(int fd, int expected_ret, int flags);
 void recv_byte(int fd, int expected_ret, int flags);
diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index 2826902706e8..6779d5008b27 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -211,6 +211,59 @@ int vsock_seqpacket_accept(unsigned int cid, unsigned int 
port,
return vsock_accept(cid, port, clientaddrp, SOCK_SEQPACKET);
 }
 
+/* Transmit bytes from a buffer and check the return value.
+ *
+ * expected_ret:
+ *  <0 Negative errno (for testing errors)
+ *   0 End-of-file
+ *  >0 Success (bytes successfully written)
+ */
+void send_buf(int fd, const void *buf, size_t len, int flags,
+ ssize_t expected_ret)
+{
+   ssize_t nwritten = 0;
+   ssize_t ret;
+
+   timeout_begin(TIMEOUT);
+   do {
+   ret = send(fd, buf + nwritten, len - nwritten, flags);
+   timeout_check("send");
+
+   if (ret == 0 || (ret < 0 && errno != EINTR))
+   break;
+
+   nwritten += ret;
+   } while (nwritten < len);
+   timeout_end();
+
+   if (expected_ret < 0) {
+   if (ret != -1) {
+   fprintf(stderr, "bogus send(2) return value %zd 
(expected %zd)\n",
+   ret, expected_ret);
+   exit(EXIT_FAILURE);
+   }
+   if (errno != -expected_ret) {
+   perror("send");
+   exit(EXIT_FAILURE);
+   }
+   return;
+   }
+
+   if (ret < 0) {
+   perror("send");
+   exit(EXIT_FAILURE);
+   }
+
+   if (nwritten != expected_ret) {
+   if (ret == 0)
+   fprintf(stderr, "unexpected EOF while sending bytes\n");
+
+   fprintf(stderr, "bogus send(2) bytes written %zd (expected 
%zd)\n",
+   nwritten, expected_ret);
+   exit(EXIT_FAILURE);
+   }
+}
+
 /* Receive bytes in a buffer and check the return value.
  *
  * expected_ret:
@@ -273,43 +326,8 @@ void recv_buf(int fd, void *buf, size_t len, int flags, 
ssize_t expected_ret)
 void send_byte(int fd, int expected_ret, int flags)
 {
const uint8_t byte = 'A';
-   ssize_t nwritten;
-
-   timeout_begin(TIMEOUT);
-   do {
-   nwritten = send(fd, &byte, sizeof(byte), flags);
-   timeout_check("write");
-   } while (nwritten < 0 && errno == EINTR);
-   timeout_end();
-
-   if (expected_ret < 0) {
-   if (nwritten != -1) {
-   fprintf(stderr, "bogus send(2) return value %zd\n",
-   nwritten);
-   exit(EXIT_FAILURE);
-   }
-   if (errno != -expected_ret) {
-   perror("write");
-   exit(EXIT_FAILURE);
-   }
-   return;
-   }
 
-   if (nwritten < 0) {
-   perror("write");
-   exit(EXIT_FAILURE);
-   }
-   if (nwritten == 0) {
-   if (expected_ret == 0)
-   return;
-
-   fprintf(stderr, "unexpected EOF while sending byte\n");
-   exit(EXIT_FAILURE);
-   }
-   if (nwritten != sizeof(byte)) {
-   fprintf(stderr, "bogus send(2) return value %zd\n", nwritten);
-   exit(EXIT_FAILURE);
-   }
+   send_buf(fd, &byte, sizeof(byte), flags, expected_ret);
 }
 
 /* Receive one byte and check the return value.
-- 
2.41.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH net-next 2/5] vsock/test: use recv_buf() in vsock_test.c

2023-09-15 Thread Stefano Garzarella
We have a very common pattern used in vsock_test that we can
now replace with the new recv_buf().

This allows us to reuse the code we already had to check the
actual return value and wait for all bytes to be received with
an appropriate timeout.

Signed-off-by: Stefano Garzarella 
---
 tools/testing/vsock/vsock_test.c | 104 +--
 1 file changed, 17 insertions(+), 87 deletions(-)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 90718c2fd4ea..d1dcbaeb477a 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -301,7 +301,6 @@ static void test_msg_peek_server(const struct test_opts 
*opts,
unsigned char buf_half[MSG_PEEK_BUF_LEN / 2];
unsigned char buf_normal[MSG_PEEK_BUF_LEN];
unsigned char buf_peek[MSG_PEEK_BUF_LEN];
-   ssize_t res;
int fd;
 
if (seqpacket)
@@ -315,34 +314,16 @@ static void test_msg_peek_server(const struct test_opts 
*opts,
}
 
/* Peek from empty socket. */
-   res = recv(fd, buf_peek, sizeof(buf_peek), MSG_PEEK | MSG_DONTWAIT);
-   if (res != -1) {
-   fprintf(stderr, "expected recv(2) failure, got %zi\n", res);
-   exit(EXIT_FAILURE);
-   }
-
-   if (errno != EAGAIN) {
-   perror("EAGAIN expected");
-   exit(EXIT_FAILURE);
-   }
+   recv_buf(fd, buf_peek, sizeof(buf_peek), MSG_PEEK | MSG_DONTWAIT,
+-EAGAIN);
 
control_writeln("SRVREADY");
 
/* Peek part of data. */
-   res = recv(fd, buf_half, sizeof(buf_half), MSG_PEEK);
-   if (res != sizeof(buf_half)) {
-   fprintf(stderr, "recv(2) + MSG_PEEK, expected %zu, got %zi\n",
-   sizeof(buf_half), res);
-   exit(EXIT_FAILURE);
-   }
+   recv_buf(fd, buf_half, sizeof(buf_half), MSG_PEEK, sizeof(buf_half));
 
/* Peek whole data. */
-   res = recv(fd, buf_peek, sizeof(buf_peek), MSG_PEEK);
-   if (res != sizeof(buf_peek)) {
-   fprintf(stderr, "recv(2) + MSG_PEEK, expected %zu, got %zi\n",
-   sizeof(buf_peek), res);
-   exit(EXIT_FAILURE);
-   }
+   recv_buf(fd, buf_peek, sizeof(buf_peek), MSG_PEEK, sizeof(buf_peek));
 
/* Compare partial and full peek. */
if (memcmp(buf_half, buf_peek, sizeof(buf_half))) {
@@ -355,22 +336,11 @@ static void test_msg_peek_server(const struct test_opts 
*opts,
 * so check it with MSG_PEEK. We must get length
 * of the message.
 */
-   res = recv(fd, buf_half, sizeof(buf_half), MSG_PEEK |
-  MSG_TRUNC);
-   if (res != sizeof(buf_peek)) {
-   fprintf(stderr,
-   "recv(2) + MSG_PEEK | MSG_TRUNC, exp %zu, got 
%zi\n",
-   sizeof(buf_half), res);
-   exit(EXIT_FAILURE);
-   }
+   recv_buf(fd, buf_half, sizeof(buf_half), MSG_PEEK | MSG_TRUNC,
+sizeof(buf_peek));
}
 
-   res = recv(fd, buf_normal, sizeof(buf_normal), 0);
-   if (res != sizeof(buf_normal)) {
-   fprintf(stderr, "recv(2), expected %zu, got %zi\n",
-   sizeof(buf_normal), res);
-   exit(EXIT_FAILURE);
-   }
+   recv_buf(fd, buf_normal, sizeof(buf_normal), 0, sizeof(buf_normal));
 
/* Compare full peek and normal read. */
if (memcmp(buf_peek, buf_normal, sizeof(buf_peek))) {
@@ -900,7 +870,6 @@ static void test_stream_poll_rcvlowat_client(const struct 
test_opts *opts)
unsigned long lowat_val = RCVLOWAT_BUF_SIZE;
char buf[RCVLOWAT_BUF_SIZE];
struct pollfd fds;
-   ssize_t read_res;
short poll_flags;
int fd;
 
@@ -955,12 +924,7 @@ static void test_stream_poll_rcvlowat_client(const struct 
test_opts *opts)
/* Use MSG_DONTWAIT, if call is going to wait, EAGAIN
 * will be returned.
 */
-   read_res = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
-   if (read_res != RCVLOWAT_BUF_SIZE) {
-   fprintf(stderr, "Unexpected recv result %zi\n",
-   read_res);
-   exit(EXIT_FAILURE);
-   }
+   recv_buf(fd, buf, sizeof(buf), MSG_DONTWAIT, RCVLOWAT_BUF_SIZE);
 
control_writeln("POLLDONE");
 
@@ -972,7 +936,7 @@ static void test_stream_poll_rcvlowat_client(const struct 
test_opts *opts)
 static void test_inv_buf_client(const struct test_opts *opts, bool stream)
 {
unsigned char data[INV_BUF_TEST_DATA_LEN] = {0};
-   ssize_t ret;
+   ssize_t expected_ret;
int fd;
 
if (stream)
@@ -988,39 +952,18 @@ static void test_inv_buf_client(const struct test_opts 
*opts, bool stream)
control_expectln(&quo

[PATCH net-next 0/5] vsock/test: add recv_buf()/send_buf() utility functions and some improvements

2023-09-15 Thread Stefano Garzarella
We recently found that some tests were failing [1].

The problem was that we were not waiting for all the bytes correctly,
so we had a partial read. I had initially suggested using MSG_WAITALL,
but this could have timeout problems.

Since we already had send_byte() and recv_byte() that handled the timeout,
but also the expected return value, I moved that code to two new functions
that we can now use to send/receive generic buffers.

The last commit is just an improvement to a test I found difficult to
understand while using the new functions.

@Arseniy a review and some testing are really appreciated :-)

[1] 
https://lore.kernel.org/netdev/63xflnwiohdfo6m3vnrrxgv2ulplencpwug5qqacugqh7xxpu3@tsczkuqgwurb/

Stefano Garzarella (5):
  vsock/test: add recv_buf() utility function
  vsock/test: use recv_buf() in vsock_test.c
  vsock/test: add send_buf() utility function
  vsock/test: use send_buf() in vsock_test.c
  vsock/test: track bytes in MSG_PEEK test for SOCK_SEQPACKET

 tools/testing/vsock/util.h   |   3 +
 tools/testing/vsock/util.c   | 124 
 tools/testing/vsock/vsock_test.c | 187 ++-
 3 files changed, 117 insertions(+), 197 deletions(-)

-- 
2.41.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v8 0/4] vsock/virtio/vhost: MSG_ZEROCOPY preparations

2023-09-14 Thread Stefano Garzarella

On Thu, Sep 14, 2023 at 05:05:17PM +0300, Arseniy Krasnov wrote:

Hello Stefano,

On 14.09.2023 17:07, Stefano Garzarella wrote:

Hi Arseniy,

On Mon, Sep 11, 2023 at 11:22:30PM +0300, Arseniy Krasnov wrote:

Hello,

this patchset is first of three parts of another big patchset for
MSG_ZEROCOPY flag support:
https://lore.kernel.org/netdev/20230701063947.3422088-1-avkras...@sberdevices.ru/

During review of this series, Stefano Garzarella 
suggested to split it for three parts to simplify review and merging:

1) virtio and vhost updates (for fragged skbs) <--- this patchset
2) AF_VSOCK updates (allows to enable MSG_ZEROCOPY mode and read
  tx completions) and update for Documentation/.
3) Updates for tests and utils.

This series enables handling of fragged skbs in the virtio and vhost parts.
The new logic won't be triggered yet, because the SO_ZEROCOPY option is still
impossible to enable at this moment (the next bunch of patches from the big
set above will enable it).

I've included changelog to some patches anyway, because there were some
comments during review of last big patchset from the link above.


Thanks, I left some comments on patch 4; the others LGTM.
Sorry for not having spotted them before, but moving
virtio_transport_alloc_skb() around the file made the patch a little
confusing and difficult to review.


Sure, no problem, I'll fix them! Thanks for review.



In addition, I started seeing failures of test 14 (server: host,
client: guest), so I took a closer look to see if there was anything wrong,
but it fails for me even without this series applied.

It happens to me intermittently (~30%), does it happen to you?
Can you take a look at it?


Yes! Some time ago I also started to get failures of this test; not ~30%,
significantly rarer, but I guess it depends on the environment. Anyway, I'm
going to look at this in the next few days.


Maybe it's just a timing issue in the test. Indeed, we are expecting 8
bytes but we received only 3; together with the 2 bytes we received before,
that seems to be exactly the bytes we sent with the first
`send(fd, HELLO_STR, strlen(HELLO_STR), 0);`

Since it is a stream socket, it could happen, so we should retry
the recv() or just use MSG_WAITALL.

Applying the following patch fixed the issue for me (15 mins without
errors for now):

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 90718c2fd4ea..7b0fed9fc58d 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1129,7 +1129,7 @@ static void test_stream_virtio_skb_merge_server(const 
struct test_opts *opts)
control_expectln("SEND0");

/* Read skbuff partially. */
-   res = recv(fd, buf, 2, 0);
+   res = recv(fd, buf, 2, MSG_WAITALL);
if (res != 2) {
fprintf(stderr, "expected recv(2) returns 2 bytes, got %zi\n", 
res);
exit(EXIT_FAILURE);
@@ -1138,7 +1138,7 @@ static void test_stream_virtio_skb_merge_server(const 
struct test_opts *opts)
control_writeln("REPLY0");
control_expectln("SEND1");

-   res = recv(fd, buf + 2, sizeof(buf) - 2, 0);
+   res = recv(fd, buf + 2, 8, MSG_WAITALL);
if (res != 8) {
fprintf(stderr, "expected recv(2) returns 8 bytes, got %zi\n", 
res);
exit(EXIT_FAILURE);

I will check better all the cases and send a patch upstream.

Anyway it looks just an issue in our test suite :-)

Stefano



Thanks, Arseniy



host$ ./vsock_test --mode=server --control-port=12345 --peer-cid=4
...
14 - SOCK_STREAM virtio skb merge...expected recv(2) returns 8 bytes, got 3

guest$ ./vsock_test --mode=client --control-host=192.168.133.2 
--control-port=12345 --peer-cid=2

Thanks,
Stefano





___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v8 0/4] vsock/virtio/vhost: MSG_ZEROCOPY preparations

2023-09-14 Thread Stefano Garzarella

Hi Arseniy,

On Mon, Sep 11, 2023 at 11:22:30PM +0300, Arseniy Krasnov wrote:

Hello,

this patchset is first of three parts of another big patchset for
MSG_ZEROCOPY flag support:
https://lore.kernel.org/netdev/20230701063947.3422088-1-avkras...@sberdevices.ru/

During review of this series, Stefano Garzarella 
suggested splitting it into three parts to simplify review and merging:

1) virtio and vhost updates (for fragged skbs) <--- this patchset
2) AF_VSOCK updates (allows to enable MSG_ZEROCOPY mode and read
  tx completions) and update for Documentation/.
3) Updates for tests and utils.

This series enables handling of fragged skbs in virtio and vhost parts.
The new logic won't be triggered, because the SO_ZEROCOPY option is still
impossible to enable at this moment (the next bunch of patches from the big
set above will enable it).

I've included changelog to some patches anyway, because there were some
comments during review of last big patchset from the link above.


Thanks, I left some comments on patch 4, the others LGTM.
Sorry for not having spotted them before, but moving
virtio_transport_alloc_skb() around the file made the patch a little
confusing and difficult to review.

In addition, I started having failures of test 14 (server: host,
client: guest), so I looked better to see if there was anything wrong,
but it fails me even without this series applied.

It happens to me intermittently (~30%), does it happen to you?
Can you take a look at it?

host$ ./vsock_test --mode=server --control-port=12345 --peer-cid=4
...
14 - SOCK_STREAM virtio skb merge...expected recv(2) returns 8 bytes, got 3

guest$ ./vsock_test --mode=client --control-host=192.168.133.2 
--control-port=12345 --peer-cid=2

Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v8 4/4] vsock/virtio: MSG_ZEROCOPY flag support

2023-09-14 Thread Stefano Garzarella

On Mon, Sep 11, 2023 at 11:22:34PM +0300, Arseniy Krasnov wrote:

This adds handling of MSG_ZEROCOPY flag on transmission path:

1) If this flag is set and zerocopy transmission is possible (enabled
  in socket options and transport allows zerocopy), then non-linear
  skb will be created and filled with the pages of user's buffer.
  Pages of user's buffer are locked in memory by 'get_user_pages()'.
2) Replaces way of skb owning: instead of 'skb_set_owner_sk_safe()' it
  calls 'skb_set_owner_w()'. Reason of this change is that
  '__zerocopy_sg_from_iter()' increments 'sk_wmem_alloc' of socket, so
  to decrease this field correctly, proper skb destructor is needed:
  'sock_wfree()'. This destructor is set by 'skb_set_owner_w()'.
3) Adds new callback to 'struct virtio_transport': 'can_msgzerocopy'.
  If this callback is set, then transport needs extra check to be able
  to send provided number of buffers in zerocopy mode. Currently, the
  only transport that needs this callback set is virtio, because this
  transport adds new buffers to the virtio queue and we need to check,
  that number of these buffers is less than size of the queue (it is
  required by virtio spec). vhost and loopback transports don't need
  this check.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v5(big patchset) -> v1:
 * Refactorings of 'if' conditions.
 * Remove extra blank line.
 * Remove 'frag_off' field unneeded init.
 * Add function 'virtio_transport_fill_skb()' which fills both linear
   and non-linear skb with provided data.
v1 -> v2:
 * Use original order of last four arguments in 'virtio_transport_alloc_skb()'.
v2 -> v3:
 * Add new transport callback: 'msgzerocopy_check_iov'. It checks that
   provided 'iov_iter' with data could be sent in a zerocopy mode.
   If this callback is not set in transport - transport allows to send
   any 'iov_iter' in zerocopy mode. Otherwise - if callback returns 'true'
   then zerocopy is allowed. Reason of this callback is that in case of
   G2H transmission we insert whole skb to the tx virtio queue and such
   skb must fit to the size of the virtio queue to be sent in a single
   iteration (may be tx logic in 'virtio_transport.c' could be reworked
   as in vhost to support partial send of current skb). This callback
   will be enabled only for G2H path. For details pls see comment
   'Check that tx queue...' below.
v3 -> v4:
 * 'msgzerocopy_check_iov' moved from 'struct vsock_transport' to
   'struct virtio_transport' as it is virtio specific callback and
   never needed in other transports.
v4 -> v5:
 * 'msgzerocopy_check_iov' renamed to 'can_msgzerocopy' and now it
   uses number of buffers to send as input argument. I think there is
   no need to pass iov to this callback (at least today, it is used only
   by guest side of virtio transport), because the only thing that this
   callback does is comparison of number of buffers to be inserted to
   the tx queue and size of this queue.
 * Remove any checks for type of current 'iov_iter' with payload (is it
   'iovec' or 'ubuf'). These checks left from the earlier versions where I
   didn't use already implemented kernel API which handles every type of
   'iov_iter'.
v5 -> v6:
 * Refactor 'virtio_transport_fill_skb()'.
 * Add 'WARN_ON_ONCE()' and comment on invalid combination of destination
   socket and payload in 'virtio_transport_alloc_skb()'.
v7 -> v8:
 * Move '+1' addition from 'can_msgzerocopy' callback body to the caller.
   This addition means packet header.
 * In 'virtio_transport_can_zcopy()' rename 'max_to_send' argument to
   'pkt_len'.
 * Update commit message by adding details about new 'can_msgzerocopy'
   callback.
 * In 'virtio_transport_init_hdr()' move 'len' argument directly after
   'info'.
 * Add comment about processing last skb in tx loop.
 * Update comment for 'can_msgzerocopy' callback for more details.

include/linux/virtio_vsock.h|   9 +
net/vmw_vsock/virtio_transport.c|  32 +++
net/vmw_vsock/virtio_transport_common.c | 256 ++--
3 files changed, 239 insertions(+), 58 deletions(-)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index a91fbdf233e4..ebb3ce63d64d 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -160,6 +160,15 @@ struct virtio_transport {

/* Takes ownership of the packet */
int (*send_pkt)(struct sk_buff *skb);
+
+   /* Used in MSG_ZEROCOPY mode. Checks, that provided data
+* (number of buffers) could be transmitted with zerocopy
+* mode. If this callback is not implemented for the current
+* transport - this means that this transport doesn't need
+* extra checks and can perform zerocopy transmission by
+* default.
+*/
+   bool (*can_msgzerocopy)(int bufs_num);
};

ssize_t
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 73d730156349..09ba3128e759 100644
--- a/net/vmw_vsock/virtio_transport.c

Re: [PATCH net-next v8 2/4] vsock/virtio: support to send non-linear skb

2023-09-14 Thread Stefano Garzarella

On Mon, Sep 11, 2023 at 11:22:32PM +0300, Arseniy Krasnov wrote:

For non-linear skb use its pages from fragment array as buffers in
virtio tx queue. These pages are already pinned by 'get_user_pages()'
during such skb creation.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v2 -> v3:
 * Comment about 'page_to_virt()' is updated. I don't remove R-b,
   as this change is quite small I guess.
v6 -> v7:
 * Move arrays '*sgs' and 'bufs' to 'virtio_vsock' instead of being
   local variables. This allows to save stack space in cases of too
   big MAX_SKB_FRAGS.
 * Add 'WARN_ON_ONCE()' for handling nonlinear skbs - it checks that
   linear part of such skb contains only header.
 * R-b tag removed due to updates above.
v7 -> v8:
 * Add comment in 'struct virtio_vsock' for both 'struct scatterlist'
   fields.
 * Rename '*sgs' and 'bufs' to '*out_sgs' and 'out_bufs'.
 * Initialize '*out_sgs' in 'virtio_vsock_probe()' by always pointing
   to the corresponding element of 'out_bufs'.


LGTM, thanks for addressing those comments!



net/vmw_vsock/virtio_transport.c | 60 
1 file changed, 53 insertions(+), 7 deletions(-)


Reviewed-by: Stefano Garzarella 

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH] virtio-vsock: add VIRTIO_VSOCK_F_DGRAM feature bit

2023-09-06 Thread Stefano Garzarella

On Sat, Sep 02, 2023 at 04:35:25AM -0400, Michael S. Tsirkin wrote:

On Sat, Sep 02, 2023 at 04:56:42AM +, Bobby Eshleman wrote:

On Fri, Sep 01, 2023 at 02:45:14PM +0200, Stefano Garzarella wrote:
> On Tue, Aug 29, 2023 at 09:29:45PM +, Bobby Eshleman wrote:
> > This adds support for datagrams to the virtio-vsock device.
> >
> > virtio-vsock already supports stream and seqpacket types. The existing
> > message types and header fields are extended to support datagrams.
> > Semantic differences between the flow types are stated, as well as any
> > additional requirements for devices and drivers implementing this
> > feature.
> >
> > Signed-off-by: Bobby Eshleman 
> > ---
> > device-types/vsock/description.tex | 95 +++---
> > 1 file changed, 88 insertions(+), 7 deletions(-)
> >
> > diff --git a/device-types/vsock/description.tex 
b/device-types/vsock/description.tex
> > index 7d91d159872f..638dca8e5da1 100644
> > --- a/device-types/vsock/description.tex
> > +++ b/device-types/vsock/description.tex
> > @@ -20,6 +20,7 @@ \subsection{Feature bits}\label{sec:Device Types / Socket 
Device / Feature bits}
> > \item[VIRTIO_VSOCK_F_STREAM (0)] stream socket type is supported.
> > \item[VIRTIO_VSOCK_F_SEQPACKET (1)] seqpacket socket type is supported.
> > \item[VIRTIO_VSOCK_F_NO_IMPLIED_STREAM (2)] stream socket type is not 
implied.
> > +\item[VIRTIO_VSOCK_F_DGRAM (3)] datagram socket type is supported.
> > \end{description}
> >
> > \drivernormative{\subsubsection}{Feature bits}{Device Types / Socket Device 
/ Feature bits}
> > @@ -167,17 +168,22 @@ \subsubsection{Addressing}\label{sec:Device Types / 
Socket Device / Device Opera
> > consists of a (cid, port number) tuple. The header fields used for this are
> > \field{src_cid}, \field{src_port}, \field{dst_cid}, and \field{dst_port}.
> >
> > -Currently stream and seqpacket sockets are supported. \field{type} is 1 
(VIRTIO_VSOCK_TYPE_STREAM)
> > -for stream socket types, and 2 (VIRTIO_VSOCK_TYPE_SEQPACKET) for seqpacket 
socket types.
> > +
> > +Currently stream, seqpacket, and datagram sockets are supported. 
\field{type} is
> > +1 (VIRTIO_VSOCK_TYPE_STREAM) for stream socket types, 2 
(VIRTIO_VSOCK_TYPE_SEQPACKET) for
> > +seqpacket socket types, and 3 (VIRTIO_VSOCK_TYPE_DGRAM) for datagram 
socket types.
> >
> > \begin{lstlisting}
> > #define VIRTIO_VSOCK_TYPE_STREAM1
> > #define VIRTIO_VSOCK_TYPE_SEQPACKET 2
> > +#define VIRTIO_VSOCK_TYPE_DGRAM 3
> > \end{lstlisting}
> >
> > Stream sockets provide in-order, guaranteed, connection-oriented delivery
> > without message boundaries. Seqpacket sockets provide in-order, guaranteed,
> > -connection-oriented delivery with message and record boundaries.
> > +connection-oriented delivery with message and record boundaries. Datagram
> > +sockets provide connection-less, best-effort delivery of messages, with no
> > +order or reliability guarantees.
> >
> > \subsubsection{Buffer Space Management}\label{sec:Device Types / Socket 
Device / Device Operation / Buffer Space Management}
> > \field{buf_alloc} and \field{fwd_cnt} are used for buffer space management 
of
> > @@ -203,16 +209,19 @@ \subsubsection{Buffer Space 
Management}\label{sec:Device Types / Socket Device /
> > previously receiving a VIRTIO_VSOCK_OP_CREDIT_REQUEST packet. This allows
> > communicating updates any time a change in buffer space occurs.
> >
> > +\field{buf_alloc} and \field{fwd_cnt} are reserved for future use by 
datagram
> > +sockets. These fields are not used for datagram buffer space management.
> > +
> > \drivernormative{\paragraph}{Device Operation: Buffer Space 
Management}{Device Types / Socket Device / Device Operation / Buffer Space Management}
> > -VIRTIO_VSOCK_OP_RW data packets MUST only be transmitted when the peer has
> > -sufficient free buffer space for the payload.
> > +For stream and seqpacket flows, VIRTIO_VSOCK_OP_RW data packets MUST only 
be
> > +transmitted when the peer has sufficient free buffer space for the payload.
> >
> > All packets associated with a stream flow MUST contain valid information in
> > \field{buf_alloc} and \field{fwd_cnt} fields.
> >
> > \devicenormative{\paragraph}{Device Operation: Buffer Space
> > Management}{Device Types / Socket Device / Device Operation / Buffer
> > Space Management}
> > -VIRTIO_VSOCK_OP_RW data packets MUST only be transmitted when the peer has
> > -sufficient free buffer space for the payload.
> > +For stream and seqpacket flows, VIRTIO_VSOCK_OP_RW data packets MUST only 
be
> > +transmitted w

Re: [PATCH net-next v7 4/4] vsock/virtio: MSG_ZEROCOPY flag support

2023-09-04 Thread Stefano Garzarella

On Sun, Sep 03, 2023 at 11:13:23AM +0300, Arseniy Krasnov wrote:



On 01.09.2023 15:30, Stefano Garzarella wrote:

On Sun, Aug 27, 2023 at 11:54:36AM +0300, Arseniy Krasnov wrote:

This adds handling of MSG_ZEROCOPY flag on transmission path: if this
flag is set and zerocopy transmission is possible (enabled in socket
options and transport allows zerocopy), then non-linear skb will be
created and filled with the pages of user's buffer. Pages of user's
buffer are locked in memory by 'get_user_pages()'. Second thing that
this patch does is replace type of skb owning: instead of calling
'skb_set_owner_sk_safe()' it calls 'skb_set_owner_w()'. Reason of this
change is that '__zerocopy_sg_from_iter()' increments 'sk_wmem_alloc'
of socket, so to decrease this field correctly proper skb destructor is
needed: 'sock_wfree()'. This destructor is set by 'skb_set_owner_w()'.

Signed-off-by: Arseniy Krasnov 


[...]



-/* Returns a new packet on success, otherwise returns NULL.
- *
- * If NULL is returned, errp is set to a negative errno.
- */
-static struct sk_buff *
-virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
-   size_t len,
-   u32 src_cid,
-   u32 src_port,
-   u32 dst_cid,
-   u32 dst_port)
-{
-    const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len;
-    struct virtio_vsock_hdr *hdr;
-    struct sk_buff *skb;
+static bool virtio_transport_can_zcopy(struct virtio_vsock_pkt_info *info,
+   size_t max_to_send)

  ^
I'd call it `pkt_len`, `max_to_send` is confusing IMHO. I didn't
initially understand whether it was the number of buffers or bytes.


+{
+    const struct virtio_transport *t_ops;
+    struct iov_iter *iov_iter;
+
+    if (!info->msg)
+    return false;
+
+    iov_iter = &info->msg->msg_iter;
+
+    if (iov_iter->iov_offset)
+    return false;
+
+    /* We can't send whole iov. */
+    if (iov_iter->count > max_to_send)
+    return false;
+
+    /* Check that transport can send data in zerocopy mode. */
+    t_ops = virtio_transport_get_ops(info->vsk);
+
+    if (t_ops->can_msgzerocopy) {


So if `can_msgzerocopy` is not implemented, we always return true after
this point. Should we mention it in the .can_msgzerocopy documentation?


Oops, this is my mistake, I must return 'false' in this case. It seems I didn't
catch this problem with my tests, because there was no test case where
zerocopy falls back to copy!

I'll fix it and add new test!


yep, I agree!





Can we also mention in the commit description why this is needed only for
virtio_transport and not for vhost and loopback?


+    int pages_in_iov = iov_iter_npages(iov_iter, MAX_SKB_FRAGS);
+    int pages_to_send = min(pages_in_iov, MAX_SKB_FRAGS);
+
+    return t_ops->can_msgzerocopy(pages_to_send);
+    }
+
+    return true;
+}
+


[...]


@@ -270,6 +395,17 @@ static int virtio_transport_send_pkt_info(struct 
vsock_sock *vsk,
    break;
    }

+    /* This is last skb to send this portion of data. */


Sorry I didn't get it :-(

Can you elaborate this a bit more?


I mean that we iterate over user's buffer here, allocating skb on each
iteration. And for last skb for this buffer we initialize completion
for user (we need to allocate one completion for one syscall).


Okay, so maybe we should explain better also in the code comment.


Thanks for review, I'll fix all other comments and resend patchset when
'net-next' will be opened again.


Cool, thanks!
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH] virtio-vsock: add VIRTIO_VSOCK_F_DGRAM feature bit

2023-09-01 Thread Stefano Garzarella

On Tue, Aug 29, 2023 at 09:29:45PM +, Bobby Eshleman wrote:

This adds support for datagrams to the virtio-vsock device.

virtio-vsock already supports stream and seqpacket types. The existing
message types and header fields are extended to support datagrams.
Semantic differences between the flow types are stated, as well as any
additional requirements for devices and drivers implementing this
feature.

Signed-off-by: Bobby Eshleman 
---
device-types/vsock/description.tex | 95 +++---
1 file changed, 88 insertions(+), 7 deletions(-)

diff --git a/device-types/vsock/description.tex 
b/device-types/vsock/description.tex
index 7d91d159872f..638dca8e5da1 100644
--- a/device-types/vsock/description.tex
+++ b/device-types/vsock/description.tex
@@ -20,6 +20,7 @@ \subsection{Feature bits}\label{sec:Device Types / Socket 
Device / Feature bits}
\item[VIRTIO_VSOCK_F_STREAM (0)] stream socket type is supported.
\item[VIRTIO_VSOCK_F_SEQPACKET (1)] seqpacket socket type is supported.
\item[VIRTIO_VSOCK_F_NO_IMPLIED_STREAM (2)] stream socket type is not implied.
+\item[VIRTIO_VSOCK_F_DGRAM (3)] datagram socket type is supported.
\end{description}

\drivernormative{\subsubsection}{Feature bits}{Device Types / Socket Device / 
Feature bits}
@@ -167,17 +168,22 @@ \subsubsection{Addressing}\label{sec:Device Types / 
Socket Device / Device Opera
consists of a (cid, port number) tuple. The header fields used for this are
\field{src_cid}, \field{src_port}, \field{dst_cid}, and \field{dst_port}.

-Currently stream and seqpacket sockets are supported. \field{type} is 1 
(VIRTIO_VSOCK_TYPE_STREAM)
-for stream socket types, and 2 (VIRTIO_VSOCK_TYPE_SEQPACKET) for seqpacket 
socket types.
+
+Currently stream, seqpacket, and datagram sockets are supported. \field{type} 
is
+1 (VIRTIO_VSOCK_TYPE_STREAM) for stream socket types, 2 
(VIRTIO_VSOCK_TYPE_SEQPACKET) for
+seqpacket socket types, and 3 (VIRTIO_VSOCK_TYPE_DGRAM) for datagram socket 
types.

\begin{lstlisting}
#define VIRTIO_VSOCK_TYPE_STREAM1
#define VIRTIO_VSOCK_TYPE_SEQPACKET 2
+#define VIRTIO_VSOCK_TYPE_DGRAM 3
\end{lstlisting}

Stream sockets provide in-order, guaranteed, connection-oriented delivery
without message boundaries. Seqpacket sockets provide in-order, guaranteed,
-connection-oriented delivery with message and record boundaries.
+connection-oriented delivery with message and record boundaries. Datagram
+sockets provide connection-less, best-effort delivery of messages, with no
+order or reliability guarantees.

\subsubsection{Buffer Space Management}\label{sec:Device Types / Socket Device 
/ Device Operation / Buffer Space Management}
\field{buf_alloc} and \field{fwd_cnt} are used for buffer space management of
@@ -203,16 +209,19 @@ \subsubsection{Buffer Space Management}\label{sec:Device 
Types / Socket Device /
previously receiving a VIRTIO_VSOCK_OP_CREDIT_REQUEST packet. This allows
communicating updates any time a change in buffer space occurs.

+\field{buf_alloc} and \field{fwd_cnt} are reserved for future use by datagram
+sockets. These fields are not used for datagram buffer space management.
+
\drivernormative{\paragraph}{Device Operation: Buffer Space Management}{Device 
Types / Socket Device / Device Operation / Buffer Space Management}
-VIRTIO_VSOCK_OP_RW data packets MUST only be transmitted when the peer has
-sufficient free buffer space for the payload.
+For stream and seqpacket flows, VIRTIO_VSOCK_OP_RW data packets MUST only be
+transmitted when the peer has sufficient free buffer space for the payload.

All packets associated with a stream flow MUST contain valid information in
\field{buf_alloc} and \field{fwd_cnt} fields.

\devicenormative{\paragraph}{Device Operation: Buffer Space 
Management}{Device Types / Socket Device / Device Operation / Buffer 
Space Management}

-VIRTIO_VSOCK_OP_RW data packets MUST only be transmitted when the peer has
-sufficient free buffer space for the payload.
+For stream and seqpacket flows, VIRTIO_VSOCK_OP_RW data packets MUST only be
+transmitted when the peer has sufficient free buffer space for the payload.

All packets associated with a stream flow MUST contain valid information in
\field{buf_alloc} and \field{fwd_cnt} fields.
@@ -299,6 +308,78 @@ \subsubsection{Seqpacket Sockets}\label{sec:Device Types / 
Socket Device / Devic
#define VIRTIO_VSOCK_SEQ_EOR (1 << 1)
\end{lstlisting}

+\subsubsection{Datagram Sockets}\label{sec:Device Types / Socket Device / 
Device Operation / Datagram Sockets}
+
+\drivernormative{\paragraph}{Device Operation: Packet Fragmentation}{Device 
Types / Socket Device / Datagram Sockets / Fragmentation}
+
+Drivers MAY disassemble packets into smaller fragments. If drivers fragment a
+packet, they MUST follow the fragmentation rules described in section
+\ref{sec:Device Types / Socket Device / Device Operation / Datagram Sockets / 
Fragmentation}.
+
+Drivers MUST support assembly of received packet fragments according to the
+fragmentation rules 

Re: [PATCH net-next v7 2/4] vsock/virtio: support to send non-linear skb

2023-09-01 Thread Stefano Garzarella

On Sun, Aug 27, 2023 at 11:54:34AM +0300, Arseniy Krasnov wrote:

For non-linear skb use its pages from fragment array as buffers in
virtio tx queue. These pages are already pinned by 'get_user_pages()'
during such skb creation.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v2 -> v3:
 * Comment about 'page_to_virt()' is updated. I don't remove R-b,
   as this change is quite small I guess.
v6 -> v7:
 * Move arrays '*sgs' and 'bufs' to 'virtio_vsock' instead of being
   local variables. This allows to save stack space in cases of too
   big MAX_SKB_FRAGS.
 * Add 'WARN_ON_ONCE()' for handling nonlinear skbs - it checks that
   linear part of such skb contains only header.
 * R-b tag removed due to updates above.

net/vmw_vsock/virtio_transport.c | 54 +++-
1 file changed, 47 insertions(+), 7 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index e95df847176b..8636477cf088 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -63,6 +63,10 @@ struct virtio_vsock {

u32 guest_cid;
bool seqpacket_allow;
+


I'd add a comment here specifying what we need these fields
for and why we put them here (basically Paolo's suggestion).


+   /* +1 is for packet header. */
+   struct scatterlist *sgs[MAX_SKB_FRAGS + 1];
+   struct scatterlist bufs[MAX_SKB_FRAGS + 1];
};

static u32 virtio_transport_get_local_cid(void)
@@ -100,8 +104,9 @@ virtio_transport_send_pkt_work(struct work_struct *work)
vq = vsock->vqs[VSOCK_VQ_TX];

for (;;) {
-   struct scatterlist hdr, buf, *sgs[2];
int ret, in_sg = 0, out_sg = 0;
+   struct scatterlist **sgs;
+   struct scatterlist *bufs;
struct sk_buff *skb;
bool reply;

@@ -111,12 +116,47 @@ virtio_transport_send_pkt_work(struct work_struct *work)

virtio_transport_deliver_tap_pkt(skb);
reply = virtio_vsock_skb_reply(skb);
-
-   sg_init_one(&hdr, virtio_vsock_hdr(skb), 
sizeof(*virtio_vsock_hdr(skb)));
-   sgs[out_sg++] = &hdr;
-   if (skb->len > 0) {
-   sg_init_one(&buf, skb->data, skb->len);
-   sgs[out_sg++] = &buf;
+   sgs = vsock->sgs;
+   bufs = vsock->bufs;
+   sg_init_one(&bufs[out_sg], virtio_vsock_hdr(skb),
+   sizeof(*virtio_vsock_hdr(skb)));
+   sgs[out_sg] = &bufs[out_sg];


IIUC `sgs[i]` always contains `&bufs[i]`.

Could we initialize it once when we allocate `struct virtio_vsock` in
`virtio_vsock_probe`?

Of course putting a comment in `struct virtio_vsock` about it.

Since we are using them only for out buffers, I'd also rename them in
out_sgs and out_bufs.

The rest LGTM.

Stefano


+   out_sg++;
+
+   if (!skb_is_nonlinear(skb)) {
+   if (skb->len > 0) {
+   sg_init_one(&bufs[out_sg], skb->data, skb->len);
+   sgs[out_sg] = &bufs[out_sg];
+   out_sg++;
+   }
+   } else {
+   struct skb_shared_info *si;
+   int i;
+
+   /* If skb is nonlinear, then its buffer must contain
+* only header and nothing more. Data is stored in
+* the fragged part.
+*/
+   WARN_ON_ONCE(skb_headroom(skb) != 
sizeof(*virtio_vsock_hdr(skb)));
+
+   si = skb_shinfo(skb);
+
+   for (i = 0; i < si->nr_frags; i++) {
+   skb_frag_t *skb_frag = &si->frags[i];
+   void *va;
+
+   /* We will use 'page_to_virt()' for the 
userspace page
+* here, because virtio or dma-mapping layers 
will call
+* 'virt_to_phys()' later to fill the buffer 
descriptor.
+* We don't touch memory at "virtual" address 
of this page.
+*/
+   va = page_to_virt(skb_frag->bv_page);
+   sg_init_one(&bufs[out_sg],
+   va + skb_frag->bv_offset,
+   skb_frag->bv_len);
+   sgs[out_sg] = &bufs[out_sg];
+   out_sg++;
+   }
}

ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, 
GFP_KERNEL);
--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v7 1/4] vsock/virtio/vhost: read data from non-linear skb

2023-09-01 Thread Stefano Garzarella

On Sun, Aug 27, 2023 at 11:54:33AM +0300, Arseniy Krasnov wrote:

This is preparation patch for MSG_ZEROCOPY support. It adds handling of
non-linear skbs by replacing direct calls of 'memcpy_to_msg()' with
'skb_copy_datagram_iter()'. Main advantage of the second one is that it
can handle paged part of the skb by using 'kmap()' on each page, but if
there are no pages in the skb, it behaves like simple copying to iov
iterator. This patch also adds a new field to the control block of skb -
this value shows the current offset in the skb to read the next portion of data
(it doesn't matter whether the skb is linear or not). Idea behind this field is that
'skb_copy_datagram_iter()' handles both types of skb internally - it
just needs an offset from which to copy data from the given skb. This
offset is incremented on each read from skb. This approach allows to
simplify handling of both linear and non-linear skbs, because for
linear skb we need to call 'skb_pull()' after reading data from it,
while in non-linear case we need to update 'data_len'.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v5(big patchset) -> v1:
 * Merge 'virtio_transport_common.c' and 'vhost/vsock.c' patches into
   this single patch.
 * Commit message update: grammar fix and remark that this patch is
   MSG_ZEROCOPY preparation.
 * Use 'min_t()' instead of comparison using '<>' operators.
v1 -> v2:
 * R-b tag added.
v3 -> v4:
 * R-b tag removed due to rebase:
   * Part for 'virtio_transport_stream_do_peek()' is changed.
   * Part for 'virtio_transport_seqpacket_do_peek()' is added.
 * Comments about sleep in 'memcpy_to_msg()' now describe sleep in
   'skb_copy_datagram_iter()'.
v5 -> v6:
 * Commit message update.
 * Rename 'frag_off' to 'offset' in 'virtio_vsock_skb_cb'.

drivers/vhost/vsock.c   | 14 +++
include/linux/virtio_vsock.h|  1 +
net/vmw_vsock/virtio_transport_common.c | 32 +++--
3 files changed, 29 insertions(+), 18 deletions(-)


Reviewed-by: Stefano Garzarella 

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v7 4/4] vsock/virtio: MSG_ZEROCOPY flag support

2023-09-01 Thread Stefano Garzarella

On Sun, Aug 27, 2023 at 11:54:36AM +0300, Arseniy Krasnov wrote:

This adds handling of MSG_ZEROCOPY flag on transmission path: if this
flag is set and zerocopy transmission is possible (enabled in socket
options and transport allows zerocopy), then non-linear skb will be
created and filled with the pages of user's buffer. Pages of user's
buffer are locked in memory by 'get_user_pages()'. Second thing that
this patch does is replace type of skb owning: instead of calling
'skb_set_owner_sk_safe()' it calls 'skb_set_owner_w()'. Reason of this
change is that '__zerocopy_sg_from_iter()' increments 'sk_wmem_alloc'
of socket, so to decrease this field correctly proper skb destructor is
needed: 'sock_wfree()'. This destructor is set by 'skb_set_owner_w()'.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v5(big patchset) -> v1:
 * Refactorings of 'if' conditions.
 * Remove extra blank line.
 * Remove 'frag_off' field unneeded init.
 * Add function 'virtio_transport_fill_skb()' which fills both linear
   and non-linear skb with provided data.
v1 -> v2:
 * Use original order of last four arguments in 'virtio_transport_alloc_skb()'.
v2 -> v3:
 * Add new transport callback: 'msgzerocopy_check_iov'. It checks that
   provided 'iov_iter' with data could be sent in a zerocopy mode.
   If this callback is not set in transport - transport allows to send
   any 'iov_iter' in zerocopy mode. Otherwise - if callback returns 'true'
   then zerocopy is allowed. Reason of this callback is that in case of
   G2H transmission we insert whole skb to the tx virtio queue and such
   skb must fit to the size of the virtio queue to be sent in a single
   iteration (may be tx logic in 'virtio_transport.c' could be reworked
   as in vhost to support partial send of current skb). This callback
   will be enabled only for G2H path. For details pls see comment
   'Check that tx queue...' below.
v3 -> v4:
 * 'msgzerocopy_check_iov' moved from 'struct vsock_transport' to
   'struct virtio_transport' as it is virtio specific callback and
   never needed in other transports.
v4 -> v5:
 * 'msgzerocopy_check_iov' renamed to 'can_msgzerocopy' and now it
   uses number of buffers to send as input argument. I think there is
   no need to pass iov to this callback (at least today, it is used only
   by guest side of virtio transport), because the only thing that this
   callback does is comparison of number of buffers to be inserted to
   the tx queue and size of this queue.
 * Remove any checks for type of current 'iov_iter' with payload (is it
   'iovec' or 'ubuf'). These checks left from the earlier versions where I
   didn't use already implemented kernel API which handles every type of
   'iov_iter'.
v5 -> v6:
 * Refactor 'virtio_transport_fill_skb()'.
 * Add 'WARN_ON_ONCE()' and comment on invalid combination of destination
   socket and payload in 'virtio_transport_alloc_skb()'.

include/linux/virtio_vsock.h|   5 +
net/vmw_vsock/virtio_transport.c|  33 
net/vmw_vsock/virtio_transport_common.c | 250 ++--
3 files changed, 231 insertions(+), 57 deletions(-)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index a91fbdf233e4..56501cd9843f 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -160,6 +160,11 @@ struct virtio_transport {

/* Takes ownership of the packet */
int (*send_pkt)(struct sk_buff *skb);
+
+   /* Used in MSG_ZEROCOPY mode. Checks that provided data
+* could be transmitted with zerocopy mode.
+*/
+   bool (*can_msgzerocopy)(int bufs_num);
};

ssize_t
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 8636477cf088..4ce44916e585 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -453,6 +453,38 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
queue_work(virtio_vsock_workqueue, &vsock->rx_work);
}

+static bool virtio_transport_can_msgzerocopy(int bufs_num)
+{
+   struct virtio_vsock *vsock;
+   bool res = false;
+
+   rcu_read_lock();
+
+   vsock = rcu_dereference(the_virtio_vsock);
+   if (vsock) {
+   struct virtqueue *vq = vsock->vqs[VSOCK_VQ_TX];
+
+   /* Check that tx queue is large enough to keep whole
+* data to send. This is needed, because when there is
+* not enough free space in the queue, current skb to
+* send will be reinserted to the head of tx list of
+* the socket to retry transmission later, so if skb
+* is bigger than whole queue, it will be reinserted
+* again and again, thus blocking other skbs to be sent.
+* Each page of the user provided buffer will be added
+* as a single buffer to the tx virtqueue, so compare
+* number of pages against maximum capacity of the queue.
+* +1 means buffer for 

Re: [RFC PATCH v2 0/2] vsock: handle writes to shutdowned socket

2023-08-31 Thread Stefano Garzarella

Hi Arseniy,

On Sat, Aug 26, 2023 at 08:58:58PM +0300, Arseniy Krasnov wrote:

Hello,

this small patchset adds POSIX compliant behaviour on writes to the
socket which was shutdowned with 'shutdown()' (both sides - local with
SHUT_WR flag, peer - with SHUT_RD flag). According to POSIX we must send
SIGPIPE in such cases (but SIGPIPE is not sent when MSG_NOSIGNAL is set).

First patch is implemented in the same way as 
net/ipv4/tcp.c:tcp_sendmsg_locked().
It uses 'sk_stream_error()' function which handles EPIPE error. Another
way is to use code from net/unix/af_unix.c:unix_stream_sendmsg() where
same logic from 'sk_stream_error()' is implemented "from scratch", but
it doesn't check 'sk_err' field. I think error from this field has more
priority to be returned from syscall. So I guess it is better to reuse
currently implemented 'sk_stream_error()' function.

Test is also added.

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=b38460bc463c54e0c15ff3b37e81f7e2059bb9bb

Link to v1:
https://lore.kernel.org/netdev/20230801141727.481156-1-avkras...@sberdevices.ru/

Changelog:
v1 -> v2:
* 0001 is still the same - SIGPIPE is sent only for SOCK_STREAM as discussed in v1
  with Stefano Garzarella .
* 0002 - use 'sig_atomic_t' instead of 'bool' for flag variables updated from
  signal handler.

Arseniy Krasnov (2):
 vsock: send SIGPIPE on write to shutdowned socket
 test/vsock: shutdowned socket test


Thanks for this series, I fully reviewed it, LGTM!

Please send it targeting net-next when it reopens.

Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH v2 2/2] test/vsock: shutdowned socket test

2023-08-31 Thread Stefano Garzarella

On Sat, Aug 26, 2023 at 08:59:00PM +0300, Arseniy Krasnov wrote:

This adds two tests for 'shutdown()' call. It checks that SIGPIPE is
sent when MSG_NOSIGNAL is not set and vice versa. Both flags SHUT_WR
and SHUT_RD are tested.

Signed-off-by: Arseniy Krasnov 
---
tools/testing/vsock/vsock_test.c | 138 +++
1 file changed, 138 insertions(+)


Reviewed-by: Stefano Garzarella 



diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 90718c2fd4ea..148fc9c47c50 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -19,6 +19,7 @@
#include 
#include 
#include 
+#include <signal.h>

#include "timeout.h"
#include "control.h"
@@ -1170,6 +1171,133 @@ static void test_seqpacket_msg_peek_server(const struct 
test_opts *opts)
return test_msg_peek_server(opts, true);
}

+static sig_atomic_t have_sigpipe;
+
+static void sigpipe(int signo)
+{
+   have_sigpipe = 1;
+}
+
+static void test_stream_check_sigpipe(int fd)
+{
+   ssize_t res;
+
+   have_sigpipe = 0;
+
+   res = send(fd, "A", 1, 0);
+   if (res != -1) {
+   fprintf(stderr, "expected send(2) failure, got %zi\n", res);
+   exit(EXIT_FAILURE);
+   }
+
+   if (!have_sigpipe) {
+   fprintf(stderr, "SIGPIPE expected\n");
+   exit(EXIT_FAILURE);
+   }
+
+   have_sigpipe = 0;
+
+   res = send(fd, "A", 1, MSG_NOSIGNAL);
+   if (res != -1) {
+   fprintf(stderr, "expected send(2) failure, got %zi\n", res);
+   exit(EXIT_FAILURE);
+   }
+
+   if (have_sigpipe) {
+   fprintf(stderr, "SIGPIPE not expected\n");
+   exit(EXIT_FAILURE);
+   }
+}
+
+static void test_stream_shutwr_client(const struct test_opts *opts)
+{
+   int fd;
+
+   struct sigaction act = {
+   .sa_handler = sigpipe,
+   };
+
+   sigaction(SIGPIPE, &act, NULL);
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   if (shutdown(fd, SHUT_WR)) {
+   perror("shutdown");
+   exit(EXIT_FAILURE);
+   }
+
+   test_stream_check_sigpipe(fd);
+
+   control_writeln("CLIENTDONE");
+
+   close(fd);
+}
+
+static void test_stream_shutwr_server(const struct test_opts *opts)
+{
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   control_expectln("CLIENTDONE");
+
+   close(fd);
+}
+
+static void test_stream_shutrd_client(const struct test_opts *opts)
+{
+   int fd;
+
+   struct sigaction act = {
+   .sa_handler = sigpipe,
+   };
+
+   sigaction(SIGPIPE, &act, NULL);
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   control_expectln("SHUTRDDONE");
+
+   test_stream_check_sigpipe(fd);
+
+   control_writeln("CLIENTDONE");
+
+   close(fd);
+}
+
+static void test_stream_shutrd_server(const struct test_opts *opts)
+{
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   if (shutdown(fd, SHUT_RD)) {
+   perror("shutdown");
+   exit(EXIT_FAILURE);
+   }
+
+   control_writeln("SHUTRDDONE");
+   control_expectln("CLIENTDONE");
+
+   close(fd);
+}
+
static struct test_case test_cases[] = {
{
.name = "SOCK_STREAM connection reset",
@@ -1250,6 +1378,16 @@ static struct test_case test_cases[] = {
.run_client = test_seqpacket_msg_peek_client,
.run_server = test_seqpacket_msg_peek_server,
},
+   {
+   .name = "SOCK_STREAM SHUT_WR",
+   .run_client = test_stream_shutwr_client,
+   .run_server = test_stream_shutwr_server,
+   },
+   {
+   .name = "SOCK_STREAM SHUT_RD",
+   .run_client = test_stream_shutrd_client,
+   .run_server = test_stream_shutrd_server,
+   },
{},
};

--
2.25.1




___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH v2 1/2] vsock: send SIGPIPE on write to shutdowned socket

2023-08-31 Thread Stefano Garzarella

On Sat, Aug 26, 2023 at 08:58:59PM +0300, Arseniy Krasnov wrote:

POSIX requires to send SIGPIPE on write to SOCK_STREAM socket which was
shutdowned with SHUT_WR flag or its peer was shutdowned with SHUT_RD
flag. Also we must not send SIGPIPE if MSG_NOSIGNAL flag is set.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/af_vsock.c | 3 +++
1 file changed, 3 insertions(+)


Reviewed-by: Stefano Garzarella 



diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 020cf17ab7e4..013b65241b65 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1921,6 +1921,9 @@ static int vsock_connectible_sendmsg(struct socket *sock, 
struct msghdr *msg,
err = total_written;
}
out:
+   if (sk->sk_type == SOCK_STREAM)
+   err = sk_stream_error(sk, msg->msg_flags, err);
+
release_sock(sk);
return err;
}
--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH v1 1/2] vsock: send SIGPIPE on write to shutdowned socket

2023-08-22 Thread Stefano Garzarella

On Mon, Aug 14, 2023 at 10:46:05PM +0300, Arseniy Krasnov wrote:



On 04.08.2023 17:28, Stefano Garzarella wrote:

On Fri, Aug 04, 2023 at 03:46:47PM +0300, Arseniy Krasnov wrote:

Hi Stefano,

On 02.08.2023 10:46, Stefano Garzarella wrote:

On Tue, Aug 01, 2023 at 05:17:26PM +0300, Arseniy Krasnov wrote:

POSIX requires to send SIGPIPE on write to SOCK_STREAM socket which was
shutdowned with SHUT_WR flag or its peer was shutdowned with SHUT_RD
flag. Also we must not send SIGPIPE if MSG_NOSIGNAL flag is set.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/af_vsock.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 020cf17ab7e4..013b65241b65 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1921,6 +1921,9 @@ static int vsock_connectible_sendmsg(struct socket *sock, 
struct msghdr *msg,
    err = total_written;
}
out:
+    if (sk->sk_type == SOCK_STREAM)
+    err = sk_stream_error(sk, msg->msg_flags, err);


Do you know why we don't need this for SOCK_SEQPACKET and SOCK_DGRAM?


Yes, here is my explanation:

This function checks that the input error is EPIPE, and if so it sends SIGPIPE to
the 'current' thread
(except case when MSG_NOSIGNAL flag is set). This behaviour is described in 
POSIX:

Page 367 (description of defines from sys/socket.h):
MSG_NOSIGNAL: No SIGPIPE generated when an attempt to send is made on a stream-
oriented socket that is no longer connected.

Page 497 (description of SOCK_STREAM):
A SIGPIPE signal is raised if a thread sends on a broken stream (one that is
no longer connected).


Okay, but I think we should do also for SEQPACKET:

https://pubs.opengroup.org/onlinepubs/009696699/functions/xsh_chap02_10.html

In 2.10.6 Socket Types:

"The SOCK_SEQPACKET socket type is similar to the SOCK_STREAM type, and
is also connection-oriented. The only difference between these types is
that record boundaries ..."

Then in  2.10.14 Signals:

"The SIGPIPE signal shall be sent to a thread that attempts to send data
on a socket that is no longer able to send. In addition, the send
operation fails with the error [EPIPE]."

It's honestly not super clear, but I assume the problem is similar with
seqpacket since it's connection-oriented, or did I miss something?

For example in sctp_sendmsg() IIUC we raise a SIGPIPE regardless of
whether the socket is STREAM or SEQPACKET.


Update about sending SIGPIPE for SOCK_SEQPACKET, I checked POSIX doc and kernel 
sources more deeply:


1)

I checked four types of sockets, which sends SIGPIPE for SOCK_SEQPACKET or not 
('YES' if
this socket sends SIGPIPE in SOCK_SEQPACKET case):

net/kcm/: YES
net/unix/: NO
net/sctp/: YES
net/caif/: NO

Looking at this, I think it is impossible to get the right answer, as there is
some
mess - everyone implements it as they wish.


Eheh, I had the same impression!



2)

I opened POSIX spec again, and here are details about returning EPIPE from pages
for 'send()', 'sendto()', 'sendmsg()':

[EPIPE] The socket is shut down for writing, or the socket is connection-mode 
and is
no longer connected. In the latter case, and if the socket is of type
SOCK_STREAM, the SIGPIPE signal is generated to the calling thread

So my opinion is that we need to send SIGPIPE only for SOCK_STREAM. Another 
question
is how to interpret this from above (but again - SIGPIPE is related for 
SOCK_STREAM
only):

**" and is no longer connected"**

IIUC, if we follow POSIX strictly, this check must be like:

/* socket is shut down for writing or no longer connected. */
if (sk->sk_shutdown & SEND_SHUTDOWN ||
   vsk->peer_shutdown & RCV_SHUTDOWN ||
   sock_flag(SOCK_DONE)) {
err = -EPIPE;
goto out;
}

...

out:
/* Handle -EPIPE for stream socket which is no longer connected. */
if (sk->sk_type == SOCK_STREAM &&
sock_flag(SOCK_DONE))
err = sk_stream_error();



From the other side, we can just follow TCP/AF_UNIX implementations as both are
popular types of socket. In this case I suggest to implement this check like
(e.g. without sock_flag(SOCK_DONE)):


if (sk->sk_shutdown & SEND_SHUTDOWN ||
   vsk->peer_shutdown & RCV_SHUTDOWN) {
err = -EPIPE;
goto out;
}

...

out:
if (sk->sk_type == SOCK_STREAM)
err = sk_stream_error();

What do you think?


I'd follow TCP/AF_UNIX implementations, but it is up to you ;-)

Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH v1 1/2] vsock: send SIGPIPE on write to shutdowned socket

2023-08-22 Thread Stefano Garzarella

On Mon, Aug 14, 2023 at 10:40:17PM +0300, Arseniy Krasnov wrote:



On 04.08.2023 18:02, Stefano Garzarella wrote:

On Fri, Aug 04, 2023 at 05:34:20PM +0300, Arseniy Krasnov wrote:



On 04.08.2023 17:28, Stefano Garzarella wrote:

On Fri, Aug 04, 2023 at 03:46:47PM +0300, Arseniy Krasnov wrote:

Hi Stefano,

On 02.08.2023 10:46, Stefano Garzarella wrote:

On Tue, Aug 01, 2023 at 05:17:26PM +0300, Arseniy Krasnov wrote:

POSIX requires to send SIGPIPE on write to SOCK_STREAM socket which was
shutdowned with SHUT_WR flag or its peer was shutdowned with SHUT_RD
flag. Also we must not send SIGPIPE if MSG_NOSIGNAL flag is set.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/af_vsock.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 020cf17ab7e4..013b65241b65 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1921,6 +1921,9 @@ static int vsock_connectible_sendmsg(struct socket *sock, 
struct msghdr *msg,
    err = total_written;
}
out:
+    if (sk->sk_type == SOCK_STREAM)
+    err = sk_stream_error(sk, msg->msg_flags, err);


Do you know why we don't need this for SOCK_SEQPACKET and SOCK_DGRAM?


Yes, here is my explanation:

This function checks that the input error is EPIPE, and if so it sends SIGPIPE to
the 'current' thread
(except case when MSG_NOSIGNAL flag is set). This behaviour is described in 
POSIX:

Page 367 (description of defines from sys/socket.h):
MSG_NOSIGNAL: No SIGPIPE generated when an attempt to send is made on a stream-
oriented socket that is no longer connected.

Page 497 (description of SOCK_STREAM):
A SIGPIPE signal is raised if a thread sends on a broken stream (one that is
no longer connected).


Okay, but I think we should do also for SEQPACKET:

https://pubs.opengroup.org/onlinepubs/009696699/functions/xsh_chap02_10.html

In 2.10.6 Socket Types:

"The SOCK_SEQPACKET socket type is similar to the SOCK_STREAM type, and
is also connection-oriented. The only difference between these types is
that record boundaries ..."

Then in  2.10.14 Signals:

"The SIGPIPE signal shall be sent to a thread that attempts to send data
on a socket that is no longer able to send. In addition, the send
operation fails with the error [EPIPE]."

It's honestly not super clear, but I assume the problem is similar with
seqpacket since it's connection-oriented, or did I miss something?

For example in sctp_sendmsg() IIUC we raise a SIGPIPE regardless of
whether the socket is STREAM or SEQPACKET.


Hm, yes, you're right. Seems check for socket type is not needed in this case,
as this function is only for connection oriented sockets.


Ack!







Page 1802 (description of 'send()' call):
MSG_NOSIGNAL

Requests not to send the SIGPIPE signal if an attempt to
send is made on a stream-oriented socket that is no
longer connected. The [EPIPE] error shall still be
returned

And the same for 'sendto()' and 'sendmsg()'

Link to the POSIX document:
https://www.open-std.org/jtc1/sc22/open/n4217.pdf

TCP (I think we must rely on it), KCM, SMC sockets (all of them are stream) 
work in the same
way by calling this function. AF_UNIX also works in the same way, but it 
implements SIGPIPE handling
without this function.


I'm okay calling this function.



The only thing that confused me a little bit, that sockets above returns EPIPE 
when
we have only SEND_SHUTDOWN set, but for AF_VSOCK EPIPE is returned for 
RCV_SHUTDOWN
also, but I think it is related to this patchset.


Do you mean that it is NOT related to this patchset?


Yes, **NOT**


Got it, so if you have time when you're back, let's check also that
(not for this series as you mentioned).


^^^
Hello Stefano, so:

there is some confusion with the check for RCV_SHUTDOWN: it is present in AF_UNIX,
but missing
in TCP (it checks only for SEND_SHUTDOWN). I performed a simple test which tries
to send data to peer which already called shutdown(SHUT_RD) - AF_UNIX and TCP 
behave
differently. AF_UNIX sends SIGPIPE, while TCP allows to send data.

I suggest to not touch this check for AF_VSOCK (e.g. continue work as AF_UNIX),
because I don't see strong motivation/argument to remove it.


Yep, I agree!

However, I think it's a fairly borderline case, so unless we have a
specific request, I wouldn't spend too much time on it.

Thanks for looking at it!

Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH v1 1/2] vsock: send SIGPIPE on write to shutdowned socket

2023-08-04 Thread Stefano Garzarella

On Fri, Aug 04, 2023 at 05:34:20PM +0300, Arseniy Krasnov wrote:



On 04.08.2023 17:28, Stefano Garzarella wrote:

On Fri, Aug 04, 2023 at 03:46:47PM +0300, Arseniy Krasnov wrote:

Hi Stefano,

On 02.08.2023 10:46, Stefano Garzarella wrote:

On Tue, Aug 01, 2023 at 05:17:26PM +0300, Arseniy Krasnov wrote:

POSIX requires to send SIGPIPE on write to SOCK_STREAM socket which was
shutdowned with SHUT_WR flag or its peer was shutdowned with SHUT_RD
flag. Also we must not send SIGPIPE if MSG_NOSIGNAL flag is set.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/af_vsock.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 020cf17ab7e4..013b65241b65 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1921,6 +1921,9 @@ static int vsock_connectible_sendmsg(struct socket *sock, 
struct msghdr *msg,
    err = total_written;
}
out:
+    if (sk->sk_type == SOCK_STREAM)
+    err = sk_stream_error(sk, msg->msg_flags, err);


Do you know why we don't need this for SOCK_SEQPACKET and SOCK_DGRAM?


Yes, here is my explanation:

This function checks that the input error is EPIPE, and if so it sends SIGPIPE to
the 'current' thread
(except case when MSG_NOSIGNAL flag is set). This behaviour is described in 
POSIX:

Page 367 (description of defines from sys/socket.h):
MSG_NOSIGNAL: No SIGPIPE generated when an attempt to send is made on a stream-
oriented socket that is no longer connected.

Page 497 (description of SOCK_STREAM):
A SIGPIPE signal is raised if a thread sends on a broken stream (one that is
no longer connected).


Okay, but I think we should do also for SEQPACKET:

https://pubs.opengroup.org/onlinepubs/009696699/functions/xsh_chap02_10.html

In 2.10.6 Socket Types:

"The SOCK_SEQPACKET socket type is similar to the SOCK_STREAM type, and
is also connection-oriented. The only difference between these types is
that record boundaries ..."

Then in  2.10.14 Signals:

"The SIGPIPE signal shall be sent to a thread that attempts to send data
on a socket that is no longer able to send. In addition, the send
operation fails with the error [EPIPE]."

It's honestly not super clear, but I assume the problem is similar with
seqpacket since it's connection-oriented, or did I miss something?

For example in sctp_sendmsg() IIUC we raise a SIGPIPE regardless of
whether the socket is STREAM or SEQPACKET.


Hm, yes, you're right. Seems check for socket type is not needed in this case,
as this function is only for connection oriented sockets.


Ack!







Page 1802 (description of 'send()' call):
MSG_NOSIGNAL

Requests not to send the SIGPIPE signal if an attempt to
send is made on a stream-oriented socket that is no
longer connected. The [EPIPE] error shall still be
returned

And the same for 'sendto()' and 'sendmsg()'

Link to the POSIX document:
https://www.open-std.org/jtc1/sc22/open/n4217.pdf

TCP (I think we must rely on it), KCM, SMC sockets (all of them are stream) 
work in the same
way by calling this function. AF_UNIX also works in the same way, but it 
implements SIGPIPE handling
without this function.


I'm okay calling this function.



The only thing that confused me a little bit, that sockets above returns EPIPE 
when
we have only SEND_SHUTDOWN set, but for AF_VSOCK EPIPE is returned for 
RCV_SHUTDOWN
also, but I think it is related to this patchset.


Do you mean that it is NOT related to this patchset?


Yes, **NOT**


Got it, so if you have time when you're back, let's check also that
(not for this series as you mentioned).

Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH v1 1/2] vsock: send SIGPIPE on write to shutdowned socket

2023-08-04 Thread Stefano Garzarella

On Fri, Aug 04, 2023 at 03:46:47PM +0300, Arseniy Krasnov wrote:

Hi Stefano,

On 02.08.2023 10:46, Stefano Garzarella wrote:

On Tue, Aug 01, 2023 at 05:17:26PM +0300, Arseniy Krasnov wrote:

POSIX requires to send SIGPIPE on write to SOCK_STREAM socket which was
shutdowned with SHUT_WR flag or its peer was shutdowned with SHUT_RD
flag. Also we must not send SIGPIPE if MSG_NOSIGNAL flag is set.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/af_vsock.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 020cf17ab7e4..013b65241b65 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1921,6 +1921,9 @@ static int vsock_connectible_sendmsg(struct socket *sock, 
struct msghdr *msg,
    err = total_written;
}
out:
+    if (sk->sk_type == SOCK_STREAM)
+    err = sk_stream_error(sk, msg->msg_flags, err);


Do you know why we don't need this for SOCK_SEQPACKET and SOCK_DGRAM?


Yes, here is my explanation:

This function checks that the input error is EPIPE, and if so it sends SIGPIPE to
the 'current' thread
(except case when MSG_NOSIGNAL flag is set). This behaviour is described in 
POSIX:

Page 367 (description of defines from sys/socket.h):
MSG_NOSIGNAL: No SIGPIPE generated when an attempt to send is made on a stream-
oriented socket that is no longer connected.

Page 497 (description of SOCK_STREAM):
A SIGPIPE signal is raised if a thread sends on a broken stream (one that is
no longer connected).


Okay, but I think we should do also for SEQPACKET:

https://pubs.opengroup.org/onlinepubs/009696699/functions/xsh_chap02_10.html

In 2.10.6 Socket Types:

"The SOCK_SEQPACKET socket type is similar to the SOCK_STREAM type, and
is also connection-oriented. The only difference between these types is
that record boundaries ..."

Then in  2.10.14 Signals:

"The SIGPIPE signal shall be sent to a thread that attempts to send data
on a socket that is no longer able to send. In addition, the send
operation fails with the error [EPIPE]."

It's honestly not super clear, but I assume the problem is similar with
seqpacket since it's connection-oriented, or did I miss something?

For example in sctp_sendmsg() IIUC we raise a SIGPIPE regardless of
whether the socket is STREAM or SEQPACKET.



Page 1802 (description of 'send()' call):
MSG_NOSIGNAL

Requests not to send the SIGPIPE signal if an attempt to
send is made on a stream-oriented socket that is no
longer connected. The [EPIPE] error shall still be
returned

And the same for 'sendto()' and 'sendmsg()'

Link to the POSIX document:
https://www.open-std.org/jtc1/sc22/open/n4217.pdf

TCP (I think we must rely on it), KCM, SMC sockets (all of them are stream) 
work in the same
way by calling this function. AF_UNIX also works in the same way, but it 
implements SIGPIPE handling
without this function.


I'm okay calling this function.



The only thing that confused me a little bit, that sockets above returns EPIPE 
when
we have only SEND_SHUTDOWN set, but for AF_VSOCK EPIPE is returned for 
RCV_SHUTDOWN
also, but I think it is related to this patchset.


Do you mean that it is NOT related to this patchset?

Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH RFC net-next v5 03/14] af_vsock: support multi-transport datagrams

2023-08-04 Thread Stefano Garzarella

On Thu, Aug 03, 2023 at 06:58:24PM +, Bobby Eshleman wrote:

On Thu, Aug 03, 2023 at 02:42:26PM +0200, Stefano Garzarella wrote:

On Thu, Aug 03, 2023 at 12:53:22AM +, Bobby Eshleman wrote:
> On Wed, Aug 02, 2023 at 10:24:44PM +, Bobby Eshleman wrote:
> > On Sun, Jul 23, 2023 at 12:53:15AM +0300, Arseniy Krasnov wrote:
> > >
> > >
> > > On 19.07.2023 03:50, Bobby Eshleman wrote:
> > > > This patch adds support for multi-transport datagrams.
> > > >
> > > > This includes:
> > > > - Per-packet lookup of transports when using sendto(sockaddr_vm)
> > > > - Selecting H2G or G2H transport using VMADDR_FLAG_TO_HOST and CID in
> > > >   sockaddr_vm
> > > > - rename VSOCK_TRANSPORT_F_DGRAM to VSOCK_TRANSPORT_F_DGRAM_FALLBACK
> > > > - connect() now assigns the transport for (similar to connectible
> > > >   sockets)
> > > >
> > > > To preserve backwards compatibility with VMCI, some important changes
> > > > are made. The "transport_dgram" / VSOCK_TRANSPORT_F_DGRAM is changed to
> > > > be used for dgrams only if there is not yet a g2h or h2g transport that
> > > > has been registered that can transmit the packet. If there is a g2h/h2g
> > > > transport for that remote address, then that transport will be used and
> > > > not "transport_dgram". This essentially makes "transport_dgram" a
> > > > fallback transport for when h2g/g2h has not yet gone online, and so it
> > > > is renamed "transport_dgram_fallback". VMCI implements this transport.
> > > >
> > > > The logic around "transport_dgram" needs to be retained to prevent
> > > > breaking VMCI:
> > > >
> > > > 1) VMCI datagrams existed prior to h2g/g2h and so operate under a
> > > >different paradigm. When the vmci transport comes online, it 
registers
> > > >itself with the DGRAM feature, but not H2G/G2H. Only later when the
> > > >transport has more information about its environment does it register
> > > >H2G or G2H.  In the case that a datagram socket is created after
> > > >VSOCK_TRANSPORT_F_DGRAM registration but before G2H/H2G registration,
> > > >the "transport_dgram" transport is the only registered transport and 
so
> > > >needs to be used.
> > > >
> > > > 2) VMCI seems to require a special message be sent by the transport 
when a
> > > >datagram socket calls bind(). Under the h2g/g2h model, the transport
> > > >is selected using the remote_addr which is set by connect(). At
> > > >bind time there is no remote_addr because often no connect() has been
> > > >called yet: the transport is null. Therefore, with a null transport
> > > >there doesn't seem to be any good way for a datagram socket to tell 
the
> > > >VMCI transport that it has just had bind() called upon it.
> > > >
> > > > With the new fallback logic, after H2G/G2H comes online the socket layer
> > > > will access the VMCI transport via transport_{h2g,g2h}. Prior to H2G/G2H
> > > > coming online, the socket layer will access the VMCI transport via
> > > > "transport_dgram_fallback".
> > > >
> > > > Only transports with a special datagram fallback use-case such as VMCI
> > > > need to register VSOCK_TRANSPORT_F_DGRAM_FALLBACK.
> > > >
> > > > Signed-off-by: Bobby Eshleman 
> > > > ---
> > > >  drivers/vhost/vsock.c   |  1 -
> > > >  include/linux/virtio_vsock.h|  2 --
> > > >  include/net/af_vsock.h  | 10 +++---
> > > >  net/vmw_vsock/af_vsock.c| 64 
++---
> > > >  net/vmw_vsock/hyperv_transport.c|  6 
> > > >  net/vmw_vsock/virtio_transport.c|  1 -
> > > >  net/vmw_vsock/virtio_transport_common.c |  7 
> > > >  net/vmw_vsock/vmci_transport.c  |  2 +-
> > > >  net/vmw_vsock/vsock_loopback.c  |  1 -
> > > >  9 files changed, 58 insertions(+), 36 deletions(-)
> > > >
> > > > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> > > > index ae8891598a48..d5d6a3c3f273 100644
> > > > --- a/drivers/vhost/vsock.c
> > > > +++ b/drivers/vhost/vsock.c
> > > > @@ -410,7 +410,6 @@ static struct virtio_transport vhost_transport = {

Re: [PATCH -next] af_vsock: Remove unused declaration vsock_release_pending()/vsock_init_tap()

2023-08-04 Thread Stefano Garzarella

On Thu, Aug 03, 2023 at 09:33:41PM +0200, Simon Horman wrote:

On Thu, Aug 03, 2023 at 09:45:07PM +0800, Yue Haibing wrote:

Commit d021c344051a ("VSOCK: Introduce VM Sockets") declared but never 
implemented
vsock_release_pending(). Also vsock_init_tap() never implemented since 
introduction
in commit 531b374834c8 ("VSOCK: Add vsockmon tap functions").

Signed-off-by: Yue Haibing 


Hi Yue Haibing,

FWIIW, I think this should be targeted at net-next.


Yep, please send to net-next.

Looks good also to me:

Reviewed-by: Stefano Garzarella 

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH RFC net-next v5 03/14] af_vsock: support multi-transport datagrams

2023-08-03 Thread Stefano Garzarella

On Thu, Aug 03, 2023 at 12:53:22AM +, Bobby Eshleman wrote:

On Wed, Aug 02, 2023 at 10:24:44PM +, Bobby Eshleman wrote:

On Sun, Jul 23, 2023 at 12:53:15AM +0300, Arseniy Krasnov wrote:
>
>
> On 19.07.2023 03:50, Bobby Eshleman wrote:
> > This patch adds support for multi-transport datagrams.
> >
> > This includes:
> > - Per-packet lookup of transports when using sendto(sockaddr_vm)
> > - Selecting H2G or G2H transport using VMADDR_FLAG_TO_HOST and CID in
> >   sockaddr_vm
> > - rename VSOCK_TRANSPORT_F_DGRAM to VSOCK_TRANSPORT_F_DGRAM_FALLBACK
> > - connect() now assigns the transport for (similar to connectible
> >   sockets)
> >
> > To preserve backwards compatibility with VMCI, some important changes
> > are made. The "transport_dgram" / VSOCK_TRANSPORT_F_DGRAM is changed to
> > be used for dgrams only if there is not yet a g2h or h2g transport that
> > has been registered that can transmit the packet. If there is a g2h/h2g
> > transport for that remote address, then that transport will be used and
> > not "transport_dgram". This essentially makes "transport_dgram" a
> > fallback transport for when h2g/g2h has not yet gone online, and so it
> > is renamed "transport_dgram_fallback". VMCI implements this transport.
> >
> > The logic around "transport_dgram" needs to be retained to prevent
> > breaking VMCI:
> >
> > 1) VMCI datagrams existed prior to h2g/g2h and so operate under a
> >different paradigm. When the vmci transport comes online, it registers
> >itself with the DGRAM feature, but not H2G/G2H. Only later when the
> >transport has more information about its environment does it register
> >H2G or G2H.  In the case that a datagram socket is created after
> >VSOCK_TRANSPORT_F_DGRAM registration but before G2H/H2G registration,
> >the "transport_dgram" transport is the only registered transport and so
> >needs to be used.
> >
> > 2) VMCI seems to require a special message be sent by the transport when a
> >datagram socket calls bind(). Under the h2g/g2h model, the transport
> >is selected using the remote_addr which is set by connect(). At
> >bind time there is no remote_addr because often no connect() has been
> >called yet: the transport is null. Therefore, with a null transport
> >there doesn't seem to be any good way for a datagram socket to tell the
> >VMCI transport that it has just had bind() called upon it.
> >
> > With the new fallback logic, after H2G/G2H comes online the socket layer
> > will access the VMCI transport via transport_{h2g,g2h}. Prior to H2G/G2H
> > coming online, the socket layer will access the VMCI transport via
> > "transport_dgram_fallback".
> >
> > Only transports with a special datagram fallback use-case such as VMCI
> > need to register VSOCK_TRANSPORT_F_DGRAM_FALLBACK.
> >
> > Signed-off-by: Bobby Eshleman 
> > ---
> >  drivers/vhost/vsock.c   |  1 -
> >  include/linux/virtio_vsock.h|  2 --
> >  include/net/af_vsock.h  | 10 +++---
> >  net/vmw_vsock/af_vsock.c| 64 
++---
> >  net/vmw_vsock/hyperv_transport.c|  6 
> >  net/vmw_vsock/virtio_transport.c|  1 -
> >  net/vmw_vsock/virtio_transport_common.c |  7 
> >  net/vmw_vsock/vmci_transport.c  |  2 +-
> >  net/vmw_vsock/vsock_loopback.c  |  1 -
> >  9 files changed, 58 insertions(+), 36 deletions(-)
> >
> > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> > index ae8891598a48..d5d6a3c3f273 100644
> > --- a/drivers/vhost/vsock.c
> > +++ b/drivers/vhost/vsock.c
> > @@ -410,7 +410,6 @@ static struct virtio_transport vhost_transport = {
> >   .cancel_pkt   = vhost_transport_cancel_pkt,
> >
> >   .dgram_enqueue= virtio_transport_dgram_enqueue,
> > - .dgram_bind   = virtio_transport_dgram_bind,
> >   .dgram_allow  = virtio_transport_dgram_allow,
> >
> >   .stream_enqueue   = virtio_transport_stream_enqueue,
> > diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> > index 18cbe8d37fca..7632552bee58 100644
> > --- a/include/linux/virtio_vsock.h
> > +++ b/include/linux/virtio_vsock.h
> > @@ -211,8 +211,6 @@ void virtio_transport_notify_buffer_size(struct 
vsock_sock *vsk, u64 *val);
> >  u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk);
> >  bool virtio_transport_stream_is_active(struct vsock_sock *vsk);
> >  bool virtio_transport_stream_allow(u32 cid, u32 port);
> > -int virtio_transport_dgram_bind(struct vsock_sock *vsk,
> > - struct sockaddr_vm *addr);
> >  bool virtio_transport_dgram_allow(u32 cid, u32 port);
> >
> >  int virtio_transport_connect(struct vsock_sock *vsk);
> > diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
> > index 305d57502e89..f6a0ca9d7c3e 100644
> > --- a/include/net/af_vsock.h
> > +++ b/include/net/af_vsock.h
> > 

[PATCH net] test/vsock: remove vsock_perf executable on `make clean`

2023-08-03 Thread Stefano Garzarella
We forgot to add vsock_perf to the rm command in the `clean`
target, so now we have a left over after `make clean` in
tools/testing/vsock.

Fixes: 8abbffd27ced ("test/vsock: vsock_perf utility")
Cc: avkras...@sberdevices.ru
Signed-off-by: Stefano Garzarella 
---
 tools/testing/vsock/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index 43a254f0e14d..21a98ba565ab 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -8,5 +8,5 @@ vsock_perf: vsock_perf.o
 CFLAGS += -g -O2 -Werror -Wall -I. -I../../include -I../../../usr/include 
-Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD 
-U_FORTIFY_SOURCE -D_GNU_SOURCE
 .PHONY: all test clean
 clean:
-   ${RM} *.o *.d vsock_test vsock_diag_test
+   ${RM} *.o *.d vsock_test vsock_diag_test vsock_perf
 -include *.d
-- 
2.41.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH v1 2/2] test/vsock: shutdowned socket test

2023-08-02 Thread Stefano Garzarella

On Tue, Aug 01, 2023 at 05:17:27PM +0300, Arseniy Krasnov wrote:

This adds two tests for 'shutdown()' call. It checks that SIGPIPE is
sent when MSG_NOSIGNAL is not set and vice versa. Both flags SHUT_WR
and SHUT_RD are tested.

Signed-off-by: Arseniy Krasnov 
---
tools/testing/vsock/vsock_test.c | 138 +++
1 file changed, 138 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 90718c2fd4ea..21d40a8d881c 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -19,6 +19,7 @@
#include 
#include 
#include 
+#include <signal.h>

#include "timeout.h"
#include "control.h"
@@ -1170,6 +1171,133 @@ static void test_seqpacket_msg_peek_server(const struct 
test_opts *opts)
return test_msg_peek_server(opts, true);
}

+static bool have_sigpipe;

 ^
We should define it as `volatile sig_atomic_t`:

the behavior is undefined if the signal handler refers to any object
[CX] [Option Start]  other than errno [Option End]  with static storage
duration other than by assigning a value to an object declared as
volatile sig_atomic_t

https://pubs.opengroup.org/onlinepubs/9699919799/functions/signal.html

The rest LGTM!

Thanks,
Stefano


+
+static void sigpipe(int signo)
+{
+   have_sigpipe = true;
+}
+
+static void test_stream_check_sigpipe(int fd)
+{
+   ssize_t res;
+
+   have_sigpipe = false;
+
+   res = send(fd, "A", 1, 0);
+   if (res != -1) {
+   fprintf(stderr, "expected send(2) failure, got %zi\n", res);
+   exit(EXIT_FAILURE);
+   }
+
+   if (!have_sigpipe) {
+   fprintf(stderr, "SIGPIPE expected\n");
+   exit(EXIT_FAILURE);
+   }
+
+   have_sigpipe = false;
+
+   res = send(fd, "A", 1, MSG_NOSIGNAL);
+   if (res != -1) {
+   fprintf(stderr, "expected send(2) failure, got %zi\n", res);
+   exit(EXIT_FAILURE);
+   }
+
+   if (have_sigpipe) {
+   fprintf(stderr, "SIGPIPE not expected\n");
+   exit(EXIT_FAILURE);
+   }
+}
+
+static void test_stream_shutwr_client(const struct test_opts *opts)
+{
+   int fd;
+
+   struct sigaction act = {
+   .sa_handler = sigpipe,
+   };
+
+   sigaction(SIGPIPE, &act, NULL);
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   if (shutdown(fd, SHUT_WR)) {
+   perror("shutdown");
+   exit(EXIT_FAILURE);
+   }
+
+   test_stream_check_sigpipe(fd);
+
+   control_writeln("CLIENTDONE");
+
+   close(fd);
+}
+
+static void test_stream_shutwr_server(const struct test_opts *opts)
+{
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   control_expectln("CLIENTDONE");
+
+   close(fd);
+}
+
+static void test_stream_shutrd_client(const struct test_opts *opts)
+{
+   int fd;
+
+   struct sigaction act = {
+   .sa_handler = sigpipe,
+   };
+
+   sigaction(SIGPIPE, &act, NULL);
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   control_expectln("SHUTRDDONE");
+
+   test_stream_check_sigpipe(fd);
+
+   control_writeln("CLIENTDONE");
+
+   close(fd);
+}
+
+static void test_stream_shutrd_server(const struct test_opts *opts)
+{
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   if (shutdown(fd, SHUT_RD)) {
+   perror("shutdown");
+   exit(EXIT_FAILURE);
+   }
+
+   control_writeln("SHUTRDDONE");
+   control_expectln("CLIENTDONE");
+
+   close(fd);
+}
+
static struct test_case test_cases[] = {
{
.name = "SOCK_STREAM connection reset",
@@ -1250,6 +1378,16 @@ static struct test_case test_cases[] = {
.run_client = test_seqpacket_msg_peek_client,
.run_server = test_seqpacket_msg_peek_server,
},
+   {
+   .name = "SOCK_STREAM SHUT_WR",
+   .run_client = test_stream_shutwr_client,
+   .run_server = test_stream_shutwr_server,
+   },
+   {
+   .name = "SOCK_STREAM SHUT_RD",
+   .run_client = test_stream_shutrd_client,
+   .run_server = test_stream_shutrd_server,
+   },
{},
};

--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH v1 1/2] vsock: send SIGPIPE on write to shutdowned socket

2023-08-02 Thread Stefano Garzarella

On Tue, Aug 01, 2023 at 05:17:26PM +0300, Arseniy Krasnov wrote:

POSIX requires to send SIGPIPE on write to SOCK_STREAM socket which was
shutdowned with SHUT_WR flag or its peer was shutdowned with SHUT_RD
flag. Also we must not send SIGPIPE if MSG_NOSIGNAL flag is set.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/af_vsock.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 020cf17ab7e4..013b65241b65 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1921,6 +1921,9 @@ static int vsock_connectible_sendmsg(struct socket *sock, 
struct msghdr *msg,
err = total_written;
}
out:
+   if (sk->sk_type == SOCK_STREAM)
+   err = sk_stream_error(sk, msg->msg_flags, err);


Do you know why we don't need this for SOCK_SEQPACKET and SOCK_DGRAM?

Thanks,
Stefano


+
release_sock(sk);
return err;
}
--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH RFC net-next v5 10/14] virtio/vsock: add VIRTIO_VSOCK_F_DGRAM feature bit

2023-08-01 Thread Stefano Garzarella

On Tue, Aug 01, 2023 at 04:30:22AM +, Bobby Eshleman wrote:

On Thu, Jul 27, 2023 at 09:48:21AM +0200, Stefano Garzarella wrote:

On Wed, Jul 26, 2023 at 02:38:08PM -0400, Michael S. Tsirkin wrote:
> On Wed, Jul 19, 2023 at 12:50:14AM +, Bobby Eshleman wrote:
> > This commit adds a feature bit for virtio vsock to support datagrams.
> >
> > Signed-off-by: Jiang Wang 
> > Signed-off-by: Bobby Eshleman 
> > ---
> >  include/uapi/linux/virtio_vsock.h | 1 +
> >  1 file changed, 1 insertion(+)
> >
> > diff --git a/include/uapi/linux/virtio_vsock.h 
b/include/uapi/linux/virtio_vsock.h
> > index 331be28b1d30..27b4b2b8bf13 100644
> > --- a/include/uapi/linux/virtio_vsock.h
> > +++ b/include/uapi/linux/virtio_vsock.h
> > @@ -40,6 +40,7 @@
> >
> >  /* The feature bitmap for virtio vsock */
> >  #define VIRTIO_VSOCK_F_SEQPACKET 1   /* SOCK_SEQPACKET supported */
> > +#define VIRTIO_VSOCK_F_DGRAM 3   /* SOCK_DGRAM supported */
> >
> >  struct virtio_vsock_config {
> >   __le64 guest_cid;
>
> pls do not add interface without first getting it accepted in the
> virtio spec.

Yep, fortunately this series is still RFC.
I think by now we've seen that the implementation is doable, so we
should discuss the changes to the specification ASAP. Then we can
merge the series.

@Bobby can you start the discussion about spec changes?



No problem at all. Am I right to assume that a new patch to the spec is
the standard starting point for discussion?


Yep, I think so!

Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next] vsock: Remove unused function declarations

2023-07-31 Thread Stefano Garzarella

On Sat, Jul 29, 2023 at 08:20:36PM +0800, Yue Haibing wrote:

These are never implemented since introduction in
commit d021c344051a ("VSOCK: Introduce VM Sockets")

Signed-off-by: Yue Haibing 
---
net/vmw_vsock/vmci_transport.h | 3 ---
1 file changed, 3 deletions(-)


Good catch ;-)

I'd use "vsock/vmci:" as a prefix in the title.

With or without:

Reviewed-by: Stefano Garzarella 



diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h
index b7b072194282..dbda3ababa14 100644
--- a/net/vmw_vsock/vmci_transport.h
+++ b/net/vmw_vsock/vmci_transport.h
@@ -116,9 +116,6 @@ struct vmci_transport {
spinlock_t lock; /* protects sk. */
};

-int vmci_transport_register(void);
-void vmci_transport_unregister(void);
-
int vmci_transport_send_wrote_bh(struct sockaddr_vm *dst,
 struct sockaddr_vm *src);
int vmci_transport_send_read_bh(struct sockaddr_vm *dst,
--
2.34.1




___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v3 4/4] vsock/virtio: MSG_ZEROCOPY flag support

2023-07-27 Thread Stefano Garzarella

On Thu, Jul 27, 2023 at 11:32:00AM +0300, Arseniy Krasnov wrote:

On 25.07.2023 15:28, Stefano Garzarella wrote:

On Tue, Jul 25, 2023 at 12:16:11PM +0300, Arseniy Krasnov wrote:

On 25.07.2023 11:46, Arseniy Krasnov wrote:

On 25.07.2023 11:43, Stefano Garzarella wrote:

On Fri, Jul 21, 2023 at 08:09:03AM +0300, Arseniy Krasnov wrote:


[...]


+    t = vsock_core_get_transport(info->vsk);

-    if (msg_data_left(info->msg) == 0 &&
-    info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
-    hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
+    if (t->msgzerocopy_check_iov &&
+    !t->msgzerocopy_check_iov(iov_iter))
+    return false;


I'd avoid adding a new transport callback used only internally in virtio
transports.


Ok, I see.



Usually the transport callbacks are used in af_vsock.c, if we need a
callback just for virtio transports, maybe better to add it in struct
virtio_vsock_pkt_info or struct virtio_vsock_sock.


Hm, maybe I just need to move this callback from 'struct vsock_transport' to 
parent 'struct virtio_transport',
after 'send_pkt' callback. In this case:
1) AF_VSOCK part is not touched.
2) This callback stays in 'virtio_transport.c' and is set also in this file.
  vhost and loopback are unchanged - only 'send_pkt' still enabled in both
  files for these two transports.


Yep, this could also work!

Stefano


Great! I'll send this implementation once this patchset for MSG_PEEK is 
merged
to net-next, as the two conflict with each other.

https://lore.kernel.org/netdev/20230726060150-mutt-send-email-...@kernel.org/T/#m56f3b850361a412735616145162d2d9df25f6350


Ack!

Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH RFC net-next v5 10/14] virtio/vsock: add VIRTIO_VSOCK_F_DGRAM feature bit

2023-07-27 Thread Stefano Garzarella

On Wed, Jul 26, 2023 at 02:38:08PM -0400, Michael S. Tsirkin wrote:

On Wed, Jul 19, 2023 at 12:50:14AM +, Bobby Eshleman wrote:

This commit adds a feature bit for virtio vsock to support datagrams.

Signed-off-by: Jiang Wang 
Signed-off-by: Bobby Eshleman 
---
 include/uapi/linux/virtio_vsock.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/virtio_vsock.h 
b/include/uapi/linux/virtio_vsock.h
index 331be28b1d30..27b4b2b8bf13 100644
--- a/include/uapi/linux/virtio_vsock.h
+++ b/include/uapi/linux/virtio_vsock.h
@@ -40,6 +40,7 @@

 /* The feature bitmap for virtio vsock */
 #define VIRTIO_VSOCK_F_SEQPACKET   1   /* SOCK_SEQPACKET supported */
+#define VIRTIO_VSOCK_F_DGRAM   3   /* SOCK_DGRAM supported */

 struct virtio_vsock_config {
__le64 guest_cid;


pls do not add interface without first getting it accepted in the
virtio spec.


Yep, fortunately this series is still RFC.
I think by now we've seen that the implementation is doable, so we
should discuss the changes to the specification ASAP. Then we can
merge the series.

@Bobby can you start the discussion about spec changes?

Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v3 4/4] vsock/test: MSG_PEEK test for SOCK_SEQPACKET

2023-07-26 Thread Stefano Garzarella

On Tue, Jul 25, 2023 at 08:29:12PM +0300, Arseniy Krasnov wrote:

This adds MSG_PEEK test for SOCK_SEQPACKET. It works in the same way as
SOCK_STREAM test, except it also tests MSG_TRUNC flag.

Signed-off-by: Arseniy Krasnov 
---
tools/testing/vsock/vsock_test.c | 58 +---
1 file changed, 54 insertions(+), 4 deletions(-)


Reviewed-by: Stefano Garzarella 

Thanks,
Stefano



diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 444a3ff0681f..90718c2fd4ea 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -257,14 +257,19 @@ static void test_stream_multiconn_server(const struct 
test_opts *opts)

#define MSG_PEEK_BUF_LEN 64

-static void test_stream_msg_peek_client(const struct test_opts *opts)
+static void test_msg_peek_client(const struct test_opts *opts,
+bool seqpacket)
{
unsigned char buf[MSG_PEEK_BUF_LEN];
ssize_t send_size;
int fd;
int i;

-   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (seqpacket)
+   fd = vsock_seqpacket_connect(opts->peer_cid, 1234);
+   else
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+
if (fd < 0) {
perror("connect");
exit(EXIT_FAILURE);
@@ -290,7 +295,8 @@ static void test_stream_msg_peek_client(const struct 
test_opts *opts)
close(fd);
}

-static void test_stream_msg_peek_server(const struct test_opts *opts)
+static void test_msg_peek_server(const struct test_opts *opts,
+bool seqpacket)
{
unsigned char buf_half[MSG_PEEK_BUF_LEN / 2];
unsigned char buf_normal[MSG_PEEK_BUF_LEN];
@@ -298,7 +304,11 @@ static void test_stream_msg_peek_server(const struct 
test_opts *opts)
ssize_t res;
int fd;

-   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (seqpacket)
+   fd = vsock_seqpacket_accept(VMADDR_CID_ANY, 1234, NULL);
+   else
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+
if (fd < 0) {
perror("accept");
exit(EXIT_FAILURE);
@@ -340,6 +350,21 @@ static void test_stream_msg_peek_server(const struct 
test_opts *opts)
exit(EXIT_FAILURE);
}

+   if (seqpacket) {
+   /* This type of socket supports MSG_TRUNC flag,
+* so check it with MSG_PEEK. We must get length
+* of the message.
+*/
+   res = recv(fd, buf_half, sizeof(buf_half), MSG_PEEK |
+  MSG_TRUNC);
+   if (res != sizeof(buf_peek)) {
+   fprintf(stderr,
+   "recv(2) + MSG_PEEK | MSG_TRUNC, exp %zu, got 
%zi\n",
+   sizeof(buf_half), res);
+   exit(EXIT_FAILURE);
+   }
+   }
+
res = recv(fd, buf_normal, sizeof(buf_normal), 0);
if (res != sizeof(buf_normal)) {
fprintf(stderr, "recv(2), expected %zu, got %zi\n",
@@ -356,6 +381,16 @@ static void test_stream_msg_peek_server(const struct 
test_opts *opts)
close(fd);
}

+static void test_stream_msg_peek_client(const struct test_opts *opts)
+{
+   return test_msg_peek_client(opts, false);
+}
+
+static void test_stream_msg_peek_server(const struct test_opts *opts)
+{
+   return test_msg_peek_server(opts, false);
+}
+
#define SOCK_BUF_SIZE (2 * 1024 * 1024)
#define MAX_MSG_SIZE (32 * 1024)

@@ -1125,6 +1160,16 @@ static void test_stream_virtio_skb_merge_server(const 
struct test_opts *opts)
close(fd);
}

+static void test_seqpacket_msg_peek_client(const struct test_opts *opts)
+{
+   return test_msg_peek_client(opts, true);
+}
+
+static void test_seqpacket_msg_peek_server(const struct test_opts *opts)
+{
+   return test_msg_peek_server(opts, true);
+}
+
static struct test_case test_cases[] = {
{
.name = "SOCK_STREAM connection reset",
@@ -1200,6 +1245,11 @@ static struct test_case test_cases[] = {
.run_client = test_stream_virtio_skb_merge_client,
.run_server = test_stream_virtio_skb_merge_server,
},
+   {
+   .name = "SOCK_SEQPACKET MSG_PEEK",
+   .run_client = test_seqpacket_msg_peek_client,
+   .run_server = test_seqpacket_msg_peek_server,
+   },
{},
};

--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH v2 4/4] vsock/test: MSG_PEEK test for SOCK_SEQPACKET

2023-07-25 Thread Stefano Garzarella

On Wed, Jul 19, 2023 at 10:27:08PM +0300, Arseniy Krasnov wrote:

This adds MSG_PEEK test for SOCK_SEQPACKET. It works in the same way as
SOCK_STREAM test, except it also tests MSG_TRUNC flag.

Signed-off-by: Arseniy Krasnov 
---
tools/testing/vsock/vsock_test.c | 58 +---
1 file changed, 54 insertions(+), 4 deletions(-)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 444a3ff0681f..2ca2cbfa9808 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -257,14 +257,19 @@ static void test_stream_multiconn_server(const struct 
test_opts *opts)

#define MSG_PEEK_BUF_LEN 64

-static void test_stream_msg_peek_client(const struct test_opts *opts)
+static void __test_msg_peek_client(const struct test_opts *opts,


Let's stay with just test_msg_peek_client(), WDYT?


+  bool seqpacket)
{
unsigned char buf[MSG_PEEK_BUF_LEN];
ssize_t send_size;
int fd;
int i;

-   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (seqpacket)
+   fd = vsock_seqpacket_connect(opts->peer_cid, 1234);
+   else
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+
if (fd < 0) {
perror("connect");
exit(EXIT_FAILURE);
@@ -290,7 +295,8 @@ static void test_stream_msg_peek_client(const struct 
test_opts *opts)
close(fd);
}

-static void test_stream_msg_peek_server(const struct test_opts *opts)
+static void __test_msg_peek_server(const struct test_opts *opts,


Same here.

The rest LGTM!

Also the whole series should be ready for net-next, right?

Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH v2 3/4] vsock/test: rework MSG_PEEK test for SOCK_STREAM

2023-07-25 Thread Stefano Garzarella

On Wed, Jul 19, 2023 at 10:27:07PM +0300, Arseniy Krasnov wrote:

This new version makes test more complicated by adding empty read,
partial read and data comparisons between MSG_PEEK and normal reads.

Signed-off-by: Arseniy Krasnov 
---
tools/testing/vsock/vsock_test.c | 78 ++--
1 file changed, 75 insertions(+), 3 deletions(-)


Reviewed-by: Stefano Garzarella 



diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index ac1bd3ac1533..444a3ff0681f 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -255,9 +255,14 @@ static void test_stream_multiconn_server(const struct 
test_opts *opts)
close(fds[i]);
}

+#define MSG_PEEK_BUF_LEN 64
+
static void test_stream_msg_peek_client(const struct test_opts *opts)
{
+   unsigned char buf[MSG_PEEK_BUF_LEN];
+   ssize_t send_size;
int fd;
+   int i;

fd = vsock_stream_connect(opts->peer_cid, 1234);
if (fd < 0) {
@@ -265,12 +270,32 @@ static void test_stream_msg_peek_client(const struct 
test_opts *opts)
exit(EXIT_FAILURE);
}

-   send_byte(fd, 1, 0);
+   for (i = 0; i < sizeof(buf); i++)
+   buf[i] = rand() & 0xFF;
+
+   control_expectln("SRVREADY");
+
+   send_size = send(fd, buf, sizeof(buf), 0);
+
+   if (send_size < 0) {
+   perror("send");
+   exit(EXIT_FAILURE);
+   }
+
+   if (send_size != sizeof(buf)) {
+   fprintf(stderr, "Invalid send size %zi\n", send_size);
+   exit(EXIT_FAILURE);
+   }
+
close(fd);
}

static void test_stream_msg_peek_server(const struct test_opts *opts)
{
+   unsigned char buf_half[MSG_PEEK_BUF_LEN / 2];
+   unsigned char buf_normal[MSG_PEEK_BUF_LEN];
+   unsigned char buf_peek[MSG_PEEK_BUF_LEN];
+   ssize_t res;
int fd;

fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
@@ -279,8 +304,55 @@ static void test_stream_msg_peek_server(const struct 
test_opts *opts)
exit(EXIT_FAILURE);
}

-   recv_byte(fd, 1, MSG_PEEK);
-   recv_byte(fd, 1, 0);
+   /* Peek from empty socket. */
+   res = recv(fd, buf_peek, sizeof(buf_peek), MSG_PEEK | MSG_DONTWAIT);
+   if (res != -1) {
+   fprintf(stderr, "expected recv(2) failure, got %zi\n", res);
+   exit(EXIT_FAILURE);
+   }
+
+   if (errno != EAGAIN) {
+   perror("EAGAIN expected");
+   exit(EXIT_FAILURE);
+   }
+
+   control_writeln("SRVREADY");
+
+   /* Peek part of data. */
+   res = recv(fd, buf_half, sizeof(buf_half), MSG_PEEK);
+   if (res != sizeof(buf_half)) {
+   fprintf(stderr, "recv(2) + MSG_PEEK, expected %zu, got %zi\n",
+   sizeof(buf_half), res);
+   exit(EXIT_FAILURE);
+   }
+
+   /* Peek whole data. */
+   res = recv(fd, buf_peek, sizeof(buf_peek), MSG_PEEK);
+   if (res != sizeof(buf_peek)) {
+   fprintf(stderr, "recv(2) + MSG_PEEK, expected %zu, got %zi\n",
+   sizeof(buf_peek), res);
+   exit(EXIT_FAILURE);
+   }
+
+   /* Compare partial and full peek. */
+   if (memcmp(buf_half, buf_peek, sizeof(buf_half))) {
+   fprintf(stderr, "Partial peek data mismatch\n");
+   exit(EXIT_FAILURE);
+   }
+
+   res = recv(fd, buf_normal, sizeof(buf_normal), 0);
+   if (res != sizeof(buf_normal)) {
+   fprintf(stderr, "recv(2), expected %zu, got %zi\n",
+   sizeof(buf_normal), res);
+   exit(EXIT_FAILURE);
+   }
+
+   /* Compare full peek and normal read. */
+   if (memcmp(buf_peek, buf_normal, sizeof(buf_peek))) {
+   fprintf(stderr, "Full peek data mismatch\n");
+   exit(EXIT_FAILURE);
+   }
+
close(fd);
}

--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH v2 2/4] virtio/vsock: support MSG_PEEK for SOCK_SEQPACKET

2023-07-25 Thread Stefano Garzarella

On Wed, Jul 19, 2023 at 10:27:06PM +0300, Arseniy Krasnov wrote:

This adds support of MSG_PEEK flag for SOCK_SEQPACKET type of socket.
Difference with SOCK_STREAM is that this callback returns either length
of the message or error.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/virtio_transport_common.c | 63 +++--
1 file changed, 60 insertions(+), 3 deletions(-)


Reviewed-by: Stefano Garzarella 



diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 2ee40574c339..352d042b130b 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -460,6 +460,63 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
return err;
}

+static ssize_t
+virtio_transport_seqpacket_do_peek(struct vsock_sock *vsk,
+  struct msghdr *msg)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   struct sk_buff *skb;
+   size_t total, len;
+
+   spin_lock_bh(&vvs->rx_lock);
+
+   if (!vvs->msg_count) {
+   spin_unlock_bh(&vvs->rx_lock);
+   return 0;
+   }
+
+   total = 0;
+   len = msg_data_left(msg);
+
+   skb_queue_walk(&vvs->rx_queue, skb) {
+   struct virtio_vsock_hdr *hdr;
+
+   if (total < len) {
+   size_t bytes;
+   int err;
+
+   bytes = len - total;
+   if (bytes > skb->len)
+   bytes = skb->len;
+
+   spin_unlock_bh(&vvs->rx_lock);
+
+   /* sk_lock is held by caller so no one else can dequeue.
+* Unlock rx_lock since memcpy_to_msg() may sleep.
+*/
+   err = memcpy_to_msg(msg, skb->data, bytes);
+   if (err)
+   return err;
+
+   spin_lock_bh(&vvs->rx_lock);
+   }
+
+   total += skb->len;
+   hdr = virtio_vsock_hdr(skb);
+
+   if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
+   if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR)
+   msg->msg_flags |= MSG_EOR;
+
+   break;
+   }
+   }
+
+   spin_unlock_bh(&vvs->rx_lock);
+
+   return total;
+}
+
static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
 struct msghdr *msg,
 int flags)
@@ -554,9 +611,9 @@ virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
   int flags)
{
if (flags & MSG_PEEK)
-   return -EOPNOTSUPP;
-
-   return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags);
+   return virtio_transport_seqpacket_do_peek(vsk, msg);
+   else
+   return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags);
}
EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue);

--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH v2 1/4] virtio/vsock: rework MSG_PEEK for SOCK_STREAM

2023-07-25 Thread Stefano Garzarella

On Wed, Jul 19, 2023 at 10:27:05PM +0300, Arseniy Krasnov wrote:

This reworks current implementation of MSG_PEEK logic:
1) Replaces 'skb_queue_walk_safe()' with 'skb_queue_walk()'. There is
  no need in the first one, as there are no removes of skb in loop.
2) Removes nested while loop - MSG_PEEK logic could be implemented
  without it: just iterate over the skbs without removing them and copy
  data from each until the destination buffer is full.

Signed-off-by: Arseniy Krasnov 
Reviewed-by: Bobby Eshleman 
---
net/vmw_vsock/virtio_transport_common.c | 41 -
1 file changed, 19 insertions(+), 22 deletions(-)


Reviewed-by: Stefano Garzarella 



diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index b769fc258931..2ee40574c339 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -348,37 +348,34 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk,
size_t len)
{
struct virtio_vsock_sock *vvs = vsk->trans;
-   size_t bytes, total = 0, off;
-   struct sk_buff *skb, *tmp;
-   int err = -EFAULT;
+   struct sk_buff *skb;
+   size_t total = 0;
+   int err;

spin_lock_bh(&vvs->rx_lock);

-   skb_queue_walk_safe(&vvs->rx_queue, skb,  tmp) {
-   off = 0;
+   skb_queue_walk(&vvs->rx_queue, skb) {
+   size_t bytes;

-   if (total == len)
-   break;
+   bytes = len - total;
+   if (bytes > skb->len)
+   bytes = skb->len;

-   while (total < len && off < skb->len) {
-   bytes = len - total;
-   if (bytes > skb->len - off)
-   bytes = skb->len - off;
+   spin_unlock_bh(&vvs->rx_lock);

-   /* sk_lock is held by caller so no one else can dequeue.
-* Unlock rx_lock since memcpy_to_msg() may sleep.
-*/
-   spin_unlock_bh(&vvs->rx_lock);
+   /* sk_lock is held by caller so no one else can dequeue.
+* Unlock rx_lock since memcpy_to_msg() may sleep.
+*/
+   err = memcpy_to_msg(msg, skb->data, bytes);
+   if (err)
+   goto out;

-   err = memcpy_to_msg(msg, skb->data + off, bytes);
-   if (err)
-   goto out;
+   total += bytes;

-   spin_lock_bh(&vvs->rx_lock);
+   spin_lock_bh(&vvs->rx_lock);

-   total += bytes;
-   off += bytes;
-   }
+   if (total == len)
+   break;
}

spin_unlock_bh(&vvs->rx_lock);
--
2.25.1



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v3 4/4] vsock/virtio: MSG_ZEROCOPY flag support

2023-07-25 Thread Stefano Garzarella

On Tue, Jul 25, 2023 at 09:06:02AM -0400, Michael S. Tsirkin wrote:

On Tue, Jul 25, 2023 at 02:53:39PM +0200, Stefano Garzarella wrote:

On Tue, Jul 25, 2023 at 07:50:53AM -0400, Michael S. Tsirkin wrote:
> On Fri, Jul 21, 2023 at 08:09:03AM +0300, Arseniy Krasnov wrote:
> >
> >
> > On 21.07.2023 00:42, Arseniy Krasnov wrote:
> > > This adds handling of MSG_ZEROCOPY flag on transmission path: if this
> > > flag is set and zerocopy transmission is possible (enabled in socket
> > > options and transport allows zerocopy), then non-linear skb will be
> > > created and filled with the pages of user's buffer. Pages of user's
> > > buffer are locked in memory by 'get_user_pages()'. Second thing that
> > > this patch does is replace type of skb owning: instead of calling
> > > 'skb_set_owner_sk_safe()' it calls 'skb_set_owner_w()'. Reason of this
> > > change is that '__zerocopy_sg_from_iter()' increments 'sk_wmem_alloc'
> > > of socket, so to decrease this field correctly proper skb destructor is
> > > needed: 'sock_wfree()'. This destructor is set by 'skb_set_owner_w()'.
> > >
> > > Signed-off-by: Arseniy Krasnov 
> > > ---
> > >  Changelog:
> > >  v5(big patchset) -> v1:
> > >   * Refactorings of 'if' conditions.
> > >   * Remove extra blank line.
> > >   * Remove 'frag_off' field unneeded init.
> > >   * Add function 'virtio_transport_fill_skb()' which fills both linear
> > > and non-linear skb with provided data.
> > >  v1 -> v2:
> > >   * Use original order of last four arguments in 
'virtio_transport_alloc_skb()'.
> > >  v2 -> v3:
> > >   * Add new transport callback: 'msgzerocopy_check_iov'. It checks that
> > > provided 'iov_iter' with data could be sent in a zerocopy mode.
> > > If this callback is not set in transport - transport allows to send
> > > any 'iov_iter' in zerocopy mode. Otherwise - if callback returns 
'true'
> > > then zerocopy is allowed. Reason of this callback is that in case of
> > > G2H transmission we insert whole skb to the tx virtio queue and such
> > > skb must fit to the size of the virtio queue to be sent in a single
> > > iteration (may be tx logic in 'virtio_transport.c' could be reworked
> > > as in vhost to support partial send of current skb). This callback
> > > will be enabled only for G2H path. For details pls see comment
> > > 'Check that tx queue...' below.
> > >
> > >  include/net/af_vsock.h  |   3 +
> > >  net/vmw_vsock/virtio_transport.c|  39 
> > >  net/vmw_vsock/virtio_transport_common.c | 257 ++--
> > >  3 files changed, 241 insertions(+), 58 deletions(-)
> > >
> > > diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
> > > index 0e7504a42925..a6b346eeeb8e 100644
> > > --- a/include/net/af_vsock.h
> > > +++ b/include/net/af_vsock.h
> > > @@ -177,6 +177,9 @@ struct vsock_transport {
> > >
> > >  /* Read a single skb */
> > >  int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
> > > +
> > > +/* Zero-copy. */
> > > +bool (*msgzerocopy_check_iov)(const struct iov_iter *);
> > >  };
> > >
> > >  /**** CORE ****/
> > > diff --git a/net/vmw_vsock/virtio_transport.c 
b/net/vmw_vsock/virtio_transport.c
> > > index 7bbcc8093e51..23cb8ed638c4 100644
> > > --- a/net/vmw_vsock/virtio_transport.c
> > > +++ b/net/vmw_vsock/virtio_transport.c
> > > @@ -442,6 +442,43 @@ static void virtio_vsock_rx_done(struct virtqueue 
*vq)
> > >  queue_work(virtio_vsock_workqueue, &vsock->rx_work);
> > >  }
> > >
> > > +static bool virtio_transport_msgzerocopy_check_iov(const struct iov_iter 
*iov)
> > > +{
> > > +struct virtio_vsock *vsock;
> > > +bool res = false;
> > > +
> > > +rcu_read_lock();
> > > +
> > > +vsock = rcu_dereference(the_virtio_vsock);
> > > +if (vsock) {
> > > +struct virtqueue *vq;
> > > +int iov_pages;
> > > +
> > > +vq = vsock->vqs[VSOCK_VQ_TX];
> > > +
> > > +iov_pages = round_up(iov->count, PAGE_SIZE) / PAGE_SIZE;
> > > +
> > > +/* Check that tx queue is large enough to keep whole
> > > + * data to send. This is needed, because when there is
> >

Re: [PATCH net-next v3 4/4] vsock/virtio: MSG_ZEROCOPY flag support

2023-07-25 Thread Stefano Garzarella

On Tue, Jul 25, 2023 at 07:50:53AM -0400, Michael S. Tsirkin wrote:

On Fri, Jul 21, 2023 at 08:09:03AM +0300, Arseniy Krasnov wrote:



On 21.07.2023 00:42, Arseniy Krasnov wrote:
> This adds handling of MSG_ZEROCOPY flag on transmission path: if this
> flag is set and zerocopy transmission is possible (enabled in socket
> options and transport allows zerocopy), then non-linear skb will be
> created and filled with the pages of user's buffer. Pages of user's
> buffer are locked in memory by 'get_user_pages()'. Second thing that
> this patch does is replace type of skb owning: instead of calling
> 'skb_set_owner_sk_safe()' it calls 'skb_set_owner_w()'. Reason of this
> change is that '__zerocopy_sg_from_iter()' increments 'sk_wmem_alloc'
> of socket, so to decrease this field correctly proper skb destructor is
> needed: 'sock_wfree()'. This destructor is set by 'skb_set_owner_w()'.
>
> Signed-off-by: Arseniy Krasnov 
> ---
>  Changelog:
>  v5(big patchset) -> v1:
>   * Refactorings of 'if' conditions.
>   * Remove extra blank line.
>   * Remove 'frag_off' field unneeded init.
>   * Add function 'virtio_transport_fill_skb()' which fills both linear
> and non-linear skb with provided data.
>  v1 -> v2:
>   * Use original order of last four arguments in 
'virtio_transport_alloc_skb()'.
>  v2 -> v3:
>   * Add new transport callback: 'msgzerocopy_check_iov'. It checks that
> provided 'iov_iter' with data could be sent in a zerocopy mode.
> If this callback is not set in transport - transport allows to send
> any 'iov_iter' in zerocopy mode. Otherwise - if callback returns 'true'
> then zerocopy is allowed. Reason of this callback is that in case of
> G2H transmission we insert whole skb to the tx virtio queue and such
> skb must fit to the size of the virtio queue to be sent in a single
> iteration (may be tx logic in 'virtio_transport.c' could be reworked
> as in vhost to support partial send of current skb). This callback
> will be enabled only for G2H path. For details pls see comment
> 'Check that tx queue...' below.
>
>  include/net/af_vsock.h  |   3 +
>  net/vmw_vsock/virtio_transport.c|  39 
>  net/vmw_vsock/virtio_transport_common.c | 257 ++--
>  3 files changed, 241 insertions(+), 58 deletions(-)
>
> diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
> index 0e7504a42925..a6b346eeeb8e 100644
> --- a/include/net/af_vsock.h
> +++ b/include/net/af_vsock.h
> @@ -177,6 +177,9 @@ struct vsock_transport {
>
>/* Read a single skb */
>int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
> +
> +  /* Zero-copy. */
> +  bool (*msgzerocopy_check_iov)(const struct iov_iter *);
>  };
>
>  /**** CORE ****/
> diff --git a/net/vmw_vsock/virtio_transport.c 
b/net/vmw_vsock/virtio_transport.c
> index 7bbcc8093e51..23cb8ed638c4 100644
> --- a/net/vmw_vsock/virtio_transport.c
> +++ b/net/vmw_vsock/virtio_transport.c
> @@ -442,6 +442,43 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
>queue_work(virtio_vsock_workqueue, &vsock->rx_work);
>  }
>
> +static bool virtio_transport_msgzerocopy_check_iov(const struct iov_iter 
*iov)
> +{
> +  struct virtio_vsock *vsock;
> +  bool res = false;
> +
> +  rcu_read_lock();
> +
> +  vsock = rcu_dereference(the_virtio_vsock);
> +  if (vsock) {
> +  struct virtqueue *vq;
> +  int iov_pages;
> +
> +  vq = vsock->vqs[VSOCK_VQ_TX];
> +
> +  iov_pages = round_up(iov->count, PAGE_SIZE) / PAGE_SIZE;
> +
> +  /* Check that tx queue is large enough to keep whole
> +   * data to send. This is needed, because when there is
> +   * not enough free space in the queue, current skb to
> +   * send will be reinserted to the head of tx list of
> +   * the socket to retry transmission later, so if skb
> +   * is bigger than whole queue, it will be reinserted
> +   * again and again, thus blocking other skbs to be sent.
> +   * Each page of the user provided buffer will be added
> +   * as a single buffer to the tx virtqueue, so compare
> +   * number of pages against maximum capacity of the queue.
> +   * +1 means buffer for the packet header.
> +   */
> +  if (iov_pages + 1 <= vq->num_max)

I think this check is relevant only for the case where we don't have the
indirect buffer feature.
With indirect mode, all the data to send will be packed into one indirect buffer.

Thanks, Arseniy


Actually the reverse. With indirect you are limited to num_max.
Without you are limited to whatever space is left in the
queue (which you did not check here, so you should).



> +  res = true;
> +  }
> +
> +  rcu_read_unlock();


Just curious:
is the point of all this RCU dance to allow vsock
to change from under us? then why is it ok to
have it change? the virtio_transport_msgzerocopy_check_iov
will then refer to the old vsock ...


IIRC we introduced the RCU to 

Re: [PATCH net-next v3 4/4] vsock/virtio: MSG_ZEROCOPY flag support

2023-07-25 Thread Stefano Garzarella

On Tue, Jul 25, 2023 at 08:39:17AM -0400, Michael S. Tsirkin wrote:

On Tue, Jul 25, 2023 at 02:28:02PM +0200, Stefano Garzarella wrote:

On Tue, Jul 25, 2023 at 12:16:11PM +0300, Arseniy Krasnov wrote:
>
>
> On 25.07.2023 11:46, Arseniy Krasnov wrote:
> >
> >
> > On 25.07.2023 11:43, Stefano Garzarella wrote:
> > > On Fri, Jul 21, 2023 at 08:09:03AM +0300, Arseniy Krasnov wrote:
> > > >
> > > >
> > > > On 21.07.2023 00:42, Arseniy Krasnov wrote:
> > > > > This adds handling of MSG_ZEROCOPY flag on transmission path: if this
> > > > > flag is set and zerocopy transmission is possible (enabled in socket
> > > > > options and transport allows zerocopy), then non-linear skb will be
> > > > > created and filled with the pages of user's buffer. Pages of user's
> > > > > buffer are locked in memory by 'get_user_pages()'. Second thing that
> > > > > this patch does is replace type of skb owning: instead of calling
> > > > > 'skb_set_owner_sk_safe()' it calls 'skb_set_owner_w()'. Reason of this
> > > > > change is that '__zerocopy_sg_from_iter()' increments 'sk_wmem_alloc'
> > > > > of socket, so to decrease this field correctly proper skb destructor 
is
> > > > > needed: 'sock_wfree()'. This destructor is set by 'skb_set_owner_w()'.
> > > > >
> > > > > Signed-off-by: Arseniy Krasnov 
> > > > > ---
> > > > >  Changelog:
> > > > >  v5(big patchset) -> v1:
> > > > >   * Refactorings of 'if' conditions.
> > > > >   * Remove extra blank line.
> > > > >   * Remove 'frag_off' field unneeded init.
> > > > >   * Add function 'virtio_transport_fill_skb()' which fills both linear
> > > > >     and non-linear skb with provided data.
> > > > >  v1 -> v2:
> > > > >   * Use original order of last four arguments in 
'virtio_transport_alloc_skb()'.
> > > > >  v2 -> v3:
> > > > >   * Add new transport callback: 'msgzerocopy_check_iov'. It checks 
that
> > > > >     provided 'iov_iter' with data could be sent in a zerocopy mode.
> > > > >     If this callback is not set in transport - transport allows to 
send
> > > > >     any 'iov_iter' in zerocopy mode. Otherwise - if callback returns 
'true'
> > > > >     then zerocopy is allowed. Reason of this callback is that in case 
of
> > > > >     G2H transmission we insert whole skb to the tx virtio queue and 
such
> > > > >     skb must fit to the size of the virtio queue to be sent in a 
single
> > > > >     iteration (may be tx logic in 'virtio_transport.c' could be 
reworked
> > > > >     as in vhost to support partial send of current skb). This callback
> > > > >     will be enabled only for G2H path. For details pls see comment
> > > > >     'Check that tx queue...' below.
> > > > >
> > > > >  include/net/af_vsock.h  |   3 +
> > > > >  net/vmw_vsock/virtio_transport.c    |  39 
> > > > >  net/vmw_vsock/virtio_transport_common.c | 257 
++--
> > > > >  3 files changed, 241 insertions(+), 58 deletions(-)
> > > > >
> > > > > diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
> > > > > index 0e7504a42925..a6b346eeeb8e 100644
> > > > > --- a/include/net/af_vsock.h
> > > > > +++ b/include/net/af_vsock.h
> > > > > @@ -177,6 +177,9 @@ struct vsock_transport {
> > > > >
> > > > >  /* Read a single skb */
> > > > >  int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
> > > > > +
> > > > > +    /* Zero-copy. */
> > > > > +    bool (*msgzerocopy_check_iov)(const struct iov_iter *);
> > > > >  };
> > > > >
> > > > >  /**** CORE ****/
> > > > > diff --git a/net/vmw_vsock/virtio_transport.c 
b/net/vmw_vsock/virtio_transport.c
> > > > > index 7bbcc8093e51..23cb8ed638c4 100644
> > > > > --- a/net/vmw_vsock/virtio_transport.c
> > > > > +++ b/net/vmw_vsock/virtio_transport.c
> > > > > @@ -442,6 +442,43 @@ static void virtio_vsock_rx_done(struct 
virtqueue *vq)
> > > > >  queue_work(virtio_vsock_workqueue, &vsock->rx_work);
> > > > >  }
> > > > >
> > > > > +static bool
> > > > > virtio_transport

Re: [PATCH net-next v3 4/4] vsock/virtio: MSG_ZEROCOPY flag support

2023-07-25 Thread Stefano Garzarella

On Tue, Jul 25, 2023 at 12:16:11PM +0300, Arseniy Krasnov wrote:



On 25.07.2023 11:46, Arseniy Krasnov wrote:



On 25.07.2023 11:43, Stefano Garzarella wrote:

On Fri, Jul 21, 2023 at 08:09:03AM +0300, Arseniy Krasnov wrote:



On 21.07.2023 00:42, Arseniy Krasnov wrote:

This adds handling of MSG_ZEROCOPY flag on transmission path: if this
flag is set and zerocopy transmission is possible (enabled in socket
options and transport allows zerocopy), then non-linear skb will be
created and filled with the pages of user's buffer. Pages of user's
buffer are locked in memory by 'get_user_pages()'. Second thing that
this patch does is replace type of skb owning: instead of calling
'skb_set_owner_sk_safe()' it calls 'skb_set_owner_w()'. Reason of this
change is that '__zerocopy_sg_from_iter()' increments 'sk_wmem_alloc'
of socket, so to decrease this field correctly proper skb destructor is
needed: 'sock_wfree()'. This destructor is set by 'skb_set_owner_w()'.

Signed-off-by: Arseniy Krasnov 
---
 Changelog:
 v5(big patchset) -> v1:
  * Refactorings of 'if' conditions.
  * Remove extra blank line.
  * Remove 'frag_off' field unneeded init.
  * Add function 'virtio_transport_fill_skb()' which fills both linear
    and non-linear skb with provided data.
 v1 -> v2:
  * Use original order of last four arguments in 'virtio_transport_alloc_skb()'.
 v2 -> v3:
  * Add new transport callback: 'msgzerocopy_check_iov'. It checks that
    provided 'iov_iter' with data could be sent in a zerocopy mode.
    If this callback is not set in transport - transport allows to send
    any 'iov_iter' in zerocopy mode. Otherwise - if callback returns 'true'
    then zerocopy is allowed. Reason of this callback is that in case of
    G2H transmission we insert whole skb to the tx virtio queue and such
    skb must fit to the size of the virtio queue to be sent in a single
    iteration (may be tx logic in 'virtio_transport.c' could be reworked
    as in vhost to support partial send of current skb). This callback
    will be enabled only for G2H path. For details pls see comment
    'Check that tx queue...' below.

 include/net/af_vsock.h  |   3 +
 net/vmw_vsock/virtio_transport.c    |  39 
 net/vmw_vsock/virtio_transport_common.c | 257 ++--
 3 files changed, 241 insertions(+), 58 deletions(-)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 0e7504a42925..a6b346eeeb8e 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -177,6 +177,9 @@ struct vsock_transport {

 /* Read a single skb */
 int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
+
+    /* Zero-copy. */
+    bool (*msgzerocopy_check_iov)(const struct iov_iter *);
 };

 /**** CORE ****/
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 7bbcc8093e51..23cb8ed638c4 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -442,6 +442,43 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
 queue_work(virtio_vsock_workqueue, &vsock->rx_work);
 }

+static bool virtio_transport_msgzerocopy_check_iov(const struct 
iov_iter *iov)

+{
+    struct virtio_vsock *vsock;
+    bool res = false;
+
+    rcu_read_lock();
+
+    vsock = rcu_dereference(the_virtio_vsock);
+    if (vsock) {


Just noted, what about the following to reduce the indentation?

if (!vsock) {
goto out;
}
...
...
out:
rcu_read_unlock();
return res;


+    struct virtqueue *vq;
+    int iov_pages;
+
+    vq = vsock->vqs[VSOCK_VQ_TX];
+
+    iov_pages = round_up(iov->count, PAGE_SIZE) / PAGE_SIZE;
+
+    /* Check that tx queue is large enough to keep whole
+ * data to send. This is needed, because when there is
+ * not enough free space in the queue, current skb to
+ * send will be reinserted to the head of tx list of
+ * the socket to retry transmission later, so if skb
+ * is bigger than whole queue, it will be reinserted
+ * again and again, thus blocking other skbs to be sent.
+ * Each page of the user provided buffer will be added
+ * as a single buffer to the tx virtqueue, so compare
+ * number of pages against maximum capacity of the queue.
+ * +1 means buffer for the packet header.
+ */
+    if (iov_pages + 1 <= vq->num_max)


I think this check is relevant only for the case where we don't have the
indirect buffer feature.
With indirect mode, all the data to send will be packed into one indirect buffer.


I think so.
So, should we check also that here?



Thanks, Arseniy


+    res = true;
+    }
+
+    rcu_read_unlock();
+
+    return res;
+}
+
 static bool virtio_transport_seqpacket_allow(u32 remote_cid);

 static struct virtio_transport virtio_transport = {
@@ -475,6 +512,8 @@ static struct virtio_transport virtio_transport = {
 

Re: [PATCH net-next v3 4/4] vsock/virtio: MSG_ZEROCOPY flag support

2023-07-25 Thread Stefano Garzarella

On Fri, Jul 21, 2023 at 08:09:03AM +0300, Arseniy Krasnov wrote:



On 21.07.2023 00:42, Arseniy Krasnov wrote:

This adds handling of MSG_ZEROCOPY flag on transmission path: if this
flag is set and zerocopy transmission is possible (enabled in socket
options and transport allows zerocopy), then non-linear skb will be
created and filled with the pages of user's buffer. Pages of user's
buffer are locked in memory by 'get_user_pages()'. Second thing that
this patch does is replace type of skb owning: instead of calling
'skb_set_owner_sk_safe()' it calls 'skb_set_owner_w()'. Reason of this
change is that '__zerocopy_sg_from_iter()' increments 'sk_wmem_alloc'
of socket, so to decrease this field correctly proper skb destructor is
needed: 'sock_wfree()'. This destructor is set by 'skb_set_owner_w()'.

Signed-off-by: Arseniy Krasnov 
---
 Changelog:
 v5(big patchset) -> v1:
  * Refactorings of 'if' conditions.
  * Remove extra blank line.
  * Remove 'frag_off' field unneeded init.
  * Add function 'virtio_transport_fill_skb()' which fills both linear
and non-linear skb with provided data.
 v1 -> v2:
  * Use original order of last four arguments in 'virtio_transport_alloc_skb()'.
 v2 -> v3:
  * Add new transport callback: 'msgzerocopy_check_iov'. It checks that
provided 'iov_iter' with data could be sent in a zerocopy mode.
If this callback is not set in transport - transport allows to send
any 'iov_iter' in zerocopy mode. Otherwise - if callback returns 'true'
then zerocopy is allowed. Reason of this callback is that in case of
G2H transmission we insert whole skb to the tx virtio queue and such
skb must fit to the size of the virtio queue to be sent in a single
iteration (may be tx logic in 'virtio_transport.c' could be reworked
as in vhost to support partial send of current skb). This callback
will be enabled only for G2H path. For details pls see comment
'Check that tx queue...' below.

 include/net/af_vsock.h  |   3 +
 net/vmw_vsock/virtio_transport.c|  39 
 net/vmw_vsock/virtio_transport_common.c | 257 ++--
 3 files changed, 241 insertions(+), 58 deletions(-)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 0e7504a42925..a6b346eeeb8e 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -177,6 +177,9 @@ struct vsock_transport {

/* Read a single skb */
int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
+
+   /* Zero-copy. */
+   bool (*msgzerocopy_check_iov)(const struct iov_iter *);
 };

 /**** CORE ****/
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 7bbcc8093e51..23cb8ed638c4 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -442,6 +442,43 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
queue_work(virtio_vsock_workqueue, &vsock->rx_work);
 }

+static bool virtio_transport_msgzerocopy_check_iov(const struct iov_iter *iov)
+{
+   struct virtio_vsock *vsock;
+   bool res = false;
+
+   rcu_read_lock();
+
+   vsock = rcu_dereference(the_virtio_vsock);
+   if (vsock) {
+   struct virtqueue *vq;
+   int iov_pages;
+
+   vq = vsock->vqs[VSOCK_VQ_TX];
+
+   iov_pages = round_up(iov->count, PAGE_SIZE) / PAGE_SIZE;
+
+   /* Check that tx queue is large enough to keep whole
+* data to send. This is needed, because when there is
+* not enough free space in the queue, current skb to
+* send will be reinserted to the head of tx list of
+* the socket to retry transmission later, so if skb
+* is bigger than whole queue, it will be reinserted
+* again and again, thus blocking other skbs to be sent.
+* Each page of the user provided buffer will be added
+* as a single buffer to the tx virtqueue, so compare
+* number of pages against maximum capacity of the queue.
+* +1 means buffer for the packet header.
+*/
+   if (iov_pages + 1 <= vq->num_max)


I think this check is relevant only for the case where we don't have the
indirect buffer feature.
With indirect mode, all the data to send will be packed into one indirect buffer.


I think so.
So, should we check also that here?



Thanks, Arseniy


+   res = true;
+   }
+
+   rcu_read_unlock();
+
+   return res;
+}
+
 static bool virtio_transport_seqpacket_allow(u32 remote_cid);

 static struct virtio_transport virtio_transport = {
@@ -475,6 +512,8 @@ static struct virtio_transport virtio_transport = {
.seqpacket_allow  = virtio_transport_seqpacket_allow,
.seqpacket_has_data   = virtio_transport_seqpacket_has_data,

+   .msgzerocopy_check_iov= 
virtio_transport_msgzerocopy_check_iov,
+

  1   2   3   4   5   6   7   8   9   10   >