[Qemu-devel] [PATCH] vhost-user: verify that number of queues is less than MAX_QUEUE_NUM

2016-02-24 Thread Ilya Maximets
Fix a QEMU crash when -netdev vhost-user,queues=n is passed with a
number of queues greater than MAX_QUEUE_NUM.

Signed-off-by: Ilya Maximets 
---
 net/vhost-user.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/vhost-user.c b/net/vhost-user.c
index 451dbbf..b753b3d 100644
--- a/net/vhost-user.c
+++ b/net/vhost-user.c
@@ -317,9 +317,10 @@ int net_init_vhost_user(const NetClientOptions *opts, 
const char *name,
 }
 
 queues = vhost_user_opts->has_queues ? vhost_user_opts->queues : 1;
-if (queues < 1) {
+if (queues < 1 || queues > MAX_QUEUE_NUM) {
 error_setg(errp,
-   "vhost-user number of queues must be bigger than zero");
+   "vhost-user number of queues must be in range [1, %d]",
+   MAX_QUEUE_NUM);
 return -1;
 }
 
-- 
2.5.0




[Qemu-devel] [PATCH 3/4] vhost: check for vhost_net device validity.

2016-03-30 Thread Ilya Maximets
After successful destruction of a vhost-user device (e.g. after
unbinding the virtio driver following a vhost-user socket disconnect)
QEMU will crash with a segmentation fault on the next attempt to bind
the virtio driver:

[ cut ---]
# After vhost-user socket disconnection.
[guest]# ip link show eth0
2: eth0:  mtu 1500 qdisc <...>
link/ether 00:16:35:af:aa:4b brd ff:ff:ff:ff:ff:ff

[guest]# echo -n ':00:01.0' > /sys/bus/pci/drivers/virtio-pci/unbind
qemu: Failed to read msg header. Read 0 instead of 12. Original request 11.
qemu: Failed to read msg header. Read 0 instead of 12. Original request 11.

[guest]# echo -n ':00:01.0' > /sys/bus/pci/drivers/virtio-pci/bind

Child terminated with signal = 0xb (SIGSEGV)
GDBserver exiting
[ cut ---]
[host]# gdb
Program received signal SIGSEGV, Segmentation fault.
vhost_set_vring_enable (<...>) at /hw/net/vhost_net.c:425
425 if (vhost_ops->vhost_set_vring_enable) {
(gdb) bt
#0  vhost_set_vring_enable (nc=0xfff8a0, enable=enable@entry=1)
net = 0x0 # NULL pointer to vhost_net device !
vhost_ops = <(Cannot access memory at address 0x110)>
res = 0
#1  peer_attach
#2  virtio_net_set_queues
#3  virtio_net_set_multiqueue
#4  virtio_net_set_features
#5  virtio_set_features_nocheck
#6  memory_region_write_accessor
#7  access_with_adjusted_size
#8  memory_region_dispatch_write
#9  address_space_write_continue
#10 address_space_write
<...>
[ cut ---]

This happens because of an invalid vhost_net device pointer.
Fix that by checking the pointer in all functions before using it.

Result:
[ cut ---]
[guest]# echo -n ':00:01.0' > /sys/bus/pci/drivers/virtio-pci/bind

# Check link in guest. No crashes here, link in DOWN state.
[guest]# ip link show eth0
7: eth0:  mtu 1500 qdisc <...>
link/ether 00:16:35:af:aa:4b brd ff:ff:ff:ff:ff:ff
[---- cut ---]

Signed-off-by: Ilya Maximets 
---
 hw/net/vhost_net.c | 18 +-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 0996e5d..4c3363f 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -202,6 +202,10 @@ static int vhost_net_start_one(struct vhost_net *net,
 struct vhost_vring_file file = { };
 int r;
 
+if (!net) {
+return -1;
+}
+
 net->dev.nvqs = 2;
 net->dev.vqs = net->vqs;
 
@@ -256,6 +260,10 @@ static void vhost_net_stop_one(struct vhost_net *net,
 {
 struct vhost_vring_file file = { .fd = -1 };
 
+if (!net) {
+return;
+}
+
 if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP) {
 for (file.index = 0; file.index < net->dev.nvqs; ++file.index) {
 const VhostOps *vhost_ops = net->dev.vhost_ops;
@@ -287,6 +295,9 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs,
 struct vhost_net *net;
 
 net = get_vhost_net(ncs[i].peer);
+if (!net) {
+return -1;
+}
 vhost_net_set_vq_index(net, i * 2);
 
 /* Suppress the masking guest notifiers on vhost user
@@ -419,9 +430,14 @@ void put_vhost_net(NetClientState *nc)
 int vhost_set_vring_enable(NetClientState *nc, int enable)
 {
 VHostNetState *net = get_vhost_net(nc);
-const VhostOps *vhost_ops = net->dev.vhost_ops;
+const VhostOps *vhost_ops;
 int res = 0;
 
+if (!net) {
+return 0;
+}
+
+vhost_ops = net->dev.vhost_ops;
 if (vhost_ops->vhost_set_vring_enable) {
 res = vhost_ops->vhost_set_vring_enable(&net->dev, enable);
 }
-- 
2.5.0




[Qemu-devel] [PATCH 4/4] net: notify about link status only if it changed.

2016-03-30 Thread Ilya Maximets
No need to notify nc->peer if nothing changed.

Signed-off-by: Ilya Maximets 
---
 net/net.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/net.c b/net/net.c
index 3b5a142..6f6a8ce 100644
--- a/net/net.c
+++ b/net/net.c
@@ -1385,9 +1385,10 @@ void qmp_set_link(const char *name, bool up, Error 
**errp)
 for (i = 0; i < queues; i++) {
 ncs[i]->peer->link_down = !up;
 }
-}
-if (nc->peer->info->link_status_changed) {
-nc->peer->info->link_status_changed(nc->peer);
+
+if (nc->peer->info->link_status_changed) {
+nc->peer->info->link_status_changed(nc->peer);
+}
 }
 }
 }
-- 
2.5.0




[Qemu-devel] [PATCH 1/4] vhost-user: fix crash on socket disconnect.

2016-03-30 Thread Ilya Maximets
After a vhost-user socket disconnect (e.g. a host application crash)
QEMU will crash with a segmentation fault while the virtio driver is
being unbound inside the guest.

This happens because vhost_net_cleanup() is called while stopping
virtqueue #0, triggered by the arriving CHR_EVENT_CLOSED event.
Stopping virtqueue #1 then crashes because the device memory has
already been cleaned up and freed:

[ cut ---]
[guest]# echo -n ':00:01.0' > /sys/bus/pci/drivers/virtio-pci/unbind
qemu: Failed to read msg header. Read 0 instead of 12. Original request 11.
qemu: Failed to read msg header. Read 0 instead of 12. Original request 11.
qemu: Failed to read msg header. Read 0 instead of 12. Original request 11.

Child terminated with signal = 0xb (SIGSEGV)
GDBserver exiting
[ cut ---]
[host]# gdb
Breakpoint 1 vhost_user_cleanup(dev) at hw/virtio/vhost-user.c
(gdb) bt
#0  vhost_user_cleanup # executes 'dev->opaque = 0;'
#1  vhost_dev_cleanup
#2  vhost_net_cleanup  # After return from #1: g_free(net)
#3  vhost_user_stop
#4  net_vhost_user_event (<...>, event=5) /* CHR_EVENT_CLOSED */
#5  qemu_chr_be_event (<...>, event=event@entry=5)
#6  tcp_chr_disconnect
#7  tcp_chr_sync_read
#8  qemu_chr_fe_read_all
#9  vhost_user_read
#10 vhost_user_get_vring_base
#11 vhost_virtqueue_stop (vq=0xffe190, idx=0)
#12 vhost_dev_stop
#13 vhost_net_stop_one
#14 vhost_net_stop
#15 virtio_net_vhost_status
#16 virtio_net_set_status
#17 virtio_set_status
<...>
(gdb) c
Continuing.

Program received signal SIGSEGV, Segmentation fault.
qemu_chr_fe_write_all (s=s@entry=0x0, <...>) at qemu-char.c:302
302 if (s->replay && replay_mode == REPLAY_MODE_PLAY) {
(gdb) bt
#0  qemu_chr_fe_write_all (s=s@entry=0x0, <...>)
#1  vhost_user_write
#2  vhost_user_get_vring_base
#3  vhost_virtqueue_stop (vq=0xffe190, idx=1) # device already freed here
#4  vhost_dev_stop
#5  vhost_net_stop_one
#6  vhost_net_stop
#7  virtio_net_vhost_status
#8  virtio_net_set_status
#9  virtio_set_status
<...>
[ cut ---]

Fix that by introducing a reference counter for the vhost_net device
and freeing its memory only after the last reference has been dropped.

Signed-off-by: Ilya Maximets 
---
 hw/net/vhost_net.c   | 22 +++---
 hw/net/virtio-net.c  | 32 
 include/net/vhost-user.h |  1 +
 include/net/vhost_net.h  |  1 +
 net/filter.c |  1 +
 net/vhost-user.c | 43 ++-
 6 files changed, 80 insertions(+), 20 deletions(-)

diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 6e1032f..55d5456 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -296,6 +296,7 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs,
 if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_VHOST_USER) {
 dev->use_guest_notifier_mask = false;
 }
+put_vhost_net(ncs[i].peer);
  }
 
 r = k->set_guest_notifiers(qbus->parent, total_queues * 2, true);
@@ -306,7 +307,7 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs,
 
 for (i = 0; i < total_queues; i++) {
 r = vhost_net_start_one(get_vhost_net(ncs[i].peer), dev);
-
+put_vhost_net(ncs[i].peer);
 if (r < 0) {
 goto err_start;
 }
@@ -317,6 +318,7 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs,
 err_start:
 while (--i >= 0) {
 vhost_net_stop_one(get_vhost_net(ncs[i].peer), dev);
+put_vhost_net(ncs[i].peer);
 }
 e = k->set_guest_notifiers(qbus->parent, total_queues * 2, false);
 if (e < 0) {
@@ -337,6 +339,7 @@ void vhost_net_stop(VirtIODevice *dev, NetClientState *ncs,
 
 for (i = 0; i < total_queues; i++) {
 vhost_net_stop_one(get_vhost_net(ncs[i].peer), dev);
+put_vhost_net(ncs[i].peer);
 }
 
 r = k->set_guest_notifiers(qbus->parent, total_queues * 2, false);
@@ -398,16 +401,25 @@ VHostNetState *get_vhost_net(NetClientState *nc)
 return vhost_net;
 }
 
+void put_vhost_net(NetClientState *nc)
+{
+if (nc && nc->info->type == NET_CLIENT_OPTIONS_KIND_VHOST_USER) {
+vhost_user_put_vhost_net(nc);
+}
+}
+
 int vhost_set_vring_enable(NetClientState *nc, int enable)
 {
 VHostNetState *net = get_vhost_net(nc);
 const VhostOps *vhost_ops = net->dev.vhost_ops;
+int res = 0;
 
 if (vhost_ops->vhost_set_vring_enable) {
-return vhost_ops->vhost_set_vring_enable(&net->dev, enable);
+res = vhost_ops->vhost_set_vring_enable(&net->dev, enable);
 }
 
-return 0;
+put_vhost_net(nc);
+return res;
 }
 
 #else
@@ -466,6 +478,10 @@ VHostNetState *get_vhost_net(NetClientState *nc)
 return 0;
 }
 
+void put
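
The net/vhost-user.c hunk is truncated above.  A minimal sketch of the
reference-counting scheme the commit message describes could look like
the following; the refcnt/pending_cleanup fields and the exact cleanup
placement are illustrative assumptions, not the actual patch contents:

/* Sketch only -- not the truncated hunk itself. */
typedef struct VhostUserState {
    NetClientState nc;
    CharDriverState *chr;
    VHostNetState *vhost_net;
    int refcnt;            /* references handed out via get_vhost_net() */
    bool pending_cleanup;  /* socket closed while references were held  */
} VhostUserState;

VHostNetState *vhost_user_get_vhost_net(NetClientState *nc)
{
    VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc);

    s->refcnt++;
    return s->vhost_net;
}

void vhost_user_put_vhost_net(NetClientState *nc)
{
    VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc);

    if (--s->refcnt == 0 && s->pending_cleanup) {
        /* The socket is already gone; it is now safe to free the device. */
        vhost_net_cleanup(s->vhost_net);
        g_free(s->vhost_net);
        s->vhost_net = NULL;
    }
}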

[Qemu-devel] [PATCH 2/4] vhost: prevent double stop of vhost_net device.

2016-03-30 Thread Ilya Maximets
There are a few control flows in vhost's implementation where
vhost_net_stop() may be re-entered several times in a row.  This leads
to an assertion failure while the same resources are freed twice.

For example, if a vhost-user socket disconnection occurred:
[ cut ---]
[guest]# echo -n ':00:01.0' > /sys/bus/pci/drivers/virtio-pci/unbind
qemu: Failed to read msg header. Read 0 instead of 12. Original request 11.
qemu: Failed to read msg header. Read 0 instead of 12. Original request 11.
qemu: Failed to read msg header. Read 0 instead of 12. Original request 11.
qemu: Failed to read msg header. Read 0 instead of 12. Original request 11.
qemu: /qemu/memory.c:1852: memory_region_del_eventfd:
  Assertion `i != mr->ioeventfd_nb' failed.

Child terminated with signal = 0x6 (SIGABRT)
GDBserver exiting
[ cut ---]
[host]# gdb
Breakpoint 1 in virtio_pci_set_host_notifier_internal()
#0  virtio_pci_set_host_notifier_internal
#1  vhost_dev_disable_notifiers
#2  vhost_net_stop_one
#3  vhost_net_stop (dev=dev@entry=0x13a45f8, ncs=0x1ce89f0, <...>)
#4  virtio_net_vhost_status
#5  virtio_net_set_status
#6  qmp_set_link (<...>, up=up@entry=false, <...>)
#7  net_vhost_user_event (<...>, event=5)
#8  qemu_chr_be_event
#9  tcp_chr_disconnect
#10 tcp_chr_sync_read
#11 qemu_chr_fe_read_all
#12 vhost_user_read
#13 vhost_user_get_vring_base
#14 vhost_virtqueue_stop
#15 vhost_dev_stop
#16 vhost_net_stop_one
#17 vhost_net_stop (dev=dev@entry=0x13a45f8, ncs=0x1ce89f0, <...>)
#18 virtio_net_vhost_status
#19 virtio_net_set_status
#20 virtio_set_status
<...>
[ cut ---]

In the example above the assertion fails when control returns to the
function at frame #17, which then tries to free the 'eventfd' that was
already freed at frame #3.

Fix that by disallowing execution of vhost_net_stop() if we are
already inside it.

Signed-off-by: Ilya Maximets 
---
 hw/net/vhost_net.c | 8 
 hw/net/virtio-net.c| 1 +
 include/hw/virtio/virtio-net.h | 1 +
 3 files changed, 10 insertions(+)

diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 55d5456..0996e5d 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -335,8 +335,14 @@ void vhost_net_stop(VirtIODevice *dev, NetClientState *ncs,
 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(dev)));
 VirtioBusState *vbus = VIRTIO_BUS(qbus);
 VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
+VirtIONet *n = VIRTIO_NET(dev);
 int i, r;
 
+if (n->vhost_stopping_in_progress) {
+return;
+}
+n->vhost_stopping_in_progress = 1;
+
 for (i = 0; i < total_queues; i++) {
 vhost_net_stop_one(get_vhost_net(ncs[i].peer), dev);
 put_vhost_net(ncs[i].peer);
@@ -348,6 +354,8 @@ void vhost_net_stop(VirtIODevice *dev, NetClientState *ncs,
 fflush(stderr);
 }
 assert(r >= 0);
+
+n->vhost_stopping_in_progress = 0;
 }
 
 void vhost_net_cleanup(struct vhost_net *net)
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index e5456ef..9b1539c 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -149,6 +149,7 @@ static void virtio_net_vhost_status(VirtIONet *n, uint8_t 
status)
 }
 
 n->vhost_started = 1;
+n->vhost_stopping_in_progress = 0;
 r = vhost_net_start(vdev, n->nic->ncs, queues);
 if (r < 0) {
 error_report("unable to start vhost net: %d: "
diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
index 0cabdb6..fc07385 100644
--- a/include/hw/virtio/virtio-net.h
+++ b/include/hw/virtio/virtio-net.h
@@ -74,6 +74,7 @@ typedef struct VirtIONet {
 uint8_t nouni;
 uint8_t nobcast;
 uint8_t vhost_started;
+uint8_t vhost_stopping_in_progress;
 struct {
 uint32_t in_use;
 uint32_t first_multi;
-- 
2.5.0




[Qemu-devel] [PATCH 0/4] Fix QEMU crash on vhost-user socket disconnect.

2016-03-30 Thread Ilya Maximets
Currently QEMU always crashes in the following scenario (assume that
the vhost-user application is Open vSwitch with a 'dpdkvhostuser' port):

1. # Check that link in guest is in a normal state.
   [guest]# ip link show eth0
2: eth0:  mtu 1500 qdisc <...>
link/ether 00:16:35:af:aa:4b brd ff:ff:ff:ff:ff:ff

2. # Kill vhost-user application (using SIGSEGV just to be sure).
   [host]# kill -11 `pgrep ovs-vswitchd`

3. # Check that guest still thinks that all is good.
   [guest]# ip link show eth0
2: eth0:  mtu 1500 qdisc <...>
link/ether 00:16:35:af:aa:4b brd ff:ff:ff:ff:ff:ff

4. # Try to unbind virtio-pci driver and observe QEMU crash.
   [guest]# echo -n ':00:01.0' > /sys/bus/pci/drivers/virtio-pci/unbind
qemu: Failed to read msg header. Read 0 instead of 12. Original request 11.
qemu: Failed to read msg header. Read 0 instead of 12. Original request 11.
qemu: Failed to read msg header. Read 0 instead of 12. Original request 11.

Child terminated with signal = 0xb (SIGSEGV)
GDBserver exiting

After applying this patch set:

4. # Try to unbind virtio-pci driver. Unbind works fine with only a few errors.
   [guest]# echo -n ':00:01.0' > /sys/bus/pci/drivers/virtio-pci/unbind
qemu: Failed to read msg header. Read 0 instead of 12. Original request 11.
qemu: Failed to read msg header. Read 0 instead of 12. Original request 11.

5. # Bind virtio-pci driver back.
   [guest]# echo -n ':00:01.0' > /sys/bus/pci/drivers/virtio-pci/bind

6. # Check link in guest. No crashes here, link in DOWN state.
   [guest]# ip link show eth0
7: eth0:  mtu 1500 qdisc <...>
link/ether 00:16:35:af:aa:4b brd ff:ff:ff:ff:ff:ff

7. QEMU may be gracefully restarted to restore communication after the
   vhost-user application has been restarted.

Ilya Maximets (4):
  vhost-user: fix crash on socket disconnect.
  vhost: prevent double stop of vhost_net device.
  vhost: check for vhost_net device validity.
  net: notify about link status only if it changed.

 hw/net/vhost_net.c | 48 ++
 hw/net/virtio-net.c| 33 ++---
 include/hw/virtio/virtio-net.h |  1 +
 include/net/vhost-user.h   |  1 +
 include/net/vhost_net.h|  1 +
 net/filter.c   |  1 +
 net/net.c  |  7 +++---
 net/vhost-user.c   | 43 +
 8 files changed, 111 insertions(+), 24 deletions(-)

-- 
2.5.0




Re: [Qemu-devel] [PATCH 0/4] Fix QEMU crash on vhost-user socket disconnect.

2016-03-30 Thread Ilya Maximets
On 30.03.2016 20:01, Michael S. Tsirkin wrote:
> On Wed, Mar 30, 2016 at 06:14:05PM +0300, Ilya Maximets wrote:
>> Currently QEMU always crashes in following scenario (assume that
>> vhost-user application is Open vSwitch with 'dpdkvhostuser' port):
> 
> In fact, wouldn't the right thing to do be stopping the VM?

I don't think it is reasonable to stop the VM on failure of just one of
its network interfaces.  There may still be working vhost-kernel
interfaces.  Connectivity can even be preserved if the vhost-user
interface was bonded with a kernel interface.
In any case, the user should be able to save all of their data before a
QEMU restart.

> 
>> 1. # Check that link in guest is in a normal state.
>>[guest]# ip link show eth0
>> 2: eth0:  mtu 1500 qdisc <...>
>> link/ether 00:16:35:af:aa:4b brd ff:ff:ff:ff:ff:ff
>>
>> 2. # Kill vhost-user application (using SIGSEGV just to be sure).
>>[host]# kill -11 `pgrep ovs-vswitchd`
>>
>> 3. # Check that guest still thinks that all is good.
>>[guest]# ip link show eth0
>> 2: eth0:  mtu 1500 qdisc <...>
>> link/ether 00:16:35:af:aa:4b brd ff:ff:ff:ff:ff:ff
>>
>> 4. # Try to unbind virtio-pci driver and observe QEMU crash.
>>[guest]# echo -n ':00:01.0' > /sys/bus/pci/drivers/virtio-pci/unbind
>> qemu: Failed to read msg header. Read 0 instead of 12. Original request 
>> 11.
>> qemu: Failed to read msg header. Read 0 instead of 12. Original request 
>> 11.
>> qemu: Failed to read msg header. Read 0 instead of 12. Original request 
>> 11.
>>
>> Child terminated with signal = 0xb (SIGSEGV)
>> GDBserver exiting
>>
>> After the applying of this patch-set:
>>
>> 4. # Try to unbind virtio-pci driver. Unbind works fine with only few errors.
>>[guest]# echo -n ':00:01.0' > /sys/bus/pci/drivers/virtio-pci/unbind
>> qemu: Failed to read msg header. Read 0 instead of 12. Original request 
>> 11.
>> qemu: Failed to read msg header. Read 0 instead of 12. Original request 
>> 11.
>>
>> 5. # Bind virtio-pci driver back.
>>[guest]# echo -n ':00:01.0' > /sys/bus/pci/drivers/virtio-pci/bind
>>
>> 6. # Check link in guest. No crashes here, link in DOWN state.
>>[guest]# ip link show eth0
>> 7: eth0:  mtu 1500 qdisc <...>
>> link/ether 00:16:35:af:aa:4b brd ff:ff:ff:ff:ff:ff
>>
>> 7. QEMU may be gracefully restarted to restore communication after restarting
>>of vhost-user application.
>>
>> Ilya Maximets (4):
>>   vhost-user: fix crash on socket disconnect.
>>   vhost: prevent double stop of vhost_net device.
>>   vhost: check for vhost_net device validity.
>>   net: notify about link status only if it changed.
>>
>>  hw/net/vhost_net.c | 48 
>> ++
>>  hw/net/virtio-net.c| 33 ++---
>>  include/hw/virtio/virtio-net.h |  1 +
>>  include/net/vhost-user.h   |  1 +
>>  include/net/vhost_net.h|  1 +
>>  net/filter.c   |  1 +
>>  net/net.c  |  7 +++---
>>  net/vhost-user.c   | 43 +
>>  8 files changed, 111 insertions(+), 24 deletions(-)
>>
>> -- 
>> 2.5.0
> 
> 



Re: [Qemu-devel] [PATCH 0/4] Fix QEMU crash on vhost-user socket disconnect.

2016-03-31 Thread Ilya Maximets
On 31.03.2016 12:21, Michael S. Tsirkin wrote:
> On Thu, Mar 31, 2016 at 09:02:01AM +0300, Ilya Maximets wrote:
>> On 30.03.2016 20:01, Michael S. Tsirkin wrote:
>>> On Wed, Mar 30, 2016 at 06:14:05PM +0300, Ilya Maximets wrote:
>>>> Currently QEMU always crashes in following scenario (assume that
>>>> vhost-user application is Open vSwitch with 'dpdkvhostuser' port):
>>>
>>> In fact, wouldn't the right thing to do be stopping the VM?
>>
>> I don't think that is reasonable to stop VM on failure of just one of
>> network interfaces. There may be still working vhost-kernel interfaces.
>> Even connection can still be established if vhost-user interface was in
>> bonding with kernel interface.
>> Anyway user should be able to save all his data before QEMU restart.
> 
> 
> Unfortunately some guests are known to crash if we defer
> using available buffers indefinitely.

What exactly do you mean?

Anyway here we have 'crash of some guests' vs. '100% constant QEMU crash'.

> 
> If you see bugs let's fix them, but recovering would be
> a new feature, let's defer it until after 2.6.

The segmentation fault is a bug. This series *doesn't introduce any recovery*
features. A disconnected network interface in the guest will never work again.
I just tried to avoid the segmentation fault in this situation. There will
be no difference for the guest between QEMU with these patches and without
them; there just will not be any crashes.

> 
>>>
>>>> 1. # Check that link in guest is in a normal state.
>>>>[guest]# ip link show eth0
>>>> 2: eth0:  mtu 1500 qdisc <...>
>>>> link/ether 00:16:35:af:aa:4b brd ff:ff:ff:ff:ff:ff
>>>>
>>>> 2. # Kill vhost-user application (using SIGSEGV just to be sure).
>>>>[host]# kill -11 `pgrep ovs-vswitchd`
>>>>
>>>> 3. # Check that guest still thinks that all is good.
>>>>[guest]# ip link show eth0
>>>> 2: eth0:  mtu 1500 qdisc <...>
>>>> link/ether 00:16:35:af:aa:4b brd ff:ff:ff:ff:ff:ff
>>>>
>>>> 4. # Try to unbind virtio-pci driver and observe QEMU crash.
>>>>[guest]# echo -n ':00:01.0' > /sys/bus/pci/drivers/virtio-pci/unbind
>>>> qemu: Failed to read msg header. Read 0 instead of 12. Original 
>>>> request 11.
>>>> qemu: Failed to read msg header. Read 0 instead of 12. Original 
>>>> request 11.
>>>> qemu: Failed to read msg header. Read 0 instead of 12. Original 
>>>> request 11.
>>>>
>>>> Child terminated with signal = 0xb (SIGSEGV)
>>>> GDBserver exiting
>>>>
>>>> After the applying of this patch-set:
>>>>
>>>> 4. # Try to unbind virtio-pci driver. Unbind works fine with only few 
>>>> errors.
>>>>[guest]# echo -n ':00:01.0' > /sys/bus/pci/drivers/virtio-pci/unbind
>>>> qemu: Failed to read msg header. Read 0 instead of 12. Original 
>>>> request 11.
>>>> qemu: Failed to read msg header. Read 0 instead of 12. Original 
>>>> request 11.
>>>>
>>>> 5. # Bind virtio-pci driver back.
>>>>[guest]# echo -n ':00:01.0' > /sys/bus/pci/drivers/virtio-pci/bind
>>>>
>>>> 6. # Check link in guest. No crashes here, link in DOWN state.
>>>>[guest]# ip link show eth0
>>>> 7: eth0:  mtu 1500 qdisc <...>
>>>> link/ether 00:16:35:af:aa:4b brd ff:ff:ff:ff:ff:ff
>>>>
>>>> 7. QEMU may be gracefully restarted to restore communication after 
>>>> restarting
>>>>of vhost-user application.
>>>>
>>>> Ilya Maximets (4):
>>>>   vhost-user: fix crash on socket disconnect.
>>>>   vhost: prevent double stop of vhost_net device.
>>>>   vhost: check for vhost_net device validity.
>>>>   net: notify about link status only if it changed.
>>>>
>>>>  hw/net/vhost_net.c | 48 
>>>> ++
>>>>  hw/net/virtio-net.c| 33 ++---
>>>>  include/hw/virtio/virtio-net.h |  1 +
>>>>  include/net/vhost-user.h   |  1 +
>>>>  include/net/vhost_net.h|  1 +
>>>>  net/filter.c   |  1 +
>>>>  net/net.c  |  7 +++---
>>>>  net/vhost-user.c   | 43 +
>>>>  8 files changed, 111 insertions(+), 24 deletions(-)
>>>>
>>>> -- 
>>>> 2.5.0
>>>
>>>
> 
> 



Re: [Qemu-devel] [PATCH 0/4] Fix QEMU crash on vhost-user socket disconnect.

2016-04-06 Thread Ilya Maximets
--- Original Message ---
Sender : Michael S. Tsirkin
Date : Apr 05, 2016 13:46 (GMT+03:00)
Title : Re: [PATCH 0/4] Fix QEMU crash on vhost-user socket disconnect.

> On Thu, Mar 31, 2016 at 09:02:01AM +0300, Ilya Maximets wrote:
> > On 30.03.2016 20:01, Michael S. Tsirkin wrote:
> > > On Wed, Mar 30, 2016 at 06:14:05PM +0300, Ilya Maximets wrote:
> > >> Currently QEMU always crashes in following scenario (assume that
> > >> vhost-user application is Open vSwitch with 'dpdkvhostuser' port):
> > > 
> > > In fact, wouldn't the right thing to do be stopping the VM?
> > 
> > I don't think that is reasonable to stop VM on failure of just one of
> > network interfaces.
> 
> We don't start QEMU until vhost user connects, either.
> Making guest run properly with a backend is a much bigger
> project, let's tackle this separately.
> 
> Also, I think handling graceful disconnect is the correct first step.
> Handling misbehaving clients is much harder as we have asserts on remote
> obeying protocol rules all over the place.
> 
> > There may be still working vhost-kernel interfaces.
> > Even connection can still be established if vhost-user interface was in
> > bonding with kernel interface.
> 
> Could not parse this.
>
> > Anyway user should be able to save all his data before QEMU restart.
>
> So reconnect a new backend, and VM will keep going.

We can't do this for two reasons:
1. Segmentation fault of QEMU on disconnect (fixed by this patch set).
2. There is no reconnect functionality in the current QEMU version.

So, what are you talking about?

Best regards, Ilya Maximets.

Re: [Qemu-devel] [PATCH 0/4] Fix QEMU crash on vhost-user socket disconnect.

2016-04-07 Thread Ilya Maximets
> --- Original Message ---
> Sender : Michael S. Tsirkin
> Date : Apr 07, 2016 10:01 (GMT+03:00)
> Title : Re: Re: [PATCH 0/4] Fix QEMU crash on vhost-user socket disconnect.
> 
> On Wed, Apr 06, 2016 at 11:52:56PM +0000, Ilya Maximets wrote:
> > --- Original Message ---
> > Sender : Michael S. Tsirkin
> > Date : Apr 05, 2016 13:46 (GMT+03:00)
> > Title : Re: [PATCH 0/4] Fix QEMU crash on vhost-user socket disconnect.
> > 
> > > On Thu, Mar 31, 2016 at 09:02:01AM +0300, Ilya Maximets wrote:
> > > > On 30.03.2016 20:01, Michael S. Tsirkin wrote:
> > > > > On Wed, Mar 30, 2016 at 06:14:05PM +0300, Ilya Maximets wrote:
> > > > >> Currently QEMU always crashes in following scenario (assume that
> > > > >> vhost-user application is Open vSwitch with 'dpdkvhostuser' port):
> > > > > 
> > > > > In fact, wouldn't the right thing to do be stopping the VM?
> > > > 
> > > > I don't think that is reasonable to stop VM on failure of just one of
> > > > network interfaces.
> > > 
> > > We don't start QEMU until vhost user connects, either.
> > > Making guest run properly with a backend is a much bigger
> > > project, let's tackle this separately.
> > > 
> > > Also, I think handling graceful disconnect is the correct first step.
> > > Handling misbehaving clients is much harder as we have asserts on remote
> > > obeying protocol rules all over the place.
> > > 
> > > > There may be still working vhost-kernel interfaces.
> > > > Even connection can still be established if vhost-user interface was in
> > > > bonding with kernel interface.
> > > 
> > > Could not parse this.
> > >
> > > > Anyway user should be able to save all his data before QEMU restart.
> > >
> > > So reconnect a new backend, and VM will keep going.
> > 
> > We can't do this for two reasons:
> > 1. Segmentation fault of QEMU on disconnect (fixed by this patch set).
> > 2. There is no reconnect functionality in the current QEMU version.
> > 
> > So, what are you talking about?
> > 
> > Best regards, Ilya Maximets.
> 
> One can currently disconnect vhost user clients without killing
> guests using migration:
> - save vm
> - quit qemu
> - start new qemu
> - load vm
>
> Or using hotplug
> - request unplug
> - wait for guest eject
> - create new device

This may fix the second reason mentioned above, but in my scenario
(described in the cover letter) the guest will learn that the backend
disconnected only after the segmentation fault.  So there will be no
opportunity to save the VM or unplug the device.

> I would like to make sure we do not create an expectation that guests
> keep going unconditionally with device present even with backend
> disconnected. Unfortunately your patchset might create such
> expectation.

That is already the case, because the guest will never know about the
crash of the backend until it tries to communicate via the socket.

Re: [PATCH v2] net: add initial support for AF_XDP network backend

2023-07-20 Thread Ilya Maximets
On 7/20/23 09:37, Jason Wang wrote:
> On Thu, Jul 6, 2023 at 4:58 AM Ilya Maximets  wrote:
>>
>> AF_XDP is a network socket family that allows communication directly
>> with the network device driver in the kernel, bypassing most or all
>> of the kernel networking stack.  In the essence, the technology is
>> pretty similar to netmap.  But, unlike netmap, AF_XDP is Linux-native
>> and works with any network interfaces without driver modifications.
>> Unlike vhost-based backends (kernel, user, vdpa), AF_XDP doesn't
>> require access to character devices or unix sockets.  Only access to
>> the network interface itself is necessary.
>>
>> This patch implements a network backend that communicates with the
>> kernel by creating an AF_XDP socket.  A chunk of userspace memory
>> is shared between QEMU and the host kernel.  4 ring buffers (Tx, Rx,
>> Fill and Completion) are placed in that memory along with a pool of
>> memory buffers for the packet data.  Data transmission is done by
>> allocating one of the buffers, copying packet data into it and
>> placing the pointer into Tx ring.  After transmission, device will
>> return the buffer via Completion ring.  On Rx, device will take
>> a buffer form a pre-populated Fill ring, write the packet data into
>> it and place the buffer into Rx ring.
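
To illustrate the ring protocol described above, here is a minimal
Tx-side sketch using the libxdp xsk helpers (UMEM creation, socket
setup and the alloc_frame()/free_frame() buffer-pool helpers are
assumed to exist elsewhere; this is not code from the patch):

#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <xdp/xsk.h>

extern uint64_t alloc_frame(void);        /* assumed UMEM buffer pool */
extern void free_frame(uint64_t addr);

static int xsk_send_one(struct xsk_socket *xsk, struct xsk_ring_prod *tx,
                        struct xsk_ring_cons *cq, void *umem_area,
                        const void *pkt, uint32_t len)
{
    uint32_t idx, i, done;
    uint64_t addr = alloc_frame();              /* take a free buffer */

    if (xsk_ring_prod__reserve(tx, 1, &idx) != 1) {
        free_frame(addr);
        return -ENOBUFS;                        /* Tx ring is full */
    }

    /* Copy the packet into the shared UMEM buffer and describe it. */
    memcpy(xsk_umem__get_data(umem_area, addr), pkt, len);
    xsk_ring_prod__tx_desc(tx, idx)->addr = addr;
    xsk_ring_prod__tx_desc(tx, idx)->len = len;
    xsk_ring_prod__submit(tx, 1);

    /* Kick the kernel so transmission actually starts. */
    sendto(xsk_socket__fd(xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);

    /* Return transmitted buffers from the Completion ring to the pool. */
    done = xsk_ring_cons__peek(cq, 64, &idx);
    for (i = 0; i < done; i++) {
        free_frame(*xsk_ring_cons__comp_addr(cq, idx + i));
    }
    xsk_ring_cons__release(cq, done);
    return 0;
}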
>>
>> AF_XDP network backend takes on the communication with the host
>> kernel and the network interface and forwards packets to/from the
>> peer device in QEMU.
>>
>> Usage example:
>>
>>   -device virtio-net-pci,netdev=guest1,mac=00:16:35:AF:AA:5C
>>   -netdev af-xdp,ifname=ens6f1np1,id=guest1,mode=native,queues=1
>>
>> XDP program bridges the socket with a network interface.  It can be
>> attached to the interface in 2 different modes:
>>
>> 1. skb - this mode should work for any interface and doesn't require
>>  driver support.  With a caveat of lower performance.
>>
>> 2. native - this does require support from the driver and allows to
>> bypass skb allocation in the kernel and potentially use
>> zero-copy while getting packets in/out userspace.
>>
>> By default, QEMU will try to use native mode and fall back to skb.
>> Mode can be forced via 'mode' option.  To force 'copy' even in native
>> mode, use 'force-copy=on' option.  This might be useful if there is
>> some issue with the driver.
>>
>> Option 'queues=N' allows to specify how many device queues should
>> be open.  Note that all the queues that are not open are still
>> functional and can receive traffic, but it will not be delivered to
>> QEMU.  So, the number of device queues should generally match the
>> QEMU configuration, unless the device is shared with something
>> else and the traffic re-direction to appropriate queues is correctly
>> configured on a device level (e.g. with ethtool -N).
>> 'start-queue=M' option can be used to specify from which queue id
>> QEMU should start configuring 'N' queues.  It might also be necessary
>> to use this option with certain NICs, e.g. MLX5 NICs.  See the docs
>> for examples.
>>
>> In a general case QEMU will need CAP_NET_ADMIN and CAP_SYS_ADMIN
>> capabilities in order to load default XSK/XDP programs to the
>> network interface and configure BPF maps.  It is possible, however,
>> to run with no capabilities.  For that to work, an external process
>> with admin capabilities will need to pre-load default XSK program,
>> create AF_XDP sockets and pass their file descriptors to QEMU process
>> on startup via 'sock-fds' option.  Network backend will need to be
>> configured with 'inhibit=on' to avoid loading of the program.
>> QEMU will need 32 MB of locked memory (RLIMIT_MEMLOCK) per queue
>> or CAP_IPC_LOCK.
>>
>> Alternatively, the file descriptor for 'xsks_map' can be passed via
>> 'xsks-map-fd=N' option instead of passing socket file descriptors.
>> That will additionally require CAP_NET_RAW on QEMU side.  This is
>> useful, because 'sock-fds' may not be available with older libxdp.
>> 'sock-fds' requires libxdp >= 1.4.0.
>>
>> There are few performance challenges with the current network backends.
>>
>> First is that they do not support IO threads.  This means that data
>> path is handled by the main thread in QEMU and may slow down other
>> work or may be slowed down by some other work.  This also means that
>> taking advantage of multi-queue is generally not possible t

Re: [PATCH v2] net: add initial support for AF_XDP network backend

2023-08-04 Thread Ilya Maximets
On 7/25/23 08:55, Jason Wang wrote:
> On Thu, Jul 20, 2023 at 9:26 PM Ilya Maximets  wrote:
>>
>> On 7/20/23 09:37, Jason Wang wrote:
>>> On Thu, Jul 6, 2023 at 4:58 AM Ilya Maximets  wrote:
>>>>
>>>> AF_XDP is a network socket family that allows communication directly
>>>> with the network device driver in the kernel, bypassing most or all
>>>> of the kernel networking stack.  In the essence, the technology is
>>>> pretty similar to netmap.  But, unlike netmap, AF_XDP is Linux-native
>>>> and works with any network interfaces without driver modifications.
>>>> Unlike vhost-based backends (kernel, user, vdpa), AF_XDP doesn't
>>>> require access to character devices or unix sockets.  Only access to
>>>> the network interface itself is necessary.
>>>>
>>>> This patch implements a network backend that communicates with the
>>>> kernel by creating an AF_XDP socket.  A chunk of userspace memory
>>>> is shared between QEMU and the host kernel.  4 ring buffers (Tx, Rx,
>>>> Fill and Completion) are placed in that memory along with a pool of
>>>> memory buffers for the packet data.  Data transmission is done by
>>>> allocating one of the buffers, copying packet data into it and
>>>> placing the pointer into Tx ring.  After transmission, device will
>>>> return the buffer via Completion ring.  On Rx, device will take
>>>> a buffer form a pre-populated Fill ring, write the packet data into
>>>> it and place the buffer into Rx ring.
>>>>
>>>> AF_XDP network backend takes on the communication with the host
>>>> kernel and the network interface and forwards packets to/from the
>>>> peer device in QEMU.
>>>>
>>>> Usage example:
>>>>
>>>>   -device virtio-net-pci,netdev=guest1,mac=00:16:35:AF:AA:5C
>>>>   -netdev af-xdp,ifname=ens6f1np1,id=guest1,mode=native,queues=1
>>>>
>>>> XDP program bridges the socket with a network interface.  It can be
>>>> attached to the interface in 2 different modes:
>>>>
>>>> 1. skb - this mode should work for any interface and doesn't require
>>>>  driver support.  With a caveat of lower performance.
>>>>
>>>> 2. native - this does require support from the driver and allows to
>>>> bypass skb allocation in the kernel and potentially use
>>>> zero-copy while getting packets in/out userspace.
>>>>
>>>> By default, QEMU will try to use native mode and fall back to skb.
>>>> Mode can be forced via 'mode' option.  To force 'copy' even in native
>>>> mode, use 'force-copy=on' option.  This might be useful if there is
>>>> some issue with the driver.
>>>>
>>>> Option 'queues=N' allows to specify how many device queues should
>>>> be open.  Note that all the queues that are not open are still
>>>> functional and can receive traffic, but it will not be delivered to
>>>> QEMU.  So, the number of device queues should generally match the
>>>> QEMU configuration, unless the device is shared with something
>>>> else and the traffic re-direction to appropriate queues is correctly
>>>> configured on a device level (e.g. with ethtool -N).
>>>> 'start-queue=M' option can be used to specify from which queue id
>>>> QEMU should start configuring 'N' queues.  It might also be necessary
>>>> to use this option with certain NICs, e.g. MLX5 NICs.  See the docs
>>>> for examples.
>>>>
>>>> In a general case QEMU will need CAP_NET_ADMIN and CAP_SYS_ADMIN
>>>> capabilities in order to load default XSK/XDP programs to the
>>>> network interface and configure BPF maps.  It is possible, however,
>>>> to run with no capabilities.  For that to work, an external process
>>>> with admin capabilities will need to pre-load default XSK program,
>>>> create AF_XDP sockets and pass their file descriptors to QEMU process
>>>> on startup via 'sock-fds' option.  Network backend will need to be
>>>> configured with 'inhibit=on' to avoid loading of the program.
>>>> QEMU will need 32 MB of locked memory (RLIMIT_MEMLOCK) per queue
>>>> or CAP_IPC_LOCK.
>>>>
>>>> Alternatively, the file descriptor for 'xsks_map' can be passed via
>>>> 'xsks-map-fd=N

[PATCH v3] net: add initial support for AF_XDP network backend

2023-08-04 Thread Ilya Maximets
on virtual
interfaces.

However, keeping all of these challenges in mind, the current
implementation of the AF_XDP backend shows decent performance while
running on top of a physical NIC with zero-copy support.

Test setup:

2 VMs running on 2 physical hosts connected via ConnectX6-Dx card.
Network backend is configured to open the NIC directly in native mode.
The driver supports zero-copy.  NIC is configured to use 1 queue.

Inside a VM - iperf3 for basic TCP performance testing and dpdk-testpmd
for PPS testing.

iperf3 result:
 TCP stream  : 19.1 Gbps

dpdk-testpmd (single queue, single CPU core, 64 B packets) results:
 Tx only : 3.4 Mpps
 Rx only : 2.0 Mpps
 L2 FWD Loopback : 1.5 Mpps

In skb mode the same setup shows much lower performance, similar to a
setup where the pair of physical NICs is replaced with a veth pair:

iperf3 result:
  TCP stream  : 9 Gbps

dpdk-testpmd (single queue, single CPU core, 64 B packets) results:
  Tx only : 1.2 Mpps
  Rx only : 1.0 Mpps
  L2 FWD Loopback : 0.7 Mpps

Results in skb mode or over veth are close to those of a tap backend
with vhost=on and segmentation offloading disabled, bridged with a NIC.

Signed-off-by: Ilya Maximets 
---

Version 3:

  - Bump requirements to libxdp 1.4.0+.  Having that, removed all
the conditional compilation parts, since all the needed APIs
are available in this version of libxdp.

  - Also removed the ability to pass the xsks map fd, since the ability
to just pass socket fds is now always available and it doesn't
require any capabilities, unlike manipulations with BPF maps.

  - Updated documentation to not call out specific vendors, memory
numbers or specific required capabilities.

  - Changed logic of returning peeked at but unused descriptors.

  - Minor cleanups.


Version 2:

  - Added support for running with no capabilities by passing
pre-created AF_XDP socket file descriptors via 'sock-fds' option.
Conditionally compiled because it requires the unreleased libxdp 1.4.0.
The last restriction is having 32 MB of RLIMIT_MEMLOCK per queue.

  - Refined and extended documentation.


 MAINTAINERS   |   4 +
 hmp-commands.hx   |   3 +
 meson.build   |   9 +
 meson_options.txt |   2 +
 net/af-xdp.c  | 526 ++
 net/clients.h |   5 +
 net/meson.build   |   3 +
 net/net.c |   6 +
 qapi/net.json |  58 ++
 qemu-options.hx   |  70 ++-
 .../ci/org.centos/stream/8/x86_64/configure   |   1 +
 scripts/meson-buildoptions.sh |   3 +
 tests/docker/dockerfiles/debian-amd64.docker  |   1 +
 13 files changed, 690 insertions(+), 1 deletion(-)
 create mode 100644 net/af-xdp.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 6111b6b4d9..bf015131a7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2948,6 +2948,10 @@ W: http://info.iet.unipi.it/~luigi/netmap/
 S: Maintained
 F: net/netmap.c
 
+AF_XDP network backend
+R: Ilya Maximets 
+F: net/af-xdp.c
+
 Host Memory Backends
 M: David Hildenbrand 
 M: Igor Mammedov 
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 2cbd0f77a0..63eac22734 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1296,6 +1296,9 @@ ERST
 .name   = "netdev_add",
 .args_type  = "netdev:O",
 .params = 
"[user|tap|socket|stream|dgram|vde|bridge|hubport|netmap|vhost-user"
+#ifdef CONFIG_AF_XDP
+  "|af-xdp"
+#endif
 #ifdef CONFIG_VMNET
   "|vmnet-host|vmnet-shared|vmnet-bridged"
 #endif
diff --git a/meson.build b/meson.build
index 98e68ef0b1..8367a1ad1e 100644
--- a/meson.build
+++ b/meson.build
@@ -1881,6 +1881,13 @@ if libbpf.found() and not cc.links('''
   endif
 endif
 
+# libxdp
+libxdp = not_found
+if not get_option('af_xdp').auto() or have_system
+libxdp = dependency('libxdp', required: get_option('af_xdp'),
+version: '>=1.4.0', method: 'pkg-config')
+endif
+
 # libdw
 libdw = not_found
 if not get_option('libdw').auto() or \
@@ -2102,6 +2109,7 @@ config_host_data.set('CONFIG_HEXAGON_IDEF_PARSER', 
get_option('hexagon_idef_pars
 config_host_data.set('CONFIG_LIBATTR', have_old_libattr)
 config_host_data.set('CONFIG_LIBCAP_NG', libcap_ng.found())
 config_host_data.set('CONFIG_EBPF', libbpf.found())
+config_host_data.set('CONFIG_AF_XDP', libxdp.found())
 config_host_data.set('CONFIG_LIBDAXCTL', libdaxctl.found())
 config_host_data.set('CONFIG_LIBISCSI', libiscsi.found())
 config_host_data.set('CONFIG_LIBNFS', libnf

Re: [Qemu-devel] [PATCH v2 0/4] memfd fixes.

2019-03-11 Thread Ilya Maximets
Hi.

I'm trying to figure out the state of this patch set.
Is there any chance for it to be accepted?

The first patch needs a minor rebase.  I could send a new version.

BTW, even without the first patch, which raised some discussion,
the last three patches are fairly straightforward and useful.

Best regards, Ilya Maximets.

On 27.11.2018 16:50, Ilya Maximets wrote:
> Version 2:
> * First patch changed to just drop the memfd backend
>   if seals are not supported.
> 
> Ilya Maximets (4):
>   hostmem-memfd: disable for systems without sealing support
>   memfd: always check for MFD_CLOEXEC
>   memfd: set up correct errno if not supported
>   memfd: improve error messages
> 
>  backends/hostmem-memfd.c | 18 --
>  tests/vhost-user-test.c  |  6 +++---
>  util/memfd.c | 10 --
>  3 files changed, 19 insertions(+), 15 deletions(-)
> 



[Qemu-devel] [PATCH v3 0/4] memfd fixes.

2019-03-11 Thread Ilya Maximets
Version 3:
* Rebase on top of current master.

Version 2:
* First patch changed to just drop the memfd backend
  if seals are not supported.

Ilya Maximets (4):
  hostmem-memfd: disable for systems without sealing support
  memfd: always check for MFD_CLOEXEC
  memfd: set up correct errno if not supported
  memfd: improve error messages

 backends/hostmem-memfd.c | 18 --
 tests/vhost-user-test.c  |  5 +++--
 util/memfd.c | 10 --
 3 files changed, 19 insertions(+), 14 deletions(-)

-- 
2.17.1




[Qemu-devel] [PATCH v3 1/4] hostmem-memfd: disable for systems without sealing support

2019-03-11 Thread Ilya Maximets
If seals are not supported, memfd_create() will fail.
Furthermore, there is no way to disable sealing in this case because
the '.seal' property is not registered.

This issue leads to vhost-user-test failures on RHEL 7.2:

  qemu-system-x86_64: -object memory-backend-memfd,id=mem,size=2M,: \
  failed to create memfd: Invalid argument

and actually breaks the feature on such systems.

Let's restrict memfd backend to systems with sealing support.

Signed-off-by: Ilya Maximets 
---
 backends/hostmem-memfd.c | 18 --
 tests/vhost-user-test.c  |  5 +++--
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
index 98c9bf3240..46b15b916a 100644
--- a/backends/hostmem-memfd.c
+++ b/backends/hostmem-memfd.c
@@ -154,15 +154,13 @@ memfd_backend_class_init(ObjectClass *oc, void *data)
   "Huge pages size (ex: 2M, 1G)",
   &error_abort);
 }
-if (qemu_memfd_check(MFD_ALLOW_SEALING)) {
-object_class_property_add_bool(oc, "seal",
-   memfd_backend_get_seal,
-   memfd_backend_set_seal,
-   &error_abort);
-object_class_property_set_description(oc, "seal",
-  "Seal growing & shrinking",
-  &error_abort);
-}
+object_class_property_add_bool(oc, "seal",
+   memfd_backend_get_seal,
+   memfd_backend_set_seal,
+   &error_abort);
+object_class_property_set_description(oc, "seal",
+  "Seal growing & shrinking",
+  &error_abort);
 }
 
 static const TypeInfo memfd_backend_info = {
@@ -175,7 +173,7 @@ static const TypeInfo memfd_backend_info = {
 
 static void register_types(void)
 {
-if (qemu_memfd_check(0)) {
+if (qemu_memfd_check(MFD_ALLOW_SEALING)) {
 type_register_static(&memfd_backend_info);
 }
 }
diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c
index 0c965b3b1e..3817966010 100644
--- a/tests/vhost-user-test.c
+++ b/tests/vhost-user-test.c
@@ -178,7 +178,8 @@ static void append_mem_opts(TestServer *server, GString 
*cmd_line,
 int size, enum test_memfd memfd)
 {
 if (memfd == TEST_MEMFD_AUTO) {
-memfd = qemu_memfd_check(0) ? TEST_MEMFD_YES : TEST_MEMFD_NO;
+memfd = qemu_memfd_check(MFD_ALLOW_SEALING) ? TEST_MEMFD_YES
+: TEST_MEMFD_NO;
 }
 
 if (memfd == TEST_MEMFD_YES) {
@@ -930,7 +931,7 @@ static void register_vhost_user_test(void)
  "virtio-net",
  test_read_guest_mem, &opts);
 
-if (qemu_memfd_check(0)) {
+if (qemu_memfd_check(MFD_ALLOW_SEALING)) {
 opts.before = vhost_user_test_setup_memfd;
 qos_add_test("vhost-user/read-guest-mem/memfd",
  "virtio-net",
-- 
2.17.1




[Qemu-devel] [PATCH v3 2/4] memfd: always check for MFD_CLOEXEC

2019-03-11 Thread Ilya Maximets
QEMU always sets this flag unconditionally. We need to
check if it's supported.

Signed-off-by: Ilya Maximets 
Reviewed-by: Marc-André Lureau 
---
 util/memfd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util/memfd.c b/util/memfd.c
index 8debd0d037..d74ce4d793 100644
--- a/util/memfd.c
+++ b/util/memfd.c
@@ -188,7 +188,7 @@ bool qemu_memfd_alloc_check(void)
 bool qemu_memfd_check(unsigned int flags)
 {
 #ifdef CONFIG_LINUX
-int mfd = memfd_create("test", flags);
+int mfd = memfd_create("test", flags | MFD_CLOEXEC);
 
 if (mfd >= 0) {
 close(mfd);
-- 
2.17.1




[Qemu-devel] [PATCH v3 3/4] memfd: set up correct errno if not supported

2019-03-11 Thread Ilya Maximets
qemu_memfd_create() prints the value of 'errno' which is not
set in this case.

Signed-off-by: Ilya Maximets 
Reviewed-by: Marc-André Lureau 
---
 util/memfd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/util/memfd.c b/util/memfd.c
index d74ce4d793..393d23da96 100644
--- a/util/memfd.c
+++ b/util/memfd.c
@@ -40,6 +40,7 @@ static int memfd_create(const char *name, unsigned int flags)
 #ifdef __NR_memfd_create
 return syscall(__NR_memfd_create, name, flags);
 #else
+errno = ENOSYS;
 return -1;
 #endif
 }
-- 
2.17.1




[Qemu-devel] [PATCH v3 4/4] memfd: improve error messages

2019-03-11 Thread Ilya Maximets
This gives more information about the failure.
Additionally, 'ENOSYS' is returned for non-Linux platforms instead of
'errno', which is not initialized in this case.

Signed-off-by: Ilya Maximets 
Reviewed-by: Marc-André Lureau 
---
 util/memfd.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/util/memfd.c b/util/memfd.c
index 393d23da96..00334e5b21 100644
--- a/util/memfd.c
+++ b/util/memfd.c
@@ -71,14 +71,18 @@ int qemu_memfd_create(const char *name, size_t size, bool 
hugetlb,
 }
 mfd = memfd_create(name, flags);
 if (mfd < 0) {
+error_setg_errno(errp, errno,
+ "failed to create memfd with flags 0x%x", flags);
 goto err;
 }
 
 if (ftruncate(mfd, size) == -1) {
+error_setg_errno(errp, errno, "failed to resize memfd to %zu", size);
 goto err;
 }
 
 if (seals && fcntl(mfd, F_ADD_SEALS, seals) == -1) {
+error_setg_errno(errp, errno, "failed to add seals 0x%x", seals);
 goto err;
 }
 
@@ -88,8 +92,9 @@ err:
 if (mfd >= 0) {
 close(mfd);
 }
+#else
+error_setg_errno(errp, ENOSYS, "failed to create memfd");
 #endif
-error_setg_errno(errp, errno, "failed to create memfd");
 return -1;
 }
 
-- 
2.17.1




[PATCH] vhost_net: Print feature masks in hex

2022-03-18 Thread Ilya Maximets
"0x2" is much more readable than "8589934592".
The change saves one step (conversion) while debugging.
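
As a quick standalone illustration of the difference (the feature bit
chosen here is arbitrary, not one from the code below):

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    uint64_t missing = 1ULL << 33;   /* arbitrary example feature bit */

    printf("decimal: %" PRIu64 "\n", missing);    /* 8589934592  */
    printf("hex:     0x%" PRIx64 "\n", missing);  /* 0x200000000 */
    return 0;
}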

Signed-off-by: Ilya Maximets 
---
 hw/net/vhost_net.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 30379d2ca4..df0f050548 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -201,7 +201,7 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options)
 net->dev.features &= ~(1ULL << VIRTIO_NET_F_MRG_RXBUF);
 }
 if (~net->dev.features & net->dev.backend_features) {
-fprintf(stderr, "vhost lacks feature mask %" PRIu64
+fprintf(stderr, "vhost lacks feature mask 0x%" PRIx64
" for backend\n",
(uint64_t)(~net->dev.features & net->dev.backend_features));
 goto fail;
@@ -213,7 +213,7 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options)
 if (net->nc->info->type == NET_CLIENT_DRIVER_VHOST_USER) {
 features = vhost_user_get_acked_features(net->nc);
 if (~net->dev.features & features) {
-fprintf(stderr, "vhost lacks feature mask %" PRIu64
+fprintf(stderr, "vhost lacks feature mask 0x%" PRIx64
 " for backend\n",
 (uint64_t)(~net->dev.features & features));
 goto fail;
-- 
2.34.1




Re: [PATCH] virtio: use shadow_avail_idx while checking number of heads

2023-09-25 Thread Ilya Maximets
On 9/25/23 16:23, Stefan Hajnoczi wrote:
> On Fri, 25 Aug 2023 at 13:04, Ilya Maximets  wrote:
>>
>> We do not need the most up to date number of heads, we only want to
>> know if there is at least one.
>>
>> Use shadow variable as long as it is not equal to the last available
>> index checked.  This avoids expensive qatomic dereference of the
>> RCU-protected memory region cache as well as the memory access itself
>> and the subsequent memory barrier.
>>
>> The change improves performance of the af-xdp network backend by 2-3%.
>>
>> Signed-off-by: Ilya Maximets 
>> ---
>>  hw/virtio/virtio.c | 10 +-
>>  1 file changed, 9 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
>> index 309038fd46..04bf7cc977 100644
>> --- a/hw/virtio/virtio.c
>> +++ b/hw/virtio/virtio.c
>> @@ -999,7 +999,15 @@ void virtqueue_push(VirtQueue *vq, const 
>> VirtQueueElement *elem,
>>  /* Called within rcu_read_lock().  */
>>  static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx)
>>  {
>> -uint16_t num_heads = vring_avail_idx(vq) - idx;
>> +uint16_t num_heads;
>> +
>> +if (vq->shadow_avail_idx != idx) {
>> +num_heads = vq->shadow_avail_idx - idx;
>> +
>> +return num_heads;
> 
> This still needs to check num_heads > vq->vring.num and return -EINVAL
> as is done below.

Hmm, yes, you're right.  If the value was incorrect initially, the shadow
will be incorrect.  However, I think we should just not return here in this
case and let vring_avail_idx() grab an actual new value below.  Otherwise
we may never break out of this error.

Does that make sense?

> 
>> +}
>> +
>> +num_heads = vring_avail_idx(vq) - idx;
>>
>>  /* Check it isn't doing very strange things with descriptor numbers. */
>>  if (num_heads > vq->vring.num) {
>> --
>> 2.40.1
>>
>>




Re: [PATCH] virtio: use shadow_avail_idx while checking number of heads

2023-09-25 Thread Ilya Maximets
On 9/25/23 17:12, Stefan Hajnoczi wrote:
> On Mon, 25 Sept 2023 at 11:02, Ilya Maximets  wrote:
>>
>> On 9/25/23 16:23, Stefan Hajnoczi wrote:
>>> On Fri, 25 Aug 2023 at 13:04, Ilya Maximets  wrote:
>>>>
>>>> We do not need the most up to date number of heads, we only want to
>>>> know if there is at least one.
>>>>
>>>> Use shadow variable as long as it is not equal to the last available
>>>> index checked.  This avoids expensive qatomic dereference of the
>>>> RCU-protected memory region cache as well as the memory access itself
>>>> and the subsequent memory barrier.
>>>>
>>>> The change improves performance of the af-xdp network backend by 2-3%.
>>>>
>>>> Signed-off-by: Ilya Maximets 
>>>> ---
>>>>  hw/virtio/virtio.c | 10 +-
>>>>  1 file changed, 9 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
>>>> index 309038fd46..04bf7cc977 100644
>>>> --- a/hw/virtio/virtio.c
>>>> +++ b/hw/virtio/virtio.c
>>>> @@ -999,7 +999,15 @@ void virtqueue_push(VirtQueue *vq, const 
>>>> VirtQueueElement *elem,
>>>>  /* Called within rcu_read_lock().  */
>>>>  static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx)
>>>>  {
>>>> -uint16_t num_heads = vring_avail_idx(vq) - idx;
>>>> +uint16_t num_heads;
>>>> +
>>>> +if (vq->shadow_avail_idx != idx) {
>>>> +num_heads = vq->shadow_avail_idx - idx;
>>>> +
>>>> +return num_heads;
>>>
>>> This still needs to check num_heads > vq->vring.num and return -EINVAL
>>> as is done below.
>>
>> Hmm, yeas, you're right.  If the value was incorrect initially, the shadow
>> will be incorrect.  However, I think we should just not return here in this
>> case and let vring_avail_idx() to grab an actual new value below.  Otherwise
>> we may never break out of this error.
>>
>> Does that make sense?
> 
> No, because virtio_error() marks the device as broken. The device
> requires a reset in order to function again. Fetching
> vring_avail_idx() again won't help.

OK, I see.  In this case we're talking about a situation where
vring_avail_idx() was called in some other place and stored a bad value
in the shadow variable, and then virtqueue_num_heads() got called.  Right?

AFAIU, we can still just fall through here and let vring_avail_idx()
read the index again and fail the existing check.  That would happen
today without this patch applied.

I'm just trying to avoid duplication of the virtio_error call, i.e.:

if (vq->shadow_avail_idx != idx) {
    num_heads = vq->shadow_avail_idx - idx;

    /* Check it isn't doing very strange things with descriptor numbers. */
    if (num_heads > vq->vring.num) {
        virtio_error(vq->vdev, "Guest moved used index from %u to %u",
                     idx, vq->shadow_avail_idx);
        return -EINVAL;
    }
    return num_heads;
}

vs

if (vq->shadow_avail_idx != idx) {
    num_heads = vq->shadow_avail_idx - idx;

    /* Only use the shadow value if it was good initially. */
    if (num_heads <= vq->vring.num) {
        return num_heads;
    }
}


What do you think?

Best regards, Ilya Maximets.



Re: [PATCH] virtio: remove unnecessary thread fence while reading next descriptor

2023-09-25 Thread Ilya Maximets
On 9/25/23 16:32, Stefan Hajnoczi wrote:
> On Fri, 25 Aug 2023 at 13:02, Ilya Maximets  wrote:
>>
>> It was supposed to be a compiler barrier and it was a compiler barrier
>> initially called 'wmb' (??) when virtio core support was introduced.
>> Later all the instances of 'wmb' were switched to smp_wmb to fix memory
>> ordering issues on non-x86 platforms.  However, this one doesn't need
>> to be an actual barrier.  It's enough for it to stay a compiler barrier
>> as its only purpose is to ensure that the value is not read twice.
>>
>> There is no counterpart read barrier in the drivers, AFAICT.  And even
>> if we needed an actual barrier, it shouldn't have been a write barrier.
>>
>> Signed-off-by: Ilya Maximets 
>> ---
>>  hw/virtio/virtio.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
>> index 309038fd46..6eb8586858 100644
>> --- a/hw/virtio/virtio.c
>> +++ b/hw/virtio/virtio.c
>> @@ -1051,7 +1051,7 @@ static int virtqueue_split_read_next_desc(VirtIODevice 
>> *vdev, VRingDesc *desc,
>>  /* Check they're not leading us off end of descriptors. */
>>  *next = desc->next;
> 
> I don't see a caller that uses *next. Can the argument be eliminated?

Yes, it can.  The 'next' was converted from a local variable to
an output parameter in commit:
  412e0e81b174 ("virtio: handle virtqueue_read_next_desc() errors")

And that didn't actually make sense even then, because all the
actual uses of the 'i/next' as an output were removed a few months
prior in commit:
  aa570d6fb6bd ("virtio: combine the read of a descriptor")

I can post a separate patch for this.

> 
>>  /* Make sure compiler knows to grab that: we don't want it changing! */
>> -smp_wmb();
>> +barrier();
> 
> What is the purpose of this barrier? desc is not guest memory and
> nothing modifies desc's fields while this function is executing. I
> think the barrier can be removed.

True.  In fact, that was the first thing I did, but then the comment
derailed me into thinking that it somehow can be updated concurrently,
so I went with a safer option.  :/
It is indeed a local variable and the barrier is not needed today.
It had a little more sense before the previously mentioned commit:
  aa570d6fb6bd ("virtio: combine the read of a descriptor")
because we were reading guest memory before the barrier and used the
result after.

I'll remove it.

Best regards, Ilya Maximets.



Re: [PATCH] virtio: use shadow_avail_idx while checking number of heads

2023-09-25 Thread Ilya Maximets
On 9/25/23 17:38, Stefan Hajnoczi wrote:
> On Mon, 25 Sept 2023 at 11:36, Ilya Maximets  wrote:
>>
>> On 9/25/23 17:12, Stefan Hajnoczi wrote:
>>> On Mon, 25 Sept 2023 at 11:02, Ilya Maximets  wrote:
>>>>
>>>> On 9/25/23 16:23, Stefan Hajnoczi wrote:
>>>>> On Fri, 25 Aug 2023 at 13:04, Ilya Maximets  wrote:
>>>>>>
>>>>>> We do not need the most up to date number of heads, we only want to
>>>>>> know if there is at least one.
>>>>>>
>>>>>> Use shadow variable as long as it is not equal to the last available
>>>>>> index checked.  This avoids expensive qatomic dereference of the
>>>>>> RCU-protected memory region cache as well as the memory access itself
>>>>>> and the subsequent memory barrier.
>>>>>>
>>>>>> The change improves performance of the af-xdp network backend by 2-3%.
>>>>>>
>>>>>> Signed-off-by: Ilya Maximets 
>>>>>> ---
>>>>>>  hw/virtio/virtio.c | 10 +-
>>>>>>  1 file changed, 9 insertions(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
>>>>>> index 309038fd46..04bf7cc977 100644
>>>>>> --- a/hw/virtio/virtio.c
>>>>>> +++ b/hw/virtio/virtio.c
>>>>>> @@ -999,7 +999,15 @@ void virtqueue_push(VirtQueue *vq, const 
>>>>>> VirtQueueElement *elem,
>>>>>>  /* Called within rcu_read_lock().  */
>>>>>>  static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx)
>>>>>>  {
>>>>>> -uint16_t num_heads = vring_avail_idx(vq) - idx;
>>>>>> +uint16_t num_heads;
>>>>>> +
>>>>>> +if (vq->shadow_avail_idx != idx) {
>>>>>> +num_heads = vq->shadow_avail_idx - idx;
>>>>>> +
>>>>>> +return num_heads;
>>>>>
>>>>> This still needs to check num_heads > vq->vring.num and return -EINVAL
>>>>> as is done below.
>>>>
>>>> Hmm, yes, you're right.  If the value was incorrect initially, the shadow
>>>> will be incorrect.  However, I think we should just not return here in this
>>>> case and let vring_avail_idx() to grab an actual new value below.  
>>>> Otherwise
>>>> we may never break out of this error.
>>>>
>>>> Does that make sense?
>>>
>>> No, because virtio_error() marks the device as broken. The device
>>> requires a reset in order to function again. Fetching
>>> vring_avail_idx() again won't help.
>>
>> OK, I see.  In this case we're talking about situation where
>> vring_avail_idx() was called in some other place and stored a bad value
>> in the shadow variable, then virtqueue_num_heads() got called.  Right?

Hmm, I suppose we also need a read barrier after all even if we use
a shadow index.  Assuming the index is correct, but the shadow variable
was updated by a call outside of this function, then we may miss a
barrier and read the descriptor out of order, in theory.  Read barrier
is going to be a compiler barrier on x86, so the performance gain from
this patch should still be mostly there.  I'll test that.

>>
>> AFAIU, we can still just fall through here and let vring_avail_idx()
>> to read the index again and fail the existing check.  That would happen
>> today without this patch applied.
> 
> Yes, that is fine.
> 
>>
>> I'm just trying to avoid duplication of the virtio_error call, i.e.:
>>
>> if (vq->shadow_avail_idx != idx) {
>> num_heads = vq->shadow_avail_idx - idx;
>>
>> /* Check it isn't doing very strange things with descriptor numbers. 
>> */
>> if (num_heads > vq->vring.num) {
>> virtio_error(vq->vdev, "Guest moved used index from %u to %u",
>>  idx, vq->shadow_avail_idx);
>> return -EINVAL;
>> }
>> return num_heads;
>> }
>>
>> vs
>>
>> if (vq->shadow_avail_idx != idx) {
>> num_heads = vq->shadow_avail_idx - idx;
>>
>> /* Only use the shadow value if it was good initially. */
>> if (num_heads <= vq->vring.num) {
>> return num_heads;
>> }
>> }
>>
>>
>> What do you think?
> 
> Sounds good.
> 
>>
>> Best regards, Ilya Maximets.




Re: [PATCH] virtio: use shadow_avail_idx while checking number of heads

2023-09-25 Thread Ilya Maximets
On 9/25/23 23:24, Michael S. Tsirkin wrote:
> On Mon, Sep 25, 2023 at 10:58:05PM +0200, Ilya Maximets wrote:
>> On 9/25/23 17:38, Stefan Hajnoczi wrote:
>>> On Mon, 25 Sept 2023 at 11:36, Ilya Maximets  wrote:
>>>>
>>>> On 9/25/23 17:12, Stefan Hajnoczi wrote:
>>>>> On Mon, 25 Sept 2023 at 11:02, Ilya Maximets  wrote:
>>>>>>
>>>>>> On 9/25/23 16:23, Stefan Hajnoczi wrote:
>>>>>>> On Fri, 25 Aug 2023 at 13:04, Ilya Maximets  wrote:
>>>>>>>>
>>>>>>>> We do not need the most up to date number of heads, we only want to
>>>>>>>> know if there is at least one.
>>>>>>>>
>>>>>>>> Use shadow variable as long as it is not equal to the last available
>>>>>>>> index checked.  This avoids expensive qatomic dereference of the
>>>>>>>> RCU-protected memory region cache as well as the memory access itself
>>>>>>>> and the subsequent memory barrier.
>>>>>>>>
>>>>>>>> The change improves performance of the af-xdp network backend by 2-3%.
>>>>>>>>
>>>>>>>> Signed-off-by: Ilya Maximets 
>>>>>>>> ---
>>>>>>>>  hw/virtio/virtio.c | 10 +-
>>>>>>>>  1 file changed, 9 insertions(+), 1 deletion(-)
>>>>>>>>
>>>>>>>> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
>>>>>>>> index 309038fd46..04bf7cc977 100644
>>>>>>>> --- a/hw/virtio/virtio.c
>>>>>>>> +++ b/hw/virtio/virtio.c
>>>>>>>> @@ -999,7 +999,15 @@ void virtqueue_push(VirtQueue *vq, const 
>>>>>>>> VirtQueueElement *elem,
>>>>>>>>  /* Called within rcu_read_lock().  */
>>>>>>>>  static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx)
>>>>>>>>  {
>>>>>>>> -uint16_t num_heads = vring_avail_idx(vq) - idx;
>>>>>>>> +uint16_t num_heads;
>>>>>>>> +
>>>>>>>> +if (vq->shadow_avail_idx != idx) {
>>>>>>>> +num_heads = vq->shadow_avail_idx - idx;
>>>>>>>> +
>>>>>>>> +return num_heads;
>>>>>>>
>>>>>>> This still needs to check num_heads > vq->vring.num and return -EINVAL
>>>>>>> as is done below.
>>>>>>
>>>>>> Hmm, yes, you're right.  If the value was incorrect initially, the 
>>>>>> shadow
>>>>>> will be incorrect.  However, I think we should just not return here in 
>>>>>> this
>>>>>> case and let vring_avail_idx() to grab an actual new value below.  
>>>>>> Otherwise
>>>>>> we may never break out of this error.
>>>>>>
>>>>>> Does that make sense?
>>>>>
>>>>> No, because virtio_error() marks the device as broken. The device
>>>>> requires a reset in order to function again. Fetching
>>>>> vring_avail_idx() again won't help.
>>>>
>>>> OK, I see.  In this case we're talking about situation where
>>>> vring_avail_idx() was called in some other place and stored a bad value
>>>> in the shadow variable, then virtqueue_num_heads() got called.  Right?
>>
>> Hmm, I suppose we also need a read barrier after all even if we use
>> a shadow index.  Assuming the index is correct, but the shadow variable
>> was updated by a call outside of this function, then we may miss a
>> barrier and read the descriptor out of order, in theory.  Read barrier
>> is going to be a compiler barrier on x86, so the performance gain from
>> this patch should still be mostly there.  I'll test that.
> 
> I can't say I understand generally. shadow is under qemu control,
> I don't think it can be updated concurrently by multiple CPUs.

It can't, I agree.  Scenario I'm thinking about is the following:

1. vring_avail_idx() is called from one of the places other than
   virtqueue_num_heads().  Shadow is updated with the current value.
   Some users of vring_avail_idx() do not use barriers after the call.

2. virtqueue_split_get_avail_bytes() is called.

3. virtqueue_split_get_avail_bytes() calls virtqueue_num_heads().

4. virtqueue_num_heads() checks the shadow and returns early without the
   smp_rmb(), so the following descriptor read may be reordered with the
   avail index read from step 1.

[PATCH v2] virtio: use shadow_avail_idx while checking number of heads

2023-09-27 Thread Ilya Maximets
We do not need the most up to date number of heads, we only want to
know if there is at least one.

Use shadow variable as long as it is not equal to the last available
index checked.  This avoids expensive qatomic dereference of the
RCU-protected memory region cache as well as the memory access itself.

The change improves performance of the af-xdp network backend by 2-3%.

Signed-off-by: Ilya Maximets 
---

Version 2:
  - Changed to not skip error checks and a barrier.
  - Added comments about the need for a barrier.

 hw/virtio/virtio.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 4577f3f5b3..8a4c3e95d2 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -999,7 +999,12 @@ void virtqueue_push(VirtQueue *vq, const VirtQueueElement 
*elem,
 /* Called within rcu_read_lock().  */
 static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx)
 {
-uint16_t num_heads = vring_avail_idx(vq) - idx;
+uint16_t avail_idx, num_heads;
+
+/* Use shadow index whenever possible. */
+avail_idx = (vq->shadow_avail_idx != idx) ? vq->shadow_avail_idx
+  : vring_avail_idx(vq);
+num_heads = avail_idx - idx;
 
 /* Check it isn't doing very strange things with descriptor numbers. */
 if (num_heads > vq->vring.num) {
@@ -1007,8 +1012,15 @@ static int virtqueue_num_heads(VirtQueue *vq, unsigned 
int idx)
  idx, vq->shadow_avail_idx);
 return -EINVAL;
 }
-/* On success, callers read a descriptor at vq->last_avail_idx.
- * Make sure descriptor read does not bypass avail index read. */
+/*
+ * On success, callers read a descriptor at vq->last_avail_idx.
+ * Make sure descriptor read does not bypass avail index read.
+ *
+ * This is necessary even if we are using a shadow index, since
+ * the shadow index could have been initialized by calling
+ * vring_avail_idx() outside of this function, i.e., by a guest
+ * memory read not accompanied by a barrier.
+ */
 if (num_heads) {
 smp_rmb();
 }
-- 
2.41.0




[PATCH v2 2/2] virtio: remove unused next argument from virtqueue_split_read_next_desc()

2023-09-27 Thread Ilya Maximets
The 'next' was converted from a local variable to an output parameter
in commit:
  412e0e81b174 ("virtio: handle virtqueue_read_next_desc() errors")

But all the actual uses of the 'i/next' as an output were removed a few
months prior in commit:
  aa570d6fb6bd ("virtio: combine the read of a descriptor")

Remove the unused argument to simplify the code.

Also, add a comment to the function describing what it actually does,
as it is not obvious that 'desc' is both an input and an output
argument.

Signed-off-by: Ilya Maximets 
---
 hw/virtio/virtio.c | 18 --
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 37584aaa74..bc0388d45b 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1039,9 +1039,10 @@ enum {
 VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
 };
 
+/* Reads the 'desc->next' descriptor into '*desc'. */
 static int virtqueue_split_read_next_desc(VirtIODevice *vdev, VRingDesc *desc,
   MemoryRegionCache *desc_cache,
-  unsigned int max, unsigned int *next)
+  unsigned int max)
 {
 /* If this descriptor says it doesn't chain, we're done. */
 if (!(desc->flags & VRING_DESC_F_NEXT)) {
@@ -1049,14 +1050,12 @@ static int virtqueue_split_read_next_desc(VirtIODevice 
*vdev, VRingDesc *desc,
 }
 
 /* Check they're not leading us off end of descriptors. */
-*next = desc->next;
-
-if (*next >= max) {
-virtio_error(vdev, "Desc next is %u", *next);
+if (desc->next >= max) {
+virtio_error(vdev, "Desc next is %u", desc->next);
 return VIRTQUEUE_READ_DESC_ERROR;
 }
 
-vring_split_desc_read(vdev, desc, desc_cache, *next);
+vring_split_desc_read(vdev, desc, desc_cache, desc->next);
 return VIRTQUEUE_READ_DESC_MORE;
 }
 
@@ -1134,7 +1133,7 @@ static void virtqueue_split_get_avail_bytes(VirtQueue *vq,
 goto done;
 }
 
-rc = virtqueue_split_read_next_desc(vdev, &desc, desc_cache, max, 
&i);
+rc = virtqueue_split_read_next_desc(vdev, &desc, desc_cache, max);
 } while (rc == VIRTQUEUE_READ_DESC_MORE);
 
 if (rc == VIRTQUEUE_READ_DESC_ERROR) {
@@ -1585,7 +1584,7 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
 goto err_undo_map;
 }
 
-rc = virtqueue_split_read_next_desc(vdev, &desc, desc_cache, max, &i);
+rc = virtqueue_split_read_next_desc(vdev, &desc, desc_cache, max);
 } while (rc == VIRTQUEUE_READ_DESC_MORE);
 
 if (rc == VIRTQUEUE_READ_DESC_ERROR) {
@@ -4039,8 +4038,7 @@ VirtioQueueElement 
*qmp_x_query_virtio_queue_element(const char *path,
 list = node;
 
 ndescs++;
-rc = virtqueue_split_read_next_desc(vdev, &desc, desc_cache,
-max, &i);
+rc = virtqueue_split_read_next_desc(vdev, &desc, desc_cache, max);
 } while (rc == VIRTQUEUE_READ_DESC_MORE);
 element->descs = list;
 done:
-- 
2.41.0




[PATCH v2 1/2] virtio: remove unnecessary thread fence while reading next descriptor

2023-09-27 Thread Ilya Maximets
It was supposed to be a compiler barrier, and it started out as one,
initially called 'wmb', when virtio core support was introduced.
Later all the instances of 'wmb' were switched to smp_wmb to fix memory
ordering issues on non-x86 platforms.  However, this one doesn't need
to be an actual barrier, as its only purpose was to ensure that the
value is not read twice.

And since commit aa570d6fb6bd ("virtio: combine the read of a descriptor")
there is no need for a barrier at all, since we're no longer reading
guest memory here, but accessing a local structure.

Signed-off-by: Ilya Maximets 
---
 hw/virtio/virtio.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 4577f3f5b3..37584aaa74 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1050,8 +1050,6 @@ static int virtqueue_split_read_next_desc(VirtIODevice 
*vdev, VRingDesc *desc,
 
 /* Check they're not leading us off end of descriptors. */
 *next = desc->next;
-/* Make sure compiler knows to grab that: we don't want it changing! */
-smp_wmb();
 
 if (*next >= max) {
 virtio_error(vdev, "Desc next is %u", *next);
-- 
2.41.0




[PATCH v2 0/2] virtio: clean up of virtqueue_split_read_next_desc()

2023-09-27 Thread Ilya Maximets


Version 2:
  - Converted into a patch set adding a new patch that removes the
'next' argument.  [Stefan]
  - Completely removing the barrier instead of changing into compiler
barrier.  [Stefan]


Ilya Maximets (2):
  virtio: remove unnecessary thread fence while reading next descriptor
  virtio: remove unused next argument from
virtqueue_split_read_next_desc()

 hw/virtio/virtio.c | 20 
 1 file changed, 8 insertions(+), 12 deletions(-)

-- 
2.41.0




Re: [PATCH] virtio: use shadow_avail_idx while checking number of heads

2023-09-27 Thread Ilya Maximets
On 9/26/23 00:24, Michael S. Tsirkin wrote:
> On Tue, Sep 26, 2023 at 12:13:11AM +0200, Ilya Maximets wrote:
>> On 9/25/23 23:24, Michael S. Tsirkin wrote:
>>> On Mon, Sep 25, 2023 at 10:58:05PM +0200, Ilya Maximets wrote:
>>>> On 9/25/23 17:38, Stefan Hajnoczi wrote:
>>>>> On Mon, 25 Sept 2023 at 11:36, Ilya Maximets  wrote:
>>>>>>
>>>>>> On 9/25/23 17:12, Stefan Hajnoczi wrote:
>>>>>>> On Mon, 25 Sept 2023 at 11:02, Ilya Maximets  wrote:
>>>>>>>>
>>>>>>>> On 9/25/23 16:23, Stefan Hajnoczi wrote:
>>>>>>>>> On Fri, 25 Aug 2023 at 13:04, Ilya Maximets  
>>>>>>>>> wrote:
>>>>>>>>>>
>>>>>>>>>> We do not need the most up to date number of heads, we only want to
>>>>>>>>>> know if there is at least one.
>>>>>>>>>>
>>>>>>>>>> Use shadow variable as long as it is not equal to the last available
>>>>>>>>>> index checked.  This avoids expensive qatomic dereference of the
>>>>>>>>>> RCU-protected memory region cache as well as the memory access itself
>>>>>>>>>> and the subsequent memory barrier.
>>>>>>>>>>
>>>>>>>>>> The change improves performance of the af-xdp network backend by 
>>>>>>>>>> 2-3%.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Ilya Maximets 
>>>>>>>>>> ---
>>>>>>>>>>  hw/virtio/virtio.c | 10 +-
>>>>>>>>>>  1 file changed, 9 insertions(+), 1 deletion(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
>>>>>>>>>> index 309038fd46..04bf7cc977 100644
>>>>>>>>>> --- a/hw/virtio/virtio.c
>>>>>>>>>> +++ b/hw/virtio/virtio.c
>>>>>>>>>> @@ -999,7 +999,15 @@ void virtqueue_push(VirtQueue *vq, const 
>>>>>>>>>> VirtQueueElement *elem,
>>>>>>>>>>  /* Called within rcu_read_lock().  */
>>>>>>>>>>  static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx)
>>>>>>>>>>  {
>>>>>>>>>> -uint16_t num_heads = vring_avail_idx(vq) - idx;
>>>>>>>>>> +uint16_t num_heads;
>>>>>>>>>> +
>>>>>>>>>> +if (vq->shadow_avail_idx != idx) {
>>>>>>>>>> +num_heads = vq->shadow_avail_idx - idx;
>>>>>>>>>> +
>>>>>>>>>> +return num_heads;
>>>>>>>>>
>>>>>>>>> This still needs to check num_heads > vq->vring.num and return -EINVAL
>>>>>>>>> as is done below.
>>>>>>>>
>>>>>>>> Hmm, yes, you're right.  If the value was incorrect initially, the 
>>>>>>>> shadow
>>>>>>>> will be incorrect.  However, I think we should just not return here in 
>>>>>>>> this
>>>>>>>> case and let vring_avail_idx() to grab an actual new value below.  
>>>>>>>> Otherwise
>>>>>>>> we may never break out of this error.
>>>>>>>>
>>>>>>>> Does that make sense?
>>>>>>>
>>>>>>> No, because virtio_error() marks the device as broken. The device
>>>>>>> requires a reset in order to function again. Fetching
>>>>>>> vring_avail_idx() again won't help.
>>>>>>
>>>>>> OK, I see.  In this case we're talking about situation where
>>>>>> vring_avail_idx() was called in some other place and stored a bad value
>>>>>> in the shadow variable, then virtqueue_num_heads() got called.  Right?
>>>>
>>>> Hmm, I suppose we also need a read barrier after all even if we use
>>>> a shadow index.  Assuming the index is correct, but the shadow variable
>>>> was updated by a call outside of this function, then we may miss a
>>>> barrier and read the descriptor out of order, in theory.  Read barrier
>>>

Re: [PATCH] virtio: remove unnecessary thread fence while reading next descriptor

2023-09-27 Thread Ilya Maximets
On 9/25/23 20:04, Ilya Maximets wrote:
> On 9/25/23 16:32, Stefan Hajnoczi wrote:
>> On Fri, 25 Aug 2023 at 13:02, Ilya Maximets  wrote:
>>>
>>> It was supposed to be a compiler barrier and it was a compiler barrier
>>> initially called 'wmb' (??) when virtio core support was introduced.
>>> Later all the instances of 'wmb' were switched to smp_wmb to fix memory
>>> ordering issues on non-x86 platforms.  However, this one doesn't need
>>> to be an actual barrier.  It's enough for it to stay a compiler barrier
>>> as its only purpose is to ensure that the value is not read twice.
>>>
>>> There is no counterpart read barrier in the drivers, AFAICT.  And even
>>> if we needed an actual barrier, it shouldn't have been a write barrier.
>>>
>>> Signed-off-by: Ilya Maximets 
>>> ---
>>>  hw/virtio/virtio.c | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
>>> index 309038fd46..6eb8586858 100644
>>> --- a/hw/virtio/virtio.c
>>> +++ b/hw/virtio/virtio.c
>>> @@ -1051,7 +1051,7 @@ static int 
>>> virtqueue_split_read_next_desc(VirtIODevice *vdev, VRingDesc *desc,
>>>  /* Check they're not leading us off end of descriptors. */
>>>  *next = desc->next;
>>
>> I don't see a caller that uses *next. Can the argument be eliminated?
> 
> Yes, it can.  The 'next' was converted from a local variable to
> an output parameter in commit:
>   412e0e81b174 ("virtio: handle virtqueue_read_next_desc() errors")
> 
> And that didn't actually make sense even then, because all the
> actual uses of the 'i/next' as an output were removed a few months
> prior in commit:
>   aa570d6fb6bd ("virtio: combine the read of a descriptor")
> 
> I can post a separate patch for this.
> 
>>
>>>  /* Make sure compiler knows to grab that: we don't want it changing! */
>>> -smp_wmb();
>>> +barrier();
>>
>> What is the purpose of this barrier? desc is not guest memory and
>> nothing modifies desc's fields while this function is executing. I
>> think the barrier can be removed.
> 
> True.  In fact, that was the first thing I did, but then the comment
> derailed me into thinking that it somehow can be updated concurrently,
> so I went with a safer option.  :/
> It is indeed a local variable and the barrier is not needed today.
> It had a little more sense before the previously mentioned commit:
>   aa570d6fb6bd ("virtio: combine the read of a descriptor")
> because we were reading guest memory before the barrier and used the
> result after.
> 
> I'll remove it.

Converted this into a cleanup patch set.  Posted here:
  https://lists.gnu.org/archive/html/qemu-devel/2023-09/msg06780.html

Best regards, Ilya Maximets.




Re: [PATCH] virtio: remove unnecessary thread fence while reading next descriptor

2023-09-27 Thread Ilya Maximets
On 9/27/23 17:41, Michael S. Tsirkin wrote:
> On Wed, Sep 27, 2023 at 04:06:41PM +0200, Ilya Maximets wrote:
>> On 9/25/23 20:04, Ilya Maximets wrote:
>>> On 9/25/23 16:32, Stefan Hajnoczi wrote:
>>>> On Fri, 25 Aug 2023 at 13:02, Ilya Maximets  wrote:
>>>>>
>>>>> It was supposed to be a compiler barrier and it was a compiler barrier
>>>>> initially called 'wmb' (??) when virtio core support was introduced.
>>>>> Later all the instances of 'wmb' were switched to smp_wmb to fix memory
>>>>> ordering issues on non-x86 platforms.  However, this one doesn't need
>>>>> to be an actual barrier.  It's enough for it to stay a compiler barrier
>>>>> as its only purpose is to ensure that the value is not read twice.
>>>>>
>>>>> There is no counterpart read barrier in the drivers, AFAICT.  And even
>>>>> if we needed an actual barrier, it shouldn't have been a write barrier.
>>>>>
>>>>> Signed-off-by: Ilya Maximets 
>>>>> ---
>>>>>  hw/virtio/virtio.c | 2 +-
>>>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
>>>>> index 309038fd46..6eb8586858 100644
>>>>> --- a/hw/virtio/virtio.c
>>>>> +++ b/hw/virtio/virtio.c
>>>>> @@ -1051,7 +1051,7 @@ static int 
>>>>> virtqueue_split_read_next_desc(VirtIODevice *vdev, VRingDesc *desc,
>>>>>  /* Check they're not leading us off end of descriptors. */
>>>>>  *next = desc->next;
>>>>
>>>> I don't see a caller that uses *next. Can the argument be eliminated?
>>>
>>> Yes, it can.  The 'next' was converted from a local variable to
>>> an output parameter in commit:
>>>   412e0e81b174 ("virtio: handle virtqueue_read_next_desc() errors")
>>>
>>> And that didn't actually make sense even then, because all the
>>> actual uses of the 'i/next' as an output were removed a few months
>>> prior in commit:
>>>   aa570d6fb6bd ("virtio: combine the read of a descriptor")
>>>
>>> I can post a separate patch for this.
>>>
>>>>
>>>>>  /* Make sure compiler knows to grab that: we don't want it changing! 
>>>>> */
>>>>> -smp_wmb();
>>>>> +barrier();
>>>>
>>>> What is the purpose of this barrier? desc is not guest memory and
>>>> nothing modifies desc's fields while this function is executing. I
>>>> think the barrier can be removed.
>>>
>>> True.  In fact, that was the first thing I did, but then the comment
>>> derailed me into thinking that it somehow can be updated concurrently,
>>> so I went with a safer option.  :/
>>> It is indeed a local variable and the barrier is not needed today.
>>> It had a little more sense before the previously mentioned commit:
>>>   aa570d6fb6bd ("virtio: combine the read of a descriptor")
>>> because we were reading guest memory before the barrier and used the
>>> result after.
>>>
>>> I'll remove it.
>>
>> Converted this into a cleanup patch set.  Posted here:
>>   https://lists.gnu.org/archive/html/qemu-devel/2023-09/msg06780.html
>>
>> Best regards, Ilya Maximets.
> 
> Ugh, these archives are useless. use lore please. 
> 

OK.  The lore link is the following:
  
https://lore.kernel.org/qemu-devel/20230927140016.2317404-1-i.maxim...@ovn.org/

Best regards, Ilya Maximets.



Re: [PULL 00/17] Net patches

2023-09-13 Thread Ilya Maximets
On 9/8/23 16:15, Daniel P. Berrangé wrote:
> On Fri, Sep 08, 2023 at 04:06:35PM +0200, Ilya Maximets wrote:
>> On 9/8/23 14:15, Daniel P. Berrangé wrote:
>>> On Fri, Sep 08, 2023 at 02:00:47PM +0200, Ilya Maximets wrote:
>>>> On 9/8/23 13:49, Daniel P. Berrangé wrote:
>>>>> On Fri, Sep 08, 2023 at 01:34:54PM +0200, Ilya Maximets wrote:
>>>>>> On 9/8/23 13:19, Stefan Hajnoczi wrote:
>>>>>>> Hi Ilya and Jason,
>>>>>>> There is a CI failure related to a missing Debian libxdp-dev package:
>>>>>>> https://gitlab.com/qemu-project/qemu/-/jobs/5046139967
>>>>>>>
>>>>>>> I think the issue is that the debian-amd64 container image that QEMU
>>>>>>> uses for testing is based on Debian 11 ("bullseye" aka "oldstable")
>>>>>>> and libxdp is not available on that release:
>>>>>>> https://packages.debian.org/search?keywords=libxdp&searchon=names&suite=oldstable&section=all
>>>>>>
>>>>>> Hmm.  Sorry about that.
>>>>>>
>>>>>>>
>>>>>>> If we need to support Debian 11 CI then either XDP could be disabled
>>>>>>> for that distro or libxdp could be compiled from source.
>>>>>>
>>>>>> I'd suggest we just remove the attempt to install the package for now,
>>>>>> building libxdp from sources may be a little painful to maintain.
>>>>>>
>>>>>> Can be re-added later once distributions with libxdp 1.4+ will be more
>>>>>> widely available, i.e. when fedora dockerfile will be updated to 39,
>>>>>> for example.  That should be soon-ish, right?
>>>>>
>>>>> If you follow the process in docs/devel/testing.rst for adding
>>>>> libxdp in libvirt-ci, then lcitool will "do the right thing"
>>>>> when we move the auto-generated dockerfiles to new distro versions.
>>>>
>>>> Thanks!  I'll prepare changes for libvirt-ci.
>>>>
>>>> In the meantime, none of the currently tested images will have a required
>>>> version of libxdp anyway, so I'm suggesting to just drop this one 
>>>> dockerfile
>>>> modification from the patch.  What do you think?
>>>
>>> Sure, if none of the distros have it, then lcitool won't emit the
>>> dockerfile changes until we update the inherited distro version.
>>> So it is sufficient to just update libvirt-ci.git with the mappings.yml
>>> info for libxdp, and add 'libxdp' to the tests/lcitool/projects/qemu.yml
>>> file in qemu.git. It will then 'just work' when someone updates the
>>> distro versions later.
>>
>> I posted an MR for libvirt-ci adding libxdp:
>>   https://gitlab.com/libvirt/libvirt-ci/-/merge_requests/429
>>
>> Please, take a look.
>>
>> The docs say that CI will try to build containers with the MR changes,
>> but I don't think anything except sanity checks is actually tested on MR.
>> Sorry if I missed something, never used GitLab pipelines before.
> 
> No, that's our fault - we've broken the CI and your change alerted
> me to that fact :-)
> 
>> Note that with this update we will be installing older version of libxdp
>> in many containers, even though they will not be used by QEMU, unless
>> they are newer than 1.4.0.
> 
> No problem, as it means QEMU CI will demonstrate that the meson.build
> change is ignoring the outdated libxdp.
> 
>> tests/lcitool/projects/qemu.yml in qemu.git cannot be updated without
>> updating a submodule after the MR merge.
> 
> Yep.

Since all the required changes went into libvirt-ci project, I posted an
updated patch set named:

  '[PATCH v4 0/2] net: add initial support for AF_XDP network backend'

Please, take a look.

This should fix the CI issues, though I'm not sure how to run QEMU gitlab
pipelines myself, so I didn't actually test all the images.

Sent as a patch set because the libvirt-ci submodule bump brings in a few
unrelated changes.  So, I split that into a separate patch.

Best regards, Ilya Maximets.



Re: [PULL 00/17] Net patches

2023-09-18 Thread Ilya Maximets
On 9/14/23 10:13, Daniel P. Berrangé wrote:
> On Wed, Sep 13, 2023 at 08:46:42PM +0200, Ilya Maximets wrote:
>> On 9/8/23 16:15, Daniel P. Berrangé wrote:
>>> On Fri, Sep 08, 2023 at 04:06:35PM +0200, Ilya Maximets wrote:
>>>> On 9/8/23 14:15, Daniel P. Berrangé wrote:
>>>>> On Fri, Sep 08, 2023 at 02:00:47PM +0200, Ilya Maximets wrote:
>>>>>> On 9/8/23 13:49, Daniel P. Berrangé wrote:
>>>>>>> On Fri, Sep 08, 2023 at 01:34:54PM +0200, Ilya Maximets wrote:
>>>>>>>> On 9/8/23 13:19, Stefan Hajnoczi wrote:
>>>>>>>>> Hi Ilya and Jason,
>>>>>>>>> There is a CI failure related to a missing Debian libxdp-dev package:
>>>>>>>>> https://gitlab.com/qemu-project/qemu/-/jobs/5046139967
>>>>>>>>>
>>>>>>>>> I think the issue is that the debian-amd64 container image that QEMU
>>>>>>>>> uses for testing is based on Debian 11 ("bullseye" aka "oldstable")
>>>>>>>>> and libxdp is not available on that release:
>>>>>>>>> https://packages.debian.org/search?keywords=libxdp&searchon=names&suite=oldstable&section=all
>>>>>>>>
>>>>>>>> Hmm.  Sorry about that.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> If we need to support Debian 11 CI then either XDP could be disabled
>>>>>>>>> for that distro or libxdp could be compiled from source.
>>>>>>>>
>>>>>>>> I'd suggest we just remove the attempt to install the package for now,
>>>>>>>> building libxdp from sources may be a little painful to maintain.
>>>>>>>>
>>>>>>>> Can be re-added later once distributions with libxdp 1.4+ will be more
>>>>>>>> widely available, i.e. when fedora dockerfile will be updated to 39,
>>>>>>>> for example.  That should be soon-ish, right?
>>>>>>>
>>>>>>> If you follow the process in docs/devel/testing.rst for adding
>>>>>>> libxdp in libvirt-ci, then lcitool will "do the right thing"
>>>>>>> when we move the auto-generated dockerfiles to new distro versions.
>>>>>>
>>>>>> Thanks!  I'll prepare changes for libvirt-ci.
>>>>>>
>>>>>> In the meantime, none of the currently tested images will have a required
>>>>>> version of libxdp anyway, so I'm suggesting to just drop this one 
>>>>>> dockerfile
>>>>>> modification from the patch.  What do you think?
>>>>>
>>>>> Sure, if none of the distros have it, then lcitool won't emit the
>>>>> dockerfile changes until we update the inherited distro version.
>>>>> So it is sufficient to just update libvirt-ci.git with the mappings.yml
>>>>> info for libxdp, and add 'libxdp' to the tests/lcitool/projects/qemu.yml
>>>>> file in qemu.git. It will then 'just work' when someone updates the
>>>>> distro versions later.
>>>>
>>>> I posted an MR for libvirt-ci adding libxdp:
>>>>   https://gitlab.com/libvirt/libvirt-ci/-/merge_requests/429
>>>>
>>>> Please, take a look.
>>>>
>>>> The docs say that CI will try to build containers with the MR changes,
>>>> but I don't think anything except sanity checks is actually tested on MR.
>>>> Sorry if I missed something, never used GitLab pipelines before.
>>>
>>> No, that's our fault - we've broken the CI and your change alerted
>>> me to that fact :-)
>>>
>>>> Note that with this update we will be installing older version of libxdp
>>>> in many containers, even though they will not be used by QEMU, unless
>>>> they are newer than 1.4.0.
>>>
>>> No problem, as it means QEMU CI will demonstrate that the meson.build
>>> change is ignoring the outdated libxdp.
>>>
>>>> tests/lcitool/projects/qemu.yml in qemu.git cannot be updated without
>>>> updating a submodule after the MR merge.
>>>
>>> Yep.
>>
>> Since all the required changes went into libvirt-ci project, I posted an
>> updated patch set named:
>>
>>   '[PATCH v4 0/2] net: add initial support for AF_XDP network backend'
>>
>> Please, take a look.
>>
>> This should fix the CI issues, though I'm not sure how to run QEMU gitlab
>> pipelines myself, so I didn't actually test all the images.
> 
>   git push gitlab  -o ci.variable=QEMU_CI=2
> 
> will create pipeline and immediately run all jobs.

Thanks!  That worked.  Though I wasn't able to test much anyway as
this thing burned through all my free compute credits less than
half way through the pipeline. :D

So, AFAIU, it's not something an occasional contributor like me can
use, unless they are spending their own money.


> 
> Replace 'gitlab' with the name of the git remote pointing to your
> gitlab fork of QEMU.
> 
> Using QEMU_CI=1 will create pipeline, but let you manually start
> individual jobs from the web UI.
> 
> For further details see docs/devel/ci-jobs.rst.inc
> 
> 
>>
>> Sent as a patch set because the libvirt-ci submodule bump brings in a few
>> unrelated changes.  So, I split that into a separate patch.
> 
> Yep, that's perfect thanks.
> 
> With regards,
> Daniel




Re: [PULL 00/17] Net patches

2023-09-19 Thread Ilya Maximets
On 9/19/23 10:40, Daniel P. Berrangé wrote:
> On Mon, Sep 18, 2023 at 09:36:10PM +0200, Ilya Maximets wrote:
>> On 9/14/23 10:13, Daniel P. Berrangé wrote:
>>> On Wed, Sep 13, 2023 at 08:46:42PM +0200, Ilya Maximets wrote:
>>>> On 9/8/23 16:15, Daniel P. Berrangé wrote:
>>>>> On Fri, Sep 08, 2023 at 04:06:35PM +0200, Ilya Maximets wrote:
>>>>>> On 9/8/23 14:15, Daniel P. Berrangé wrote:
>>>>>>> On Fri, Sep 08, 2023 at 02:00:47PM +0200, Ilya Maximets wrote:
>>>>>>>> On 9/8/23 13:49, Daniel P. Berrangé wrote:
>>>>>>>>> On Fri, Sep 08, 2023 at 01:34:54PM +0200, Ilya Maximets wrote:
>>>>>>>>>> On 9/8/23 13:19, Stefan Hajnoczi wrote:
>>>>>>>>>>> Hi Ilya and Jason,
>>>>>>>>>>> There is a CI failure related to a missing Debian libxdp-dev 
>>>>>>>>>>> package:
>>>>>>>>>>> https://gitlab.com/qemu-project/qemu/-/jobs/5046139967
>>>>>>>>>>>
>>>>>>>>>>> I think the issue is that the debian-amd64 container image that QEMU
>>>>>>>>>>> uses for testing is based on Debian 11 ("bullseye" aka "oldstable")
>>>>>>>>>>> and libxdp is not available on that release:
>>>>>>>>>>> https://packages.debian.org/search?keywords=libxdp&searchon=names&suite=oldstable&section=all
>>>>>>>>>>
>>>>>>>>>> Hmm.  Sorry about that.
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> If we need to support Debian 11 CI then either XDP could be disabled
>>>>>>>>>>> for that distro or libxdp could be compiled from source.
>>>>>>>>>>
>>>>>>>>>> I'd suggest we just remove the attempt to install the package for 
>>>>>>>>>> now,
>>>>>>>>>> building libxdp from sources may be a little painful to maintain.
>>>>>>>>>>
>>>>>>>>>> Can be re-added later once distributions with libxdp 1.4+ will be 
>>>>>>>>>> more
>>>>>>>>>> widely available, i.e. when fedora dockerfile will be updated to 39,
>>>>>>>>>> for example.  That should be soon-ish, right?
>>>>>>>>>
>>>>>>>>> If you follow the process in docs/devel/testing.rst for adding
>>>>>>>>> libxdp in libvirt-ci, then lcitool will "do the right thing"
>>>>>>>>> when we move the auto-generated dockerfiles to new distro versions.
>>>>>>>>
>>>>>>>> Thanks!  I'll prepare changes for libvirt-ci.
>>>>>>>>
>>>>>>>> In the meantime, none of the currently tested images will have a 
>>>>>>>> required
>>>>>>>> version of libxdp anyway, so I'm suggesting to just drop this one 
>>>>>>>> dockerfile
>>>>>>>> modification from the patch.  What do you think?
>>>>>>>
>>>>>>> Sure, if none of the distros have it, then lcitool won't emit the
>>>>>>> dockerfile changes until we update the inherited distro version.
>>>>>>> So it is sufficient to just update libvirt-ci.git with the mappings.yml
>>>>>>> info for libxdp, and add 'libxdp' to the tests/lcitool/projects/qemu.yml
>>>>>>> file in qemu.git. It will then 'just work' when someone updates the
>>>>>>> distro versions later.
>>>>>>
>>>>>> I posted an MR for libvirt-ci adding libxdp:
>>>>>>   https://gitlab.com/libvirt/libvirt-ci/-/merge_requests/429
>>>>>>
>>>>>> Please, take a look.
>>>>>>
>>>>>> The docs say that CI will try to build containers with the MR changes,
>>>>>> but I don't think anything except sanity checks is actually tested on MR.
>>>>>> Sorry if I missed something, never used GitLab pipelines before.
>>>>>
>>>>> No, that's our fault - we've broken the CI and your change alerted
>>>>> me to that fact :-)
>>>>>
>>>>>

Re: [PATCH v2] virtio: don't zero out memory region cache for indirect descriptors

2023-09-25 Thread Ilya Maximets
On 8/11/23 16:34, Ilya Maximets wrote:
> Lots of virtio functions that are on a hot path in data transmission
> are initializing indirect descriptor cache at the point of stack
> allocation.  It's a 112 byte structure that is getting zeroed out on
> each call adding unnecessary overhead.  It's going to be correctly
> initialized later via special init function.  The only reason to
> actually initialize right away is the ability to safely destruct it.
> Replacing a designated initializer with a function to only initialize
> what is necessary.
> 
> Removal of the unnecessary stack initializations improves throughput
> of virtio-net devices in terms of 64B packets per second by 6-14 %
> depending on the case.  Tested with a proposed af-xdp network backend
> and a dpdk testpmd application in the guest, but should be beneficial
> for other virtio devices as well.
> 
> Signed-off-by: Ilya Maximets 
> ---
> 
> Version 2:
> 
>   * Introduced an initialization function, so we don't need to compare
> pointers in the end. [Stefan]
>   * Removed the now unused macro. [Jason]
> 
>  hw/virtio/virtio.c| 20 +++-
>  include/exec/memory.h | 16 +---
>  2 files changed, 28 insertions(+), 8 deletions(-)

Kind reminder.

This patch was posted quite some time ago and it has 2 reviewed/acked-by tags.
Just making sure it didn't fall through the cracks.

Best regards, Ilya Maximets.



Re: [PATCH] virtio: use shadow_avail_idx while checking number of heads

2023-09-25 Thread Ilya Maximets
On 8/25/23 19:04, Ilya Maximets wrote:
> We do not need the most up to date number of heads, we only want to
> know if there is at least one.
> 
> Use shadow variable as long as it is not equal to the last available
> index checked.  This avoids expensive qatomic dereference of the
> RCU-protected memory region cache as well as the memory access itself
> and the subsequent memory barrier.
> 
> The change improves performance of the af-xdp network backend by 2-3%.
> 
> Signed-off-by: Ilya Maximets 
> ---

Kind reminder.

Best regards, Ilya Maximets.

>  hw/virtio/virtio.c | 10 +-
>  1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 309038fd46..04bf7cc977 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -999,7 +999,15 @@ void virtqueue_push(VirtQueue *vq, const 
> VirtQueueElement *elem,
>  /* Called within rcu_read_lock().  */
>  static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx)
>  {
> -uint16_t num_heads = vring_avail_idx(vq) - idx;
> +uint16_t num_heads;
> +
> +if (vq->shadow_avail_idx != idx) {
> +num_heads = vq->shadow_avail_idx - idx;
> +
> +return num_heads;
> +}
> +
> +num_heads = vring_avail_idx(vq) - idx;
>  
>  /* Check it isn't doing very strange things with descriptor numbers. */
>  if (num_heads > vq->vring.num) {




Re: [PATCH] virtio: remove unnecessary thread fence while reading next descriptor

2023-09-25 Thread Ilya Maximets
On 8/25/23 19:01, Ilya Maximets wrote:
> It was supposed to be a compiler barrier and it was a compiler barrier
> initially called 'wmb' (??) when virtio core support was introduced.
> Later all the instances of 'wmb' were switched to smp_wmb to fix memory
> ordering issues on non-x86 platforms.  However, this one doesn't need
> to be an actual barrier.  It's enough for it to stay a compiler barrier
> as its only purpose is to ensure that the value is not read twice.
> 
> There is no counterpart read barrier in the drivers, AFAICT.  And even
> if we needed an actual barrier, it shouldn't have been a write barrier.
> 
> Signed-off-by: Ilya Maximets 
> ---

Kind reminder.

Best regards, Ilya Maximets.

>  hw/virtio/virtio.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 309038fd46..6eb8586858 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -1051,7 +1051,7 @@ static int virtqueue_split_read_next_desc(VirtIODevice 
> *vdev, VRingDesc *desc,
>  /* Check they're not leading us off end of descriptors. */
>  *next = desc->next;
>  /* Make sure compiler knows to grab that: we don't want it changing! */
> -smp_wmb();
> +barrier();
>  
>  if (*next >= max) {
>  virtio_error(vdev, "Desc next is %u", *next);




[PATCH] virtio: don't zero out memory region cache for indirect descriptors

2023-08-07 Thread Ilya Maximets
Lots of virtio functions that are on a hot path in data transmission
are initializing indirect descriptor cache at the point of stack
allocation.  It's a 112 byte structure that is getting zeroed out on
each call adding unnecessary overhead.  It's going to be correctly
initialized later via special init function.  The only reason to
actually initialize right away is the ability to safely destruct it.
However, we only need to destruct it when it was used, i.e. when a
desc_cache points to it.

Removing these unnecessary stack initializations improves throughput
of virtio-net devices in terms of 64B packets per second by 6-14 %
depending on the case.  Tested with a proposed af-xdp network backend
and a dpdk testpmd application in the guest, but should be beneficial
for other virtio devices as well.

Signed-off-by: Ilya Maximets 
---
 hw/virtio/virtio.c | 42 +++---
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 309038fd46..a65396e616 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1071,7 +1071,8 @@ static void virtqueue_split_get_avail_bytes(VirtQueue *vq,
 VirtIODevice *vdev = vq->vdev;
 unsigned int idx;
 unsigned int total_bufs, in_total, out_total;
-MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
+MemoryRegionCache indirect_desc_cache;
+MemoryRegionCache *desc_cache = NULL;
 int64_t len = 0;
 int rc;
 
@@ -1079,7 +1080,6 @@ static void virtqueue_split_get_avail_bytes(VirtQueue *vq,
 total_bufs = in_total = out_total = 0;
 
 while ((rc = virtqueue_num_heads(vq, idx)) > 0) {
-MemoryRegionCache *desc_cache = &caches->desc;
 unsigned int num_bufs;
 VRingDesc desc;
 unsigned int i;
@@ -1091,6 +1091,8 @@ static void virtqueue_split_get_avail_bytes(VirtQueue *vq,
 goto err;
 }
 
+desc_cache = &caches->desc;
+
 vring_split_desc_read(vdev, &desc, desc_cache, i);
 
 if (desc.flags & VRING_DESC_F_INDIRECT) {
@@ -1156,7 +1158,9 @@ static void virtqueue_split_get_avail_bytes(VirtQueue *vq,
 }
 
 done:
-address_space_cache_destroy(&indirect_desc_cache);
+if (desc_cache == &indirect_desc_cache) {
+address_space_cache_destroy(&indirect_desc_cache);
+}
 if (in_bytes) {
 *in_bytes = in_total;
 }
@@ -1207,8 +1211,8 @@ static void virtqueue_packed_get_avail_bytes(VirtQueue 
*vq,
 VirtIODevice *vdev = vq->vdev;
 unsigned int idx;
 unsigned int total_bufs, in_total, out_total;
-MemoryRegionCache *desc_cache;
-MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
+MemoryRegionCache indirect_desc_cache;
+MemoryRegionCache *desc_cache = NULL;
 int64_t len = 0;
 VRingPackedDesc desc;
 bool wrap_counter;
@@ -1297,7 +1301,9 @@ static void virtqueue_packed_get_avail_bytes(VirtQueue 
*vq,
 vq->shadow_avail_idx = idx;
 vq->shadow_avail_wrap_counter = wrap_counter;
 done:
-address_space_cache_destroy(&indirect_desc_cache);
+if (desc_cache == &indirect_desc_cache) {
+address_space_cache_destroy(&indirect_desc_cache);
+}
 if (in_bytes) {
 *in_bytes = in_total;
 }
@@ -1487,8 +1493,8 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
 {
 unsigned int i, head, max;
 VRingMemoryRegionCaches *caches;
-MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
-MemoryRegionCache *desc_cache;
+MemoryRegionCache indirect_desc_cache;
+MemoryRegionCache *desc_cache = NULL;
 int64_t len;
 VirtIODevice *vdev = vq->vdev;
 VirtQueueElement *elem = NULL;
@@ -1611,7 +1617,9 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
 
 trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
 done:
-address_space_cache_destroy(&indirect_desc_cache);
+if (desc_cache == &indirect_desc_cache) {
+address_space_cache_destroy(&indirect_desc_cache);
+}
 
 return elem;
 
@@ -1624,8 +1632,8 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t 
sz)
 {
 unsigned int i, max;
 VRingMemoryRegionCaches *caches;
-MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
-MemoryRegionCache *desc_cache;
+MemoryRegionCache indirect_desc_cache;
+MemoryRegionCache *desc_cache = NULL;
 int64_t len;
 VirtIODevice *vdev = vq->vdev;
 VirtQueueElement *elem = NULL;
@@ -1746,7 +1754,9 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t 
sz)
 
 trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
 done:
-address_space_cache_destroy(&indirect_desc_cache);
+if (desc_cache == &indirect_desc_cache) {
+address_space_cache_destroy(&indirect_desc_cache);
+}
 
 return elem;
 
@@ -3935,8 +3945,8 @@ VirtioQ

Re: [PATCH] virtio: don't zero out memory region cache for indirect descriptors

2023-08-11 Thread Ilya Maximets
On 8/9/23 04:37, Jason Wang wrote:
> On Tue, Aug 8, 2023 at 6:28 AM Ilya Maximets  wrote:
>>
>> Lots of virtio functions that are on a hot path in data transmission
>> are initializing indirect descriptor cache at the point of stack
>> allocation.  It's a 112 byte structure that is getting zeroed out on
>> each call adding unnecessary overhead.  It's going to be correctly
>> initialized later via special init function.  The only reason to
>> actually initialize right away is the ability to safely destruct it.
>> However, we only need to destruct it when it was used, i.e. when a
>> desc_cache points to it.
>>
>> Removing these unnecessary stack initializations improves throughput
>> of virtio-net devices in terms of 64B packets per second by 6-14 %
>> depending on the case.  Tested with a proposed af-xdp network backend
>> and a dpdk testpmd application in the guest, but should be beneficial
>> for other virtio devices as well.
>>
>> Signed-off-by: Ilya Maximets 
> 
> Acked-by: Jason Wang 
> 
> Btw, we can probably remove MEMORY_REGION_CACHE_INVALID.

Good point.  I can include that in the patch.  Or just replace it
with a function, as Stefan suggested.

Best regards, Ilya Maximets.



Re: [PATCH] virtio: don't zero out memory region cache for indirect descriptors

2023-08-11 Thread Ilya Maximets
On 8/10/23 17:50, Stefan Hajnoczi wrote:
> On Tue, Aug 08, 2023 at 12:28:47AM +0200, Ilya Maximets wrote:
>> Lots of virtio functions that are on a hot path in data transmission
>> are initializing indirect descriptor cache at the point of stack
>> allocation.  It's a 112 byte structure that is getting zeroed out on
>> each call adding unnecessary overhead.  It's going to be correctly
>> initialized later via special init function.  The only reason to
>> actually initialize right away is the ability to safely destruct it.
>> However, we only need to destruct it when it was used, i.e. when a
>> desc_cache points to it.
>>
>> Removing these unnecessary stack initializations improves throughput
>> of virtio-net devices in terms of 64B packets per second by 6-14 %
>> depending on the case.  Tested with a proposed af-xdp network backend
>> and a dpdk testpmd application in the guest, but should be beneficial
>> for other virtio devices as well.
>>
>> Signed-off-by: Ilya Maximets 
>> ---
>>  hw/virtio/virtio.c | 42 +++---
>>  1 file changed, 27 insertions(+), 15 deletions(-)
> 
> Another option is to create an address_space_cache_init_invalid()
> function that only assigns mrs.mr = NULL instead of touching all bytes
> of the struct like = MEMORY_REGION_CACHE_INVALID. There would be less
> code and the existing mrs.mr check in address_space_cache_destroy()
> would serve the same function as the desc_cache == &indirect_desc_cache
> check added by this patch.

It does look simpler this way, indeed.  Though I'm not sure about
a function name.  We have address_space_cache_invalidate() that
does a completely different thing and the invalidated cache can
still be used, while the cache initialized with the newly proposed
address_space_cache_init_invalid() can not be safely used.

I suppose, the problem is not new, since the macro was named similarly,
but making it a function seems to make the issue worse.

Maybe address_space_cache_init_empty() will be a better name?
E.g.:

**
 * address_space_cache_init_empty: Initialize empty #MemoryRegionCache
 *
 * @cache: The #MemoryRegionCache to operate on.
 *
 * Initializes #MemoryRegionCache structure without memory region attached.
 * Cache initialized this way can only be safely destroyed, but not used.
 */
static inline void address_space_cache_init_empty(MemoryRegionCache *cache)
{
cache->mrs.mr = NULL;
}

What do you think?

Best regards, Ilya Maximets.



Re: [PATCH] virtio: don't zero out memory region cache for indirect descriptors

2023-08-11 Thread Ilya Maximets
On 8/11/23 15:58, Stefan Hajnoczi wrote:
> 
> 
> On Fri, Aug 11, 2023, 08:50 Ilya Maximets  wrote:
> 
> On 8/10/23 17:50, Stefan Hajnoczi wrote:
> > On Tue, Aug 08, 2023 at 12:28:47AM +0200, Ilya Maximets wrote:
> >> Lots of virtio functions that are on a hot path in data transmission
> >> are initializing indirect descriptor cache at the point of stack
> >> allocation.  It's a 112 byte structure that is getting zeroed out on
> >> each call adding unnecessary overhead.  It's going to be correctly
> >> initialized later via special init function.  The only reason to
> >> actually initialize right away is the ability to safely destruct it.
> >> However, we only need to destruct it when it was used, i.e. when a
> >> desc_cache points to it.
> >>
> >> Removing these unnecessary stack initializations improves throughput
> >> of virtio-net devices in terms of 64B packets per second by 6-14 %
> >> depending on the case.  Tested with a proposed af-xdp network backend
>     >> and a dpdk testpmd application in the guest, but should be beneficial
> >> for other virtio devices as well.
> >>
> >> Signed-off-by: Ilya Maximets 
> >> ---
> >>  hw/virtio/virtio.c | 42 +++---
> >>  1 file changed, 27 insertions(+), 15 deletions(-)
> >
> > Another option is to create an address_space_cache_init_invalid()
> > function that only assigns mrs.mr = NULL instead of 
> touching all bytes
> > of the struct like = MEMORY_REGION_CACHE_INVALID. There would be less
> > code and the existing mrs.mr check in 
> address_space_cache_destroy()
> > would serve the same function as the desc_cache == &indirect_desc_cache
> > check added by this patch.
> 
> It does look simpler this way, indeed.  Though I'm not sure about
> a function name.  We have address_space_cache_invalidate() that
> does a completely different thing and the invalidated cache can
> still be used, while the cache initialized with the newly proposed
> address_space_cache_init_invalid() can not be safely used.
> 
> I suppose, the problem is not new, since the macro was named similarly,
> but making it a function seems to make the issue worse.
> 
> Maybe address_space_cache_init_empty() will be a better name?
> E.g.:
> 
> **
>  * address_space_cache_init_empty: Initialize empty #MemoryRegionCache
>  *
>  * @cache: The #MemoryRegionCache to operate on.
>  *
>  * Initializes #MemoryRegionCache structure without memory region 
> attached.
>  * Cache initialized this way can only be safely destroyed, but not used.
>  */
> static inline void address_space_cache_init_empty(MemoryRegionCache 
> *cache)
> {
>     cache->mrs.mr = NULL;
> }
> 
> What do you think?
> 
> 
> init_empty() is good.

I'll use it then.  Will send a v2 shortly.

Thanks!

> 
> Stefan
> 




[PATCH v2] virtio: don't zero out memory region cache for indirect descriptors

2023-08-11 Thread Ilya Maximets
Lots of virtio functions that are on a hot path in data transmission
are initializing indirect descriptor cache at the point of stack
allocation.  It's a 112 byte structure that is getting zeroed out on
each call adding unnecessary overhead.  It's going to be correctly
initialized later via special init function.  The only reason to
actually initialize right away is the ability to safely destruct it.
Replacing a designated initializer with a function to only initialize
what is necessary.

Removal of the unnecessary stack initializations improves throughput
of virtio-net devices in terms of 64B packets per second by 6-14 %
depending on the case.  Tested with a proposed af-xdp network backend
and a dpdk testpmd application in the guest, but should be beneficial
for other virtio devices as well.

Signed-off-by: Ilya Maximets 
---

Version 2:

  * Introduced an initialization function, so we don't need to compare
pointers in the end. [Stefan]
  * Removed the now unused macro. [Jason]

 hw/virtio/virtio.c| 20 +++-
 include/exec/memory.h | 16 +---
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 309038fd46..3d768fda5a 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1071,10 +1071,12 @@ static void virtqueue_split_get_avail_bytes(VirtQueue 
*vq,
 VirtIODevice *vdev = vq->vdev;
 unsigned int idx;
 unsigned int total_bufs, in_total, out_total;
-MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
+MemoryRegionCache indirect_desc_cache;
 int64_t len = 0;
 int rc;
 
+address_space_cache_init_empty(&indirect_desc_cache);
+
 idx = vq->last_avail_idx;
 total_bufs = in_total = out_total = 0;
 
@@ -1207,12 +1209,14 @@ static void virtqueue_packed_get_avail_bytes(VirtQueue 
*vq,
 VirtIODevice *vdev = vq->vdev;
 unsigned int idx;
 unsigned int total_bufs, in_total, out_total;
+MemoryRegionCache indirect_desc_cache;
 MemoryRegionCache *desc_cache;
-MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
 int64_t len = 0;
 VRingPackedDesc desc;
 bool wrap_counter;
 
+address_space_cache_init_empty(&indirect_desc_cache);
+
 idx = vq->last_avail_idx;
 wrap_counter = vq->last_avail_wrap_counter;
 total_bufs = in_total = out_total = 0;
@@ -1487,7 +1491,7 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
 {
 unsigned int i, head, max;
 VRingMemoryRegionCaches *caches;
-MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
+MemoryRegionCache indirect_desc_cache;
 MemoryRegionCache *desc_cache;
 int64_t len;
 VirtIODevice *vdev = vq->vdev;
@@ -1498,6 +1502,8 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
 VRingDesc desc;
 int rc;
 
+address_space_cache_init_empty(&indirect_desc_cache);
+
 RCU_READ_LOCK_GUARD();
 if (virtio_queue_empty_rcu(vq)) {
 goto done;
@@ -1624,7 +1630,7 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t 
sz)
 {
 unsigned int i, max;
 VRingMemoryRegionCaches *caches;
-MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
+MemoryRegionCache indirect_desc_cache;
 MemoryRegionCache *desc_cache;
 int64_t len;
 VirtIODevice *vdev = vq->vdev;
@@ -1636,6 +1642,8 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t 
sz)
 uint16_t id;
 int rc;
 
+address_space_cache_init_empty(&indirect_desc_cache);
+
 RCU_READ_LOCK_GUARD();
 if (virtio_queue_packed_empty_rcu(vq)) {
 goto done;
@@ -3935,13 +3943,15 @@ VirtioQueueElement 
*qmp_x_query_virtio_queue_element(const char *path,
 } else {
 unsigned int head, i, max;
 VRingMemoryRegionCaches *caches;
-MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
+MemoryRegionCache indirect_desc_cache;
 MemoryRegionCache *desc_cache;
 VRingDesc desc;
 VirtioRingDescList *list = NULL;
 VirtioRingDescList *node;
 int rc; int ndescs;
 
+address_space_cache_init_empty(&indirect_desc_cache);
+
 RCU_READ_LOCK_GUARD();
 
 max = vq->vring.num;
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 68284428f8..b1c4329d11 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -2664,9 +2664,6 @@ struct MemoryRegionCache {
 bool is_write;
 };
 
-#define MEMORY_REGION_CACHE_INVALID ((MemoryRegionCache) { .mrs.mr = NULL })
-
-
 /* address_space_ld*_cached: load from a cached #MemoryRegion
  * address_space_st*_cached: store into a cached #MemoryRegion
  *
@@ -2755,6 +2752,19 @@ int64_t address_space_cache_init(MemoryRegionCache 
*cache,
  hwaddr len,
  bool is_write);
 
+/**
+ * address_space_cache_init_empty: Initialize empty #

Re: [PATCH 1/2] virtio: use blk_io_plug_call() in virtio_irqfd_notify()

2023-08-16 Thread Ilya Maximets
On 8/15/23 14:08, Stefan Hajnoczi wrote:
> virtio-blk and virtio-scsi invoke virtio_irqfd_notify() to send Used
> Buffer Notifications from an IOThread. This involves an eventfd
> write(2) syscall. Calling this repeatedly when completing multiple I/O
> requests in a row is wasteful.

Hi, Stefan.  This is an interesting change!

There is more or less exactly the same problem with fast network backends
and I was playing around with similar ideas in this area while working on
the af-xdp network backend recently.  Primarily, implementing the Rx BH
for the virtio-net device and locking the RCU before passing packets from
the backend to the device one by one.

> 
> Use the blk_io_plug_call() API to batch together virtio_irqfd_notify()
> calls made during Linux AIO (aio=native) or io_uring (aio=io_uring)
> completion processing. Do not modify the thread pool (aio=threads) to
> avoid introducing a dependency from util/ onto the block layer.

But you're introducing a dependency from generic virtio code onto the
block layer in this patch.  This seems to break the module abstraction.

It looks like there are 2 options to resolve the semantics issue here:

1. Move virtio_notify_irqfd() from virtio.c down to the block layer.
   The block layer is the only user, so that may be justified, but it
   doesn't seem like a particularly good solution.  (I'm actually not
   sure why block devices are the only ones using this function...)

2. Move and rename the block/plug library somewhere generic.  The plug
   library doesn't seem to have any dependencies on the block layer, other
   than the name, so it should not be hard to generalize it (well, the
   naming might be hard).

In general, the plug library seems to do what is typically implemented
via RCU frameworks - delayed function calls.  The only differences are
that RCU doesn't check for duplicates and the callbacks are global.  It
should not be hard to add some new functionality to the RCU framework to
address these, e.g. rcu_call_local() for calls that should be executed
once the current thread exits its own critical section.
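
For illustration, here is a minimal sketch of what such a per-thread
deferred call could look like (the name rcu_call_local() and the flush
hook are hypothetical, not existing QEMU API, and the duplicate
suppression done by the plug library is omitted):

typedef struct DeferredCall {
    void (*func)(void *opaque);
    void *opaque;
    struct DeferredCall *next;
} DeferredCall;

/* Per-thread list of calls to run when the thread leaves its
 * outermost critical section. */
static __thread DeferredCall *deferred_calls;

/* Queue 'func(opaque)' for deferred execution; 'dc' is caller-provided
 * storage, so no allocation is needed on the hot path. */
static void rcu_call_local(DeferredCall *dc, void (*func)(void *), void *opaque)
{
    dc->func = func;
    dc->opaque = opaque;
    dc->next = deferred_calls;
    deferred_calls = dc;
}

/* Would be invoked from the unlock path of the outermost critical
 * section, i.e. when the read depth drops back to zero. */
static void flush_deferred_calls(void)
{
    while (deferred_calls) {
        DeferredCall *dc = deferred_calls;

        deferred_calls = dc->next;
        dc->func(dc->opaque);
    }
}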

Using RCU for non-RCU-protected things might be considered as an abuse.
However, we might solve two issues in one shot if instead of entering
blk_io_plug/unplug section we will enter an RCU critical section and
call callbacks at the exit.  The first issue is the notification batching
that this patch is trying to fix, the second is an excessive number of
thread fences on RCU exits every time virtio_notify_irqfd() and other
virtio functions are invoked.  The second issue can be avoided by using
RCU_READ_LOCK_GUARD() in completion functions.  Not sure if that will
improve performance, but it definitely removes a lot of noise from the
perf top for network backends.  This makes the code a bit less explicit,
though; the lock guard will definitely need a comment.  Then again, the
reason for the blk_io_plug() calls is not fully clear from the module code
alone either.
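
To make the idea a bit more concrete, something along these lines
(an untested sketch; rcu_call_local() is hypothetical and the completion
loop is heavily simplified):

    #include "qemu/osdep.h"
    #include "qemu/rcu.h"
    #include "qemu/event_notifier.h"

    /* Hypothetical API, just to illustrate; it does not exist today. */
    void rcu_call_local(void (*func)(void *opaque), void *opaque);

    static void notify_deferred(void *opaque)
    {
        event_notifier_set(opaque);     /* the actual eventfd write */
    }

    static void process_completions(EventNotifier **notifiers, int n)
    {
        /* Would replace the blk_io_plug()/blk_io_unplug() pair and also
         * batch the RCU-exit fences for the whole completion run. */
        RCU_READ_LOCK_GUARD();

        for (int i = 0; i < n; i++) {
            /* Deduplicated; fires once when this thread leaves its
             * outermost read-side critical section. */
            rcu_call_local(notify_deferred, notifiers[i]);
        }
    }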

I'm not sure what the best way forward is.  I'm trying to figure that out
myself, since a similar solution will in the end be needed for
network backends and so it might be better to have something more generic.
What do you think?

> 
> Behavior is unchanged for emulated devices that do not use blk_io_plug()
> since blk_io_plug_call() immediately invokes the callback when called
> outside a blk_io_plug()/blk_io_unplug() region.
> 
> fio rw=randread bs=4k iodepth=64 numjobs=8 IOPS increases by ~9% with a
> single IOThread and 8 vCPUs. iodepth=1 decreases by ~1% but this could
> be noise. Detailed performance data and configuration specifics are
> available here:
> https://gitlab.com/stefanha/virt-playbooks/-/tree/blk_io_plug-irqfd
> 
> This duplicates the BH that virtio-blk uses for batching. The next
> commit will remove it.

I'm likely missing something, but could you explain why it is safe to
batch unconditionally here?  The current BH code, as you mentioned in
the second patch, only batches if EVENT_IDX is not set.
Maybe worth adding a few words in the commit message for people like
me, who are a bit rusty on QEMU/virtio internals. :)

Best regards, Ilya Maximets.

> 
> Signed-off-by: Stefan Hajnoczi 
> ---
>  block/io_uring.c   |  6 ++
>  block/linux-aio.c  |  4 
>  hw/virtio/virtio.c | 10 +-
>  3 files changed, 19 insertions(+), 1 deletion(-)
> 
> diff --git a/block/io_uring.c b/block/io_uring.c
> index 69d9820928..749cf83934 100644
> --- a/block/io_uring.c
> +++ b/block/io_uring.c
> @@ -124,6 +124,9 @@ static void luring_process_completions(LuringState *s)
>  {
>  struct io_uring_cqe *cqes;
>  int total_bytes;
> +
> +blk_io_plug();
> +
>  /*
>   * Request completion callbacks can run the nested event loop.
>   * Schedule ourselves so the nested event loop will "see&

Re: [PATCH 1/2] virtio: use blk_io_plug_call() in virtio_irqfd_notify()

2023-08-16 Thread Ilya Maximets
On 8/16/23 17:30, Stefan Hajnoczi wrote:
> On Wed, Aug 16, 2023 at 03:36:32PM +0200, Ilya Maximets wrote:
>> On 8/15/23 14:08, Stefan Hajnoczi wrote:
>>> virtio-blk and virtio-scsi invoke virtio_irqfd_notify() to send Used
>>> Buffer Notifications from an IOThread. This involves an eventfd
>>> write(2) syscall. Calling this repeatedly when completing multiple I/O
>>> requests in a row is wasteful.
>>
>> Hi, Stefan.  This is an interesting change!
>>
>> There is more or less exactly the same problem with fast network backends
>> and I was playing around with similar ideas in this area while working on
>> af-xdp network backend recently.  Primarily, implementation of the Rx BH
>> for virtio-net device and locking the RCU before passing packets from the
>> backend to the device one by one.
>>
>>>
>>> Use the blk_io_plug_call() API to batch together virtio_irqfd_notify()
>>> calls made during Linux AIO (aio=native) or io_uring (aio=io_uring)
>>> completion processing. Do not modify the thread pool (aio=threads) to
>>> avoid introducing a dependency from util/ onto the block layer.
>>
>> But you're introducing a dependency from generic virtio code onto the
>> block layer in this patch.  This seem to break the module abstraction.
>>
>> It looks like there are 2 options to resolve the semantics issue here:
> 
> Yes, it's a layering violation.
> 
>>
>> 1. Move virtio_notify_irqfd() from virtio.c down to the block layer.
>>Block layer is the only user, so that may be justified, but it
>>doesn't seem like a particularly good solution.  (I'm actually not
>>sure why block devices are the only ones using this function...)
> 
> Yes, this is the easiest way to avoid the layering violation for now.
> 
> The virtio_notify_irqfd() API is necessary when running in an IOThread
> because the normal QEMU irq API must run under the Big QEMU Lock. Block
> devices are the only ones that raise interrupts from IOThreads at the
> moment.

Ack.  Thanks for the explanation!

> 
>>
>> 2. Move and rename the block/plug library somewhere generic.  The plug
>>library seems to not have any dependencies on a block layer, other
>>than a name, so it should not be hard to generalize it (well, the
>>naming might be hard).
> 
> Yes, it should be possible to make it generic quite easily. I will give
> this a try in the next version of the patch.

OK.  Sounds good to me.

> 
>> In general, while looking at the plug library, it seems to do what is
>> typically implemented via RCU frameworks - the delayed function call.
>> The only difference is that RCU doesn't check for duplicates and the
>> callbacks are global.  Should not be hard to add some new functionality
>> to RCU framework in order to address these, e.g. rcu_call_local() for
>> calls that should be executed once the current thread exits its own
>> critical section.
> 
> This rcu_call_local() idea is unrelated to Read Copy Update, so I don't
> think it should be part of the RCU API.

Agreed.

> Another deferred function call mechanism is QEMUBH. It already supports
> coalescing. However, BHs are invoked once per AioContext event loop
> iteration and there is no way invoke the BH earlier. Also the BH pointer
> needs to be passed to every function that wishes to schedule a deferred
> call, which can be tedious (e.g. block/linux-aio.c should defer the
> io_submit(2) syscall until the end of virtio-blk request processing -
> there are a lot of layers of code between those two components).
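
Right.  And just to spell out the BH coalescing pattern you mean (a rough
sketch, not from any particular file; 'notify_cb' is a placeholder):

    #include "qemu/osdep.h"
    #include "block/aio.h"

    static void notify_cb(void *opaque)
    {
        /* Runs once per event loop iteration, no matter how many times
         * the BH was scheduled below. */
    }

    static void complete_one_request(QEMUBH *notify_bh)
    {
        /* ... per-request completion work ... */

        /* Re-scheduling an already scheduled BH is a no-op, so repeated
         * completions within one iteration coalesce into a single call. */
        qemu_bh_schedule(notify_bh);
    }

    /* Setup, e.g. at device init time:
     *     QEMUBH *notify_bh = aio_bh_new(ctx, notify_cb, dev);       */
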
> 
>>
>> Using RCU for non-RCU-protected things might be considered as an abuse.
>> However, we might solve two issues in one shot if instead of entering
>> blk_io_plug/unplug section we will enter an RCU critical section and
>> call callbacks at the exit.  The first issue is the notification batching
>> that this patch is trying to fix, the second is an excessive number of
>> thread fences on RCU exits every time virtio_notify_irqfd() and other
>> virtio functions are invoked.  The second issue can be avoided by using
>> RCU_READ_LOCK_GUARD() in completion functions.  Not sure if that will
>> improve performance, but it definitely removes a lot of noise from the
>> perf top for network backends.  This makes the code a bit less explicit
>> though, the lock guard will definitely need a comment.  Though, the reason
>> for blk_io_plug() calls is not fully clear for a module code alone either.
> 
> util/aio-posix.c:run_poll_handlers() has a top-level
> RCU_READ_LOCK_GUARD() for this reason.

Nice, didn't know that.

> May

Re: [PATCH v2 3/4] virtio: use defer_call() in virtio_irqfd_notify()

2023-08-21 Thread Ilya Maximets
On 8/17/23 17:58, Stefan Hajnoczi wrote:
> virtio-blk and virtio-scsi invoke virtio_irqfd_notify() to send Used
> Buffer Notifications from an IOThread. This involves an eventfd
> write(2) syscall. Calling this repeatedly when completing multiple I/O
> requests in a row is wasteful.
> 
> Use the defer_call() API to batch together virtio_irqfd_notify() calls
> made during thread pool (aio=threads), Linux AIO (aio=native), and
> io_uring (aio=io_uring) completion processing.
> 
> Behavior is unchanged for emulated devices that do not use
> defer_call_begin()/defer_call_end() since defer_call() immediately
> invokes the callback when called outside a
> defer_call_begin()/defer_call_end() region.
> 
> fio rw=randread bs=4k iodepth=64 numjobs=8 IOPS increases by ~9% with a
> single IOThread and 8 vCPUs. iodepth=1 decreases by ~1% but this could
> be noise. Detailed performance data and configuration specifics are
> available here:
> https://gitlab.com/stefanha/virt-playbooks/-/tree/blk_io_plug-irqfd
> 
> This duplicates the BH that virtio-blk uses for batching. The next
> commit will remove it.
> 
> Signed-off-by: Stefan Hajnoczi 
> ---
>  block/io_uring.c   |  6 ++
>  block/linux-aio.c  |  4 
>  hw/virtio/virtio.c | 11 ++-
>  util/thread-pool.c |  5 +
>  4 files changed, 25 insertions(+), 1 deletion(-)
> 
> diff --git a/block/io_uring.c b/block/io_uring.c
> index 3a1e1f45b3..7cdd00e9f1 100644
> --- a/block/io_uring.c
> +++ b/block/io_uring.c
> @@ -125,6 +125,9 @@ static void luring_process_completions(LuringState *s)
>  {
>  struct io_uring_cqe *cqes;
>  int total_bytes;
> +
> +defer_call_begin();
> +
>  /*
>   * Request completion callbacks can run the nested event loop.
>   * Schedule ourselves so the nested event loop will "see" remaining
> @@ -217,7 +220,10 @@ end:
>  aio_co_wake(luringcb->co);
>  }
>  }
> +
>  qemu_bh_cancel(s->completion_bh);
> +
> +defer_call_end();
>  }
>  
>  static int ioq_submit(LuringState *s)
> diff --git a/block/linux-aio.c b/block/linux-aio.c
> index 62380593c8..ab607ade6a 100644
> --- a/block/linux-aio.c
> +++ b/block/linux-aio.c
> @@ -205,6 +205,8 @@ static void qemu_laio_process_completions(LinuxAioState 
> *s)
>  {
>  struct io_event *events;
>  
> +defer_call_begin();
> +
>  /* Reschedule so nested event loops see currently pending completions */
>  qemu_bh_schedule(s->completion_bh);
>  
> @@ -231,6 +233,8 @@ static void qemu_laio_process_completions(LinuxAioState 
> *s)
>   * own `for` loop.  If we are the last all counters droped to zero. */
>  s->event_max = 0;
>  s->event_idx = 0;
> +
> +defer_call_end();
>  }
>  
>  static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 309038fd46..5eb1f91b41 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -15,6 +15,7 @@
>  #include "qapi/error.h"
>  #include "qapi/qapi-commands-virtio.h"
>  #include "trace.h"
> +#include "qemu/defer-call.h"
>  #include "qemu/error-report.h"
>  #include "qemu/log.h"
>  #include "qemu/main-loop.h"
> @@ -28,6 +29,7 @@
>  #include "hw/virtio/virtio-bus.h"
>  #include "hw/qdev-properties.h"
>  #include "hw/virtio/virtio-access.h"
> +#include "sysemu/block-backend.h"

An artifact from the previous version.

>  #include "sysemu/dma.h"
>  #include "sysemu/runstate.h"
>  #include "virtio-qmp.h"
> @@ -2426,6 +2428,13 @@ static bool virtio_should_notify(VirtIODevice *vdev, 
> VirtQueue *vq)
>  }
>  }
>  
> +/* Batch irqs while inside a defer_call_begin()/defer_call_end() section */
> +static void virtio_notify_irqfd_deferred_fn(void *opaque)
> +{
> +EventNotifier *notifier = opaque;
> +event_notifier_set(notifier);
> +}
> +
>  void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq)
>  {
>  WITH_RCU_READ_LOCK_GUARD() {
> @@ -2452,7 +2461,7 @@ void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue 
> *vq)
>   * to an atomic operation.
>   */
>  virtio_set_isr(vq->vdev, 0x1);
> -event_notifier_set(&vq->guest_notifier);
> +defer_call(virtio_notify_irqfd_deferred_fn, &vq->guest_notifier);

Should we move the trace from this function to the deferred one?
Or maybe add a new trace?
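
E.g. something along these lines (the trace point name is made up):

    static void virtio_notify_irqfd_deferred_fn(void *opaque)
    {
        EventNotifier *notifier = opaque;

        /* Hypothetical new trace point, mirroring trace_virtio_notify_irqfd(). */
        trace_virtio_notify_irqfd_deferred(notifier);
        event_notifier_set(notifier);
    }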

>  }
>  
>  static void virtio_irq(VirtQueue *vq)
> diff --git a/util/thread-pool.c b/util/thread-pool.c
> index e3d8292d14..d84961779a 100644
> --- a/util/thread-pool.c
> +++ b/util/thread-pool.c
> @@ -15,6 +15,7 @@
>   * GNU GPL, version 2 or (at your option) any later version.
>   */
>  #include "qemu/osdep.h"
> +#include "qemu/defer-call.h"
>  #include "qemu/queue.h"
>  #include "qemu/thread.h"
>  #include "qemu/coroutine.h"
> @@ -175,6 +176,8 @@ static void thread_pool_completion_bh(void *opaque)
>  ThreadPool *pool = opaque;
>  ThreadPoolElement *elem, *next;
>  
> +defer_call_begin(); /* cb() may use defer_call() to coalesce work */
> +
>  restart:
>  QLIST_FOREACH_SAFE(elem, &pool

[PATCH] memory: initialize 'fv' in MemoryRegionCache to make Coverity happy

2023-10-09 Thread Ilya Maximets
Coverity scan reports multiple false-positive "defects" for the
following series of actions in virtio.c:

  MemoryRegionCache indirect_desc_cache;
  address_space_cache_init_empty(&indirect_desc_cache);
  address_space_cache_destroy(&indirect_desc_cache);

For some reason it's unable to recognize the dependency between 'mrs.mr'
and 'fv' and insists that the '!mrs.mr' check in address_space_cache_destroy
may take a 'false' branch, even though 'mrs.mr' is explicitly initialized to
NULL in address_space_cache_init_empty():

  *** CID 1522371:  Memory - illegal accesses  (UNINIT)
  /qemu/hw/virtio/virtio.c: 1627 in virtqueue_split_pop()
  1621 }
  1622
  1623 vq->inuse++;
  1624
  1625 trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
  1626 done:
  >>> CID 1522371:  Memory - illegal accesses  (UNINIT)
  >>> Using uninitialized value "indirect_desc_cache.fv" when
  >>> calling "address_space_cache_destroy".
  1627 address_space_cache_destroy(&indirect_desc_cache);
  1628
  1629 return elem;
  1630
  1631 err_undo_map:
  1632 virtqueue_undo_map_desc(out_num, in_num, iov);

  ** CID 1522370:  Memory - illegal accesses  (UNINIT)

Instead of trying to silence these false-positive reports in 4
different places, initialize 'fv' as well, as this doesn't result
in any noticeable performance impact.

Signed-off-by: Ilya Maximets 
---
 include/exec/memory.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index c99842d2fc..1ce80c4e82 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -2770,6 +2770,8 @@ int64_t address_space_cache_init(MemoryRegionCache *cache,
 static inline void address_space_cache_init_empty(MemoryRegionCache *cache)
 {
 cache->mrs.mr = NULL;
+/* There is no real need to initialize fv, but it makes Coverity happy. */
+cache->fv = NULL;
 }
 
 /**
-- 
2.41.0




[PATCH v2] net: add initial support for AF_XDP network backend

2023-07-05 Thread Ilya Maximets
o not
support any kinds of checksum or segmentation offloading.  Buffers
are limited to a page size (4K), i.e. MTU is limited.  Multi-buffer
support implementation for AF_XDP is in progress, but not ready yet.
Also, transmission in all non-zero-copy modes is synchronous, i.e.
done in a syscall.  That doesn't allow high packet rates on virtual
interfaces.

However, keeping in mind all of these challenges, the current implementation
of the AF_XDP backend shows decent performance while running on top
of a physical NIC with zero-copy support.

Test setup:

2 VMs running on 2 physical hosts connected via ConnectX6-Dx card.
Network backend is configured to open the NIC directly in native mode.
The driver supports zero-copy.  NIC is configured to use 1 queue.

Inside a VM - iperf3 for basic TCP performance testing and dpdk-testpmd
for PPS testing.

iperf3 result:
 TCP stream  : 19.1 Gbps

dpdk-testpmd (single queue, single CPU core, 64 B packets) results:
 Tx only : 3.4 Mpps
 Rx only : 2.0 Mpps
 L2 FWD Loopback : 1.5 Mpps

In skb mode the same setup shows much lower performance, similar to
the setup where the pair of physical NICs is replaced with a veth pair:

iperf3 result:
  TCP stream  : 9 Gbps

dpdk-testpmd (single queue, single CPU core, 64 B packets) results:
  Tx only : 1.2 Mpps
  Rx only : 1.0 Mpps
  L2 FWD Loopback : 0.7 Mpps

Results in skb mode or over the veth are close to results of a tap
backend with vhost=on and disabled segmentation offloading bridged
with a NIC.

Signed-off-by: Ilya Maximets 
---

Version 2:

  - Added support for running with no capabilities by passing
pre-created AF_XDP socket file descriptors via the 'sock-fds' option.
Conditionally compiled because it requires unreleased libxdp 1.4.0.
The last restriction is having 32 MB of RLIMIT_MEMLOCK per queue.

  - Refined and extended documentation.


 MAINTAINERS   |   4 +
 hmp-commands.hx   |   2 +-
 meson.build   |  19 +
 meson_options.txt |   2 +
 net/af-xdp.c  | 570 ++
 net/clients.h |   5 +
 net/meson.build   |   3 +
 net/net.c |   6 +
 qapi/net.json |  60 +-
 qemu-options.hx   |  83 ++-
 .../ci/org.centos/stream/8/x86_64/configure   |   1 +
 scripts/meson-buildoptions.sh |   3 +
 tests/docker/dockerfiles/debian-amd64.docker  |   1 +
 13 files changed, 756 insertions(+), 3 deletions(-)
 create mode 100644 net/af-xdp.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 7164cf55a1..80d4ba4004 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2929,6 +2929,10 @@ W: http://info.iet.unipi.it/~luigi/netmap/
 S: Maintained
 F: net/netmap.c
 
+AF_XDP network backend
+R: Ilya Maximets 
+F: net/af-xdp.c
+
 Host Memory Backends
 M: David Hildenbrand 
 M: Igor Mammedov 
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 2cbd0f77a0..af9ffe4681 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1295,7 +1295,7 @@ ERST
 {
 .name   = "netdev_add",
 .args_type  = "netdev:O",
-.params = 
"[user|tap|socket|stream|dgram|vde|bridge|hubport|netmap|vhost-user"
+.params = 
"[user|tap|socket|stream|dgram|vde|bridge|hubport|netmap|af-xdp|vhost-user"
 #ifdef CONFIG_VMNET
   "|vmnet-host|vmnet-shared|vmnet-bridged"
 #endif
diff --git a/meson.build b/meson.build
index a9ba0bfab3..1f8772ea5d 100644
--- a/meson.build
+++ b/meson.build
@@ -1891,6 +1891,18 @@ if libbpf.found() and not cc.links('''
   endif
 endif
 
+# libxdp
+libxdp = dependency('libxdp', required: get_option('af_xdp'), method: 
'pkg-config')
+if libxdp.found() and \
+  not (libbpf.found() and libbpf.version().version_compare('>=0.7'))
+  libxdp = not_found
+  if get_option('af_xdp').enabled()
+error('af-xdp support requires libbpf version >= 0.7')
+  else
+warning('af-xdp support requires libbpf version >= 0.7, disabling')
+  endif
+endif
+
 # libdw
 libdw = not_found
 if not get_option('libdw').auto() or \
@@ -2112,6 +2124,12 @@ config_host_data.set('CONFIG_HEXAGON_IDEF_PARSER', 
get_option('hexagon_idef_pars
 config_host_data.set('CONFIG_LIBATTR', have_old_libattr)
 config_host_data.set('CONFIG_LIBCAP_NG', libcap_ng.found())
 config_host_data.set('CONFIG_EBPF', libbpf.found())
+config_host_data.set('CONFIG_AF_XDP', libxdp.found())
+if libxdp.found()
+  config_host_data.set('HAVE_XSK_UMEM__CREATE_WITH_FD',
+   cc.has_function('xsk_umem__create_with_fd',
+ 

Re: [PATCH] net: add initial support for AF_XDP network backend

2023-07-07 Thread Ilya Maximets
On 7/7/23 03:43, Jason Wang wrote:
> On Fri, Jul 7, 2023 at 3:08 AM Stefan Hajnoczi  wrote:
>>
>> On Wed, 5 Jul 2023 at 02:02, Jason Wang  wrote:
>>>
>>> On Mon, Jul 3, 2023 at 5:03 PM Stefan Hajnoczi  wrote:
>>>>
>>>> On Fri, 30 Jun 2023 at 09:41, Jason Wang  wrote:
>>>>>
>>>>> On Thu, Jun 29, 2023 at 8:36 PM Stefan Hajnoczi  
>>>>> wrote:
>>>>>>
>>>>>> On Thu, 29 Jun 2023 at 07:26, Jason Wang  wrote:
>>>>>>>
>>>>>>> On Wed, Jun 28, 2023 at 4:25 PM Stefan Hajnoczi  
>>>>>>> wrote:
>>>>>>>>
>>>>>>>> On Wed, 28 Jun 2023 at 10:19, Jason Wang  wrote:
>>>>>>>>>
>>>>>>>>> On Wed, Jun 28, 2023 at 4:15 PM Stefan Hajnoczi  
>>>>>>>>> wrote:
>>>>>>>>>>
>>>>>>>>>> On Wed, 28 Jun 2023 at 09:59, Jason Wang  wrote:
>>>>>>>>>>>
>>>>>>>>>>> On Wed, Jun 28, 2023 at 3:46 PM Stefan Hajnoczi 
>>>>>>>>>>>  wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> On Wed, 28 Jun 2023 at 05:28, Jason Wang  
>>>>>>>>>>>> wrote:
>>>>>>>>>>>>>
>>>>>>>>>>>>> On Wed, Jun 28, 2023 at 6:45 AM Ilya Maximets 
>>>>>>>>>>>>>  wrote:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> On 6/27/23 04:54, Jason Wang wrote:
>>>>>>>>>>>>>>> On Mon, Jun 26, 2023 at 9:17 PM Ilya Maximets 
>>>>>>>>>>>>>>>  wrote:
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> On 6/26/23 08:32, Jason Wang wrote:
>>>>>>>>>>>>>>>>> On Sun, Jun 25, 2023 at 3:06 PM Jason Wang 
>>>>>>>>>>>>>>>>>  wrote:
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> On Fri, Jun 23, 2023 at 5:58 AM Ilya Maximets 
>>>>>>>>>>>>>>>>>>  wrote:
>>>>>>>>>>>>>>>> It is noticeably more performant than a tap with vhost=on in 
>>>>>>>>>>>>>>>> terms of PPS.
>>>>>>>>>>>>>>>> So, that might be one case.  Taking into account that just rcu 
>>>>>>>>>>>>>>>> lock and
>>>>>>>>>>>>>>>> unlock in virtio-net code takes more time than a packet copy, 
>>>>>>>>>>>>>>>> some batching
>>>>>>>>>>>>>>>> on QEMU side should improve performance significantly.  And it 
>>>>>>>>>>>>>>>> shouldn't be
>>>>>>>>>>>>>>>> too hard to implement.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Performance over virtual interfaces may potentially be 
>>>>>>>>>>>>>>>> improved by creating
>>>>>>>>>>>>>>>> a kernel thread for async Tx.  Similarly to what io_uring 
>>>>>>>>>>>>>>>> allows.  Currently
>>>>>>>>>>>>>>>> Tx on non-zero-copy interfaces is synchronous, and that 
>>>>>>>>>>>>>>>> doesn't allow to
>>>>>>>>>>>>>>>> scale well.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Interestingly, actually, there are a lot of "duplication" 
>>>>>>>>>>>>>>> between
>>>>>>>>>>>>>>> io_uring and AF_XDP:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> 1) both have similar memory model (user register)
>>>>>>>>>>>>>>> 2) both use ring for communication
>>>>>>>>>>&g

Re: [PATCH] net: add initial support for AF_XDP network backend

2023-07-10 Thread Ilya Maximets
On 7/10/23 05:51, Jason Wang wrote:
> On Fri, Jul 7, 2023 at 7:21 PM Ilya Maximets  wrote:
>>
>> On 7/7/23 03:43, Jason Wang wrote:
>>> On Fri, Jul 7, 2023 at 3:08 AM Stefan Hajnoczi  wrote:
>>>>
>>>> On Wed, 5 Jul 2023 at 02:02, Jason Wang  wrote:
>>>>>
>>>>> On Mon, Jul 3, 2023 at 5:03 PM Stefan Hajnoczi  wrote:
>>>>>>
>>>>>> On Fri, 30 Jun 2023 at 09:41, Jason Wang  wrote:
>>>>>>>
>>>>>>> On Thu, Jun 29, 2023 at 8:36 PM Stefan Hajnoczi  
>>>>>>> wrote:
>>>>>>>>
>>>>>>>> On Thu, 29 Jun 2023 at 07:26, Jason Wang  wrote:
>>>>>>>>>
>>>>>>>>> On Wed, Jun 28, 2023 at 4:25 PM Stefan Hajnoczi  
>>>>>>>>> wrote:
>>>>>>>>>>
>>>>>>>>>> On Wed, 28 Jun 2023 at 10:19, Jason Wang  wrote:
>>>>>>>>>>>
>>>>>>>>>>> On Wed, Jun 28, 2023 at 4:15 PM Stefan Hajnoczi 
>>>>>>>>>>>  wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> On Wed, 28 Jun 2023 at 09:59, Jason Wang  
>>>>>>>>>>>> wrote:
>>>>>>>>>>>>>
>>>>>>>>>>>>> On Wed, Jun 28, 2023 at 3:46 PM Stefan Hajnoczi 
>>>>>>>>>>>>>  wrote:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> On Wed, 28 Jun 2023 at 05:28, Jason Wang  
>>>>>>>>>>>>>> wrote:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> On Wed, Jun 28, 2023 at 6:45 AM Ilya Maximets 
>>>>>>>>>>>>>>>  wrote:
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> On 6/27/23 04:54, Jason Wang wrote:
>>>>>>>>>>>>>>>>> On Mon, Jun 26, 2023 at 9:17 PM Ilya Maximets 
>>>>>>>>>>>>>>>>>  wrote:
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> On 6/26/23 08:32, Jason Wang wrote:
>>>>>>>>>>>>>>>>>>> On Sun, Jun 25, 2023 at 3:06 PM Jason Wang 
>>>>>>>>>>>>>>>>>>>  wrote:
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> On Fri, Jun 23, 2023 at 5:58 AM Ilya Maximets 
>>>>>>>>>>>>>>>>>>>>  wrote:
>>>>>>>>>>>>>>>>>> It is noticeably more performant than a tap with vhost=on in 
>>>>>>>>>>>>>>>>>> terms of PPS.
>>>>>>>>>>>>>>>>>> So, that might be one case.  Taking into account that just 
>>>>>>>>>>>>>>>>>> rcu lock and
>>>>>>>>>>>>>>>>>> unlock in virtio-net code takes more time than a packet 
>>>>>>>>>>>>>>>>>> copy, some batching
>>>>>>>>>>>>>>>>>> on QEMU side should improve performance significantly.  And 
>>>>>>>>>>>>>>>>>> it shouldn't be
>>>>>>>>>>>>>>>>>> too hard to implement.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Performance over virtual interfaces may potentially be 
>>>>>>>>>>>>>>>>>> improved by creating
>>>>>>>>>>>>>>>>>> a kernel thread for async Tx.  Similarly to what io_uring 
>>>>>>>>>>>>>>>>>> allows.  Currently
>>>>>>>>>>>>>>>>>> Tx on non-zero-copy interfaces is synchronous, and that 
>>>>>>>>>>>>>>>>>> doesn't allow to
>>>>>>>>>>>>>>>>>> scale well.
>>>>>

Re: [PATCH] net: add initial support for AF_XDP network backend

2023-06-26 Thread Ilya Maximets
On 6/26/23 08:32, Jason Wang wrote:
> On Sun, Jun 25, 2023 at 3:06 PM Jason Wang  wrote:
>>
>> On Fri, Jun 23, 2023 at 5:58 AM Ilya Maximets  wrote:
>>>
>>> AF_XDP is a network socket family that allows communication directly
>>> with the network device driver in the kernel, bypassing most or all
>>> of the kernel networking stack.  In the essence, the technology is
>>> pretty similar to netmap.  But, unlike netmap, AF_XDP is Linux-native
>>> and works with any network interfaces without driver modifications.
>>> Unlike vhost-based backends (kernel, user, vdpa), AF_XDP doesn't
>>> require access to character devices or unix sockets.  Only access to
>>> the network interface itself is necessary.
>>>
>>> This patch implements a network backend that communicates with the
>>> kernel by creating an AF_XDP socket.  A chunk of userspace memory
>>> is shared between QEMU and the host kernel.  4 ring buffers (Tx, Rx,
>>> Fill and Completion) are placed in that memory along with a pool of
>>> memory buffers for the packet data.  Data transmission is done by
>>> allocating one of the buffers, copying packet data into it and
>>> placing the pointer into Tx ring.  After transmission, device will
>>> return the buffer via Completion ring.  On Rx, device will take
>>> a buffer form a pre-populated Fill ring, write the packet data into
>>> it and place the buffer into Rx ring.
>>>
>>> AF_XDP network backend takes on the communication with the host
>>> kernel and the network interface and forwards packets to/from the
>>> peer device in QEMU.
>>>
>>> Usage example:
>>>
>>>   -device virtio-net-pci,netdev=guest1,mac=00:16:35:AF:AA:5C
>>>   -netdev af-xdp,ifname=ens6f1np1,id=guest1,mode=native,queues=1
>>>
>>> XDP program bridges the socket with a network interface.  It can be
>>> attached to the interface in 2 different modes:
>>>
>>> 1. skb - this mode should work for any interface and doesn't require
>>>  driver support.  With a caveat of lower performance.
>>>
>>> 2. native - this does require support from the driver and allows to
>>> bypass skb allocation in the kernel and potentially use
>>> zero-copy while getting packets in/out userspace.
>>>
>>> By default, QEMU will try to use native mode and fall back to skb.
>>> Mode can be forced via 'mode' option.  To force 'copy' even in native
>>> mode, use 'force-copy=on' option.  This might be useful if there is
>>> some issue with the driver.
>>>
>>> Option 'queues=N' allows to specify how many device queues should
>>> be open.  Note that all the queues that are not open are still
>>> functional and can receive traffic, but it will not be delivered to
>>> QEMU.  So, the number of device queues should generally match the
>>> QEMU configuration, unless the device is shared with something
>>> else and the traffic re-direction to appropriate queues is correctly
>>> configured on a device level (e.g. with ethtool -N).
>>> 'start-queue=M' option can be used to specify from which queue id
>>> QEMU should start configuring 'N' queues.  It might also be necessary
>>> to use this option with certain NICs, e.g. MLX5 NICs.  See the docs
>>> for examples.
>>>
>>> In a general case QEMU will need CAP_NET_ADMIN and CAP_SYS_ADMIN
>>> capabilities in order to load default XSK/XDP programs to the
>>> network interface and configure BTF maps.
>>
>> I think you mean "BPF" actually?

"BPF Type Format maps" kind of makes some sense, but yes. :)

>>
>>>  It is possible, however,
>>> to run only with CAP_NET_RAW.
>>
>> Qemu often runs without any privileges, so we need to fix it.
>>
>> I think adding support for SCM_RIGHTS via monitor would be a way to go.

I looked through the code and it seems like we can run completely
non-privileged as far as the kernel is concerned.  We'll need an API
modification in libxdp though.

The thing is, IIUC, the only syscall that requires CAP_NET_RAW is
the base socket creation.  Binding and other configuration don't
require any privileges.  So, we could create a socket externally
and pass it to QEMU.  That should work, unless it's an oversight from
the kernel side that needs to be patched. :)  libxdp doesn't have
a way to specify an externally created socket today, so we'll need
to change that.  Should be easy to do though.  I c
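
The non-libxdp part of the plan is simple enough.  Roughly (an untested
sketch, error handling omitted; 'helper_sock' stands for a unix socket
connected to QEMU, and AF_XDP is assumed to be defined by the libc headers):

    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    /* Privileged helper side: socket() is the only step that needs
     * CAP_NET_RAW; binding and the rest of the setup can be done by
     * the receiving process later. */
    static int send_xsk_fd(int helper_sock)
    {
        int xsk_fd = socket(AF_XDP, SOCK_RAW, 0);
        union {
            char buf[CMSG_SPACE(sizeof(int))];
            struct cmsghdr align;
        } control;
        char byte = 0;
        struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
        struct msghdr msg = {
            .msg_iov = &iov, .msg_iovlen = 1,
            .msg_control = control.buf, .msg_controllen = sizeof(control.buf),
        };
        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &xsk_fd, sizeof(int));

        /* The helper may close(xsk_fd) once it has been sent. */
        return sendmsg(helper_sock, &msg, 0) < 0 ? -1 : 0;
    }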

Re: [PATCH] net: add initial support for AF_XDP network backend

2023-06-27 Thread Ilya Maximets
On 6/27/23 04:54, Jason Wang wrote:
> On Mon, Jun 26, 2023 at 9:17 PM Ilya Maximets  wrote:
>>
>> On 6/26/23 08:32, Jason Wang wrote:
>>> On Sun, Jun 25, 2023 at 3:06 PM Jason Wang  wrote:
>>>>
>>>> On Fri, Jun 23, 2023 at 5:58 AM Ilya Maximets  wrote:
>>>>>
>>>>> AF_XDP is a network socket family that allows communication directly
>>>>> with the network device driver in the kernel, bypassing most or all
>>>>> of the kernel networking stack.  In the essence, the technology is
>>>>> pretty similar to netmap.  But, unlike netmap, AF_XDP is Linux-native
>>>>> and works with any network interfaces without driver modifications.
>>>>> Unlike vhost-based backends (kernel, user, vdpa), AF_XDP doesn't
>>>>> require access to character devices or unix sockets.  Only access to
>>>>> the network interface itself is necessary.
>>>>>
>>>>> This patch implements a network backend that communicates with the
>>>>> kernel by creating an AF_XDP socket.  A chunk of userspace memory
>>>>> is shared between QEMU and the host kernel.  4 ring buffers (Tx, Rx,
>>>>> Fill and Completion) are placed in that memory along with a pool of
>>>>> memory buffers for the packet data.  Data transmission is done by
>>>>> allocating one of the buffers, copying packet data into it and
>>>>> placing the pointer into Tx ring.  After transmission, device will
>>>>> return the buffer via Completion ring.  On Rx, device will take
>>>>> a buffer form a pre-populated Fill ring, write the packet data into
>>>>> it and place the buffer into Rx ring.
>>>>>
>>>>> AF_XDP network backend takes on the communication with the host
>>>>> kernel and the network interface and forwards packets to/from the
>>>>> peer device in QEMU.
>>>>>
>>>>> Usage example:
>>>>>
>>>>>   -device virtio-net-pci,netdev=guest1,mac=00:16:35:AF:AA:5C
>>>>>   -netdev af-xdp,ifname=ens6f1np1,id=guest1,mode=native,queues=1
>>>>>
>>>>> XDP program bridges the socket with a network interface.  It can be
>>>>> attached to the interface in 2 different modes:
>>>>>
>>>>> 1. skb - this mode should work for any interface and doesn't require
>>>>>  driver support.  With a caveat of lower performance.
>>>>>
>>>>> 2. native - this does require support from the driver and allows to
>>>>> bypass skb allocation in the kernel and potentially use
>>>>> zero-copy while getting packets in/out userspace.
>>>>>
>>>>> By default, QEMU will try to use native mode and fall back to skb.
>>>>> Mode can be forced via 'mode' option.  To force 'copy' even in native
>>>>> mode, use 'force-copy=on' option.  This might be useful if there is
>>>>> some issue with the driver.
>>>>>
>>>>> Option 'queues=N' allows to specify how many device queues should
>>>>> be open.  Note that all the queues that are not open are still
>>>>> functional and can receive traffic, but it will not be delivered to
>>>>> QEMU.  So, the number of device queues should generally match the
>>>>> QEMU configuration, unless the device is shared with something
>>>>> else and the traffic re-direction to appropriate queues is correctly
>>>>> configured on a device level (e.g. with ethtool -N).
>>>>> 'start-queue=M' option can be used to specify from which queue id
>>>>> QEMU should start configuring 'N' queues.  It might also be necessary
>>>>> to use this option with certain NICs, e.g. MLX5 NICs.  See the docs
>>>>> for examples.
>>>>>
>>>>> In a general case QEMU will need CAP_NET_ADMIN and CAP_SYS_ADMIN
>>>>> capabilities in order to load default XSK/XDP programs to the
>>>>> network interface and configure BTF maps.
>>>>
>>>> I think you mean "BPF" actually?
>>
>> "BPF Type Format maps" kind of makes some sense, but yes. :)
>>
>>>>
>>>>>  It is possible, however,
>>>>> to run only with CAP_NET_RAW.
>>>>
>>>> Qemu often runs without any privileges, so we need to fix it.
>>>>
>

Re: [PATCH] net: add initial support for AF_XDP network backend

2023-06-27 Thread Ilya Maximets
On 6/27/23 10:56, Stefan Hajnoczi wrote:
> Can multiple VMs share a host netdev by filtering incoming traffic
> based on each VM's MAC address and directing it to the appropriate
> XSK? If yes, then I think AF_XDP is interesting when SR-IOV or similar
> hardware features are not available.

Good point.  Thanks!

Yes, they can.  Traffic can be redirected via 'ethtool -N', similarly
to the example in the patch.  Or, potentially, via a custom XDP program.
Then different QEMU instances may use different start-queue arguments
and use their own range of queues this way.

> 
> The idea of an AF_XDP passthrough device seems interesting because it
> would minimize the overhead and avoid some of the existing software
> limitations (mostly in QEMU's networking subsystem) that you
> described. I don't know whether the AF_XDP API is suitable or can be
> extended to build a hardware emulation interface, but it seems
> plausible. When Stefano Garzarella played with io_uring passthrough
> into the guest, one of the issues was guest memory translation (since
> the guest doesn't use host userspace virtual addresses). I guess
> AF_XDP would need an API for adding/removing memory translations or
> operate in a mode where addresses are relative offsets from the start
> of the umem regions

Actually, addresses in AF_XDP rings are already offsets from the
start of the umem region.  For example, xsk_umem__get_data is
implemented as &((char *)umem_area)[addr]; inside libxdp.  So, that
should not be an issue.
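
I.e., pulling a packet off the Rx ring is roughly the following (a sketch,
assuming libxdp's <xdp/xsk.h> helpers and an already configured socket;
'process' is just a placeholder for the consumer):

    #include <xdp/xsk.h>

    static void rx_one(struct xsk_ring_cons *rx, void *umem_area,
                       void (*process)(void *data, __u32 len))
    {
        __u32 idx;

        if (xsk_ring_cons__peek(rx, 1, &idx) == 1) {
            const struct xdp_desc *desc = xsk_ring_cons__rx_desc(rx, idx);

            /* desc->addr is an offset from the start of the umem area,
             * so the "translation" is just base + offset. */
            process(xsk_umem__get_data(umem_area, desc->addr), desc->len);

            xsk_ring_cons__release(rx, 1);
        }
    }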

> (but this may be impractical if it limits where
> the guest can allocate packet payload buffers).

Yeah, we will either need to:

a. register the whole guest memory as umem and offset buffer pointers
   in the guest driver by the start of guest physical memory.

   (I'm not very familiar with the QEMU memory subsystem.  Does guest
   physical memory always start at 0?  I know that it's not always true
   for real hardware.)

b. or require the guest driver to allocate a chunk of aligned contiguous
   memory and copy all the packets there on Tx.  And populate the Fill
   ring only with buffers from that area.  Assuming guest pages align
   with the host pages.  Again, a single copy might not be that bad,
   but it's hard to tell what the actual impact will be without testing.

> 
> Whether you pursue the passthrough approach or not, making -netdev
> af-xdp work in an environment where QEMU runs unprivileged seems like
> the most important practical issue to solve.

Yes, working on it.  Doesn't seem to be hard to do, but I need to test.

Best regards, Ilya Maximets.




Re: [PATCH] net: add initial support for AF_XDP network backend

2023-06-28 Thread Ilya Maximets
On 6/28/23 05:27, Jason Wang wrote:
> On Wed, Jun 28, 2023 at 6:45 AM Ilya Maximets  wrote:
>>
>> On 6/27/23 04:54, Jason Wang wrote:
>>> On Mon, Jun 26, 2023 at 9:17 PM Ilya Maximets  wrote:
>>>>
>>>> On 6/26/23 08:32, Jason Wang wrote:
>>>>> On Sun, Jun 25, 2023 at 3:06 PM Jason Wang  wrote:
>>>>>>
>>>>>> On Fri, Jun 23, 2023 at 5:58 AM Ilya Maximets  wrote:
>>>>>>>
>>>>>>> AF_XDP is a network socket family that allows communication directly
>>>>>>> with the network device driver in the kernel, bypassing most or all
>>>>>>> of the kernel networking stack.  In the essence, the technology is
>>>>>>> pretty similar to netmap.  But, unlike netmap, AF_XDP is Linux-native
>>>>>>> and works with any network interfaces without driver modifications.
>>>>>>> Unlike vhost-based backends (kernel, user, vdpa), AF_XDP doesn't
>>>>>>> require access to character devices or unix sockets.  Only access to
>>>>>>> the network interface itself is necessary.
>>>>>>>
>>>>>>> This patch implements a network backend that communicates with the
>>>>>>> kernel by creating an AF_XDP socket.  A chunk of userspace memory
>>>>>>> is shared between QEMU and the host kernel.  4 ring buffers (Tx, Rx,
>>>>>>> Fill and Completion) are placed in that memory along with a pool of
>>>>>>> memory buffers for the packet data.  Data transmission is done by
>>>>>>> allocating one of the buffers, copying packet data into it and
>>>>>>> placing the pointer into Tx ring.  After transmission, device will
>>>>>>> return the buffer via Completion ring.  On Rx, device will take
>>>>>>> a buffer form a pre-populated Fill ring, write the packet data into
>>>>>>> it and place the buffer into Rx ring.
>>>>>>>
>>>>>>> AF_XDP network backend takes on the communication with the host
>>>>>>> kernel and the network interface and forwards packets to/from the
>>>>>>> peer device in QEMU.
>>>>>>>
>>>>>>> Usage example:
>>>>>>>
>>>>>>>   -device virtio-net-pci,netdev=guest1,mac=00:16:35:AF:AA:5C
>>>>>>>   -netdev af-xdp,ifname=ens6f1np1,id=guest1,mode=native,queues=1
>>>>>>>
>>>>>>> XDP program bridges the socket with a network interface.  It can be
>>>>>>> attached to the interface in 2 different modes:
>>>>>>>
>>>>>>> 1. skb - this mode should work for any interface and doesn't require
>>>>>>>  driver support.  With a caveat of lower performance.
>>>>>>>
>>>>>>> 2. native - this does require support from the driver and allows to
>>>>>>> bypass skb allocation in the kernel and potentially use
>>>>>>> zero-copy while getting packets in/out userspace.
>>>>>>>
>>>>>>> By default, QEMU will try to use native mode and fall back to skb.
>>>>>>> Mode can be forced via 'mode' option.  To force 'copy' even in native
>>>>>>> mode, use 'force-copy=on' option.  This might be useful if there is
>>>>>>> some issue with the driver.
>>>>>>>
>>>>>>> Option 'queues=N' allows to specify how many device queues should
>>>>>>> be open.  Note that all the queues that are not open are still
>>>>>>> functional and can receive traffic, but it will not be delivered to
>>>>>>> QEMU.  So, the number of device queues should generally match the
>>>>>>> QEMU configuration, unless the device is shared with something
>>>>>>> else and the traffic re-direction to appropriate queues is correctly
>>>>>>> configured on a device level (e.g. with ethtool -N).
>>>>>>> 'start-queue=M' option can be used to specify from which queue id
>>>>>>> QEMU should start configuring 'N' queues.  It might also be necessary
>>>>>>> to use this option with certain NICs, e.g. MLX5 NICs.  See the docs
>>>>>>> for examples.
>>>>>>>
>>>>>

Re: [PATCH] net: add initial support for AF_XDP network backend

2023-06-30 Thread Ilya Maximets
On 6/30/23 09:44, Jason Wang wrote:
> On Wed, Jun 28, 2023 at 7:14 PM Ilya Maximets  wrote:
>>
>> On 6/28/23 05:27, Jason Wang wrote:
>>> On Wed, Jun 28, 2023 at 6:45 AM Ilya Maximets  wrote:
>>>>
>>>> On 6/27/23 04:54, Jason Wang wrote:
>>>>> On Mon, Jun 26, 2023 at 9:17 PM Ilya Maximets  wrote:
>>>>>>
>>>>>> On 6/26/23 08:32, Jason Wang wrote:
>>>>>>> On Sun, Jun 25, 2023 at 3:06 PM Jason Wang  wrote:
>>>>>>>>
>>>>>>>> On Fri, Jun 23, 2023 at 5:58 AM Ilya Maximets  
>>>>>>>> wrote:
>>>>>>>>>
>>>>>>>>> AF_XDP is a network socket family that allows communication directly
>>>>>>>>> with the network device driver in the kernel, bypassing most or all
>>>>>>>>> of the kernel networking stack.  In the essence, the technology is
>>>>>>>>> pretty similar to netmap.  But, unlike netmap, AF_XDP is Linux-native
>>>>>>>>> and works with any network interfaces without driver modifications.
>>>>>>>>> Unlike vhost-based backends (kernel, user, vdpa), AF_XDP doesn't
>>>>>>>>> require access to character devices or unix sockets.  Only access to
>>>>>>>>> the network interface itself is necessary.
>>>>>>>>>
>>>>>>>>> This patch implements a network backend that communicates with the
>>>>>>>>> kernel by creating an AF_XDP socket.  A chunk of userspace memory
>>>>>>>>> is shared between QEMU and the host kernel.  4 ring buffers (Tx, Rx,
>>>>>>>>> Fill and Completion) are placed in that memory along with a pool of
>>>>>>>>> memory buffers for the packet data.  Data transmission is done by
>>>>>>>>> allocating one of the buffers, copying packet data into it and
>>>>>>>>> placing the pointer into Tx ring.  After transmission, device will
>>>>>>>>> return the buffer via Completion ring.  On Rx, device will take
>>>>>>>>> a buffer form a pre-populated Fill ring, write the packet data into
>>>>>>>>> it and place the buffer into Rx ring.
>>>>>>>>>
>>>>>>>>> AF_XDP network backend takes on the communication with the host
>>>>>>>>> kernel and the network interface and forwards packets to/from the
>>>>>>>>> peer device in QEMU.
>>>>>>>>>
>>>>>>>>> Usage example:
>>>>>>>>>
>>>>>>>>>   -device virtio-net-pci,netdev=guest1,mac=00:16:35:AF:AA:5C
>>>>>>>>>   -netdev af-xdp,ifname=ens6f1np1,id=guest1,mode=native,queues=1
>>>>>>>>>
>>>>>>>>> XDP program bridges the socket with a network interface.  It can be
>>>>>>>>> attached to the interface in 2 different modes:
>>>>>>>>>
>>>>>>>>> 1. skb - this mode should work for any interface and doesn't require
>>>>>>>>>  driver support.  With a caveat of lower performance.
>>>>>>>>>
>>>>>>>>> 2. native - this does require support from the driver and allows to
>>>>>>>>> bypass skb allocation in the kernel and potentially use
>>>>>>>>> zero-copy while getting packets in/out userspace.
>>>>>>>>>
>>>>>>>>> By default, QEMU will try to use native mode and fall back to skb.
>>>>>>>>> Mode can be forced via 'mode' option.  To force 'copy' even in native
>>>>>>>>> mode, use 'force-copy=on' option.  This might be useful if there is
>>>>>>>>> some issue with the driver.
>>>>>>>>>
>>>>>>>>> Option 'queues=N' allows to specify how many device queues should
>>>>>>>>> be open.  Note that all the queues that are not open are still
>>>>>>>>> functional and can receive traffic, but it will not be delivered to
>>>>>>>>> QEMU.  So, the number of device queues should generally match the
>>>>>>>>> QEMU configuration, unle

[PATCH] net: add initial support for AF_XDP network backend

2023-06-22 Thread Ilya Maximets
ever, keeping in mind all of these challenges, the current implementation
of the AF_XDP backend shows decent performance while running on top
of a physical NIC with zero-copy support.

Test setup:

2 VMs running on 2 physical hosts connected via ConnectX6-Dx card.
Network backend is configured to open the NIC directly in native mode.
The driver supports zero-copy.  NIC is configured to use 1 queue.

Inside a VM - iperf3 for basic TCP performance testing and dpdk-testpmd
for PPS testing.

iperf3 result:
 TCP stream  : 19.1 Gbps

dpdk-testpmd (single queue, single CPU core, 64 B packets) results:
 Tx only : 3.4 Mpps
 Rx only : 2.0 Mpps
 L2 FWD Loopback : 1.5 Mpps

In skb mode the same setup shows much lower performance, similar to
the setup where the pair of physical NICs is replaced with a veth pair:

iperf3 result:
  TCP stream  : 9 Gbps

dpdk-testpmd (single queue, single CPU core, 64 B packets) results:
  Tx only : 1.2 Mpps
  Rx only : 1.0 Mpps
  L2 FWD Loopback : 0.7 Mpps

Results in skb mode or over the veth are close to results of a tap
backend with vhost=on and disabled segmentation offloading bridged
with a NIC.

Signed-off-by: Ilya Maximets 
---
 MAINTAINERS   |   4 +
 hmp-commands.hx   |   2 +-
 meson.build   |  14 +
 meson_options.txt |   2 +
 net/af-xdp.c  | 501 ++
 net/clients.h |   5 +
 net/meson.build   |   3 +
 net/net.c |   6 +
 qapi/net.json |  54 +-
 qemu-options.hx   |  61 ++-
 .../ci/org.centos/stream/8/x86_64/configure   |   1 +
 scripts/meson-buildoptions.sh |   3 +
 tests/docker/dockerfiles/debian-amd64.docker  |   1 +
 13 files changed, 654 insertions(+), 3 deletions(-)
 create mode 100644 net/af-xdp.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 7f323cd2eb..ca85422676 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2925,6 +2925,10 @@ W: http://info.iet.unipi.it/~luigi/netmap/
 S: Maintained
 F: net/netmap.c
 
+AF_XDP network backend
+R: Ilya Maximets 
+F: net/af-xdp.c
+
 Host Memory Backends
 M: David Hildenbrand 
 M: Igor Mammedov 
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 2cbd0f77a0..af9ffe4681 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1295,7 +1295,7 @@ ERST
 {
 .name   = "netdev_add",
 .args_type  = "netdev:O",
-.params = 
"[user|tap|socket|stream|dgram|vde|bridge|hubport|netmap|vhost-user"
+.params = 
"[user|tap|socket|stream|dgram|vde|bridge|hubport|netmap|af-xdp|vhost-user"
 #ifdef CONFIG_VMNET
   "|vmnet-host|vmnet-shared|vmnet-bridged"
 #endif
diff --git a/meson.build b/meson.build
index 6ef78ea278..d0abb658c5 100644
--- a/meson.build
+++ b/meson.build
@@ -1883,6 +1883,18 @@ if libbpf.found() and not cc.links('''
   endif
 endif
 
+# libxdp
+libxdp = dependency('libxdp', required: get_option('af_xdp'), method: 
'pkg-config')
+if libxdp.found() and \
+  not (libbpf.found() and libbpf.version().version_compare('>=0.7'))
+  libxdp = not_found
+  if get_option('af_xdp').enabled()
+error('af-xdp support requires libbpf version >= 0.7')
+  else
+warning('af-xdp support requires libbpf version >= 0.7, disabling')
+  endif
+endif
+
 # libdw
 libdw = not_found
 if not get_option('libdw').auto() or \
@@ -2106,6 +2118,7 @@ config_host_data.set('CONFIG_HEXAGON_IDEF_PARSER', 
get_option('hexagon_idef_pars
 config_host_data.set('CONFIG_LIBATTR', have_old_libattr)
 config_host_data.set('CONFIG_LIBCAP_NG', libcap_ng.found())
 config_host_data.set('CONFIG_EBPF', libbpf.found())
+config_host_data.set('CONFIG_AF_XDP', libxdp.found())
 config_host_data.set('CONFIG_LIBDAXCTL', libdaxctl.found())
 config_host_data.set('CONFIG_LIBISCSI', libiscsi.found())
 config_host_data.set('CONFIG_LIBNFS', libnfs.found())
@@ -4279,6 +4292,7 @@ summary_info += {'PVRDMA support':have_pvrdma}
 summary_info += {'fdt support':   fdt_opt == 'disabled' ? false : fdt_opt}
 summary_info += {'libcap-ng support': libcap_ng}
 summary_info += {'bpf support':   libbpf}
+summary_info += {'AF_XDP support':libxdp}
 summary_info += {'rbd support':   rbd}
 summary_info += {'smartcard support': cacard}
 summary_info += {'U2F support':   u2f}
diff --git a/meson_options.txt b/meson_options.txt
index 90237389e2..31596d59f1 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -120,6 +120,8 @@ option('avx51

Re: [PULL 00/17] Net patches

2023-09-08 Thread Ilya Maximets
On 9/8/23 13:19, Stefan Hajnoczi wrote:
> Hi Ilya and Jason,
> There is a CI failure related to a missing Debian libxdp-dev package:
> https://gitlab.com/qemu-project/qemu/-/jobs/5046139967
> 
> I think the issue is that the debian-amd64 container image that QEMU
> uses for testing is based on Debian 11 ("bullseye" aka "oldstable")
> and libxdp is not available on that release:
> https://packages.debian.org/search?keywords=libxdp&searchon=names&suite=oldstable&section=all

Hmm.  Sorry about that.

> 
> If we need to support Debian 11 CI then either XDP could be disabled
> for that distro or libxdp could be compiled from source.

I'd suggest we just remove the attempt to install the package for now;
building libxdp from source may be a little painful to maintain.

It can be re-added later once distributions with libxdp 1.4+ are more
widely available, i.e. when the fedora dockerfile is updated to 39,
for example.  That should be soon-ish, right?

> 
> I have CCed Daniel Berrangé, because I think he knows how lcitool and
> QEMU's minimum distro requirements work. Maybe he can suggest a way
> forward.
> 
> Thanks,
> Stefan




Re: [PULL 12/17] net: add initial support for AF_XDP network backend

2023-09-08 Thread Ilya Maximets
On 9/8/23 13:48, Daniel P. Berrangé wrote:
> On Fri, Sep 08, 2023 at 02:45:02PM +0800, Jason Wang wrote:
>> From: Ilya Maximets 
>>
>> AF_XDP is a network socket family that allows communication directly
>> with the network device driver in the kernel, bypassing most or all
>> of the kernel networking stack.  In the essence, the technology is
>> pretty similar to netmap.  But, unlike netmap, AF_XDP is Linux-native
>> and works with any network interfaces without driver modifications.
>> Unlike vhost-based backends (kernel, user, vdpa), AF_XDP doesn't
>> require access to character devices or unix sockets.  Only access to
>> the network interface itself is necessary.
>>
>> This patch implements a network backend that communicates with the
>> kernel by creating an AF_XDP socket.  A chunk of userspace memory
>> is shared between QEMU and the host kernel.  4 ring buffers (Tx, Rx,
>> Fill and Completion) are placed in that memory along with a pool of
>> memory buffers for the packet data.  Data transmission is done by
>> allocating one of the buffers, copying packet data into it and
>> placing the pointer into Tx ring.  After transmission, device will
>> return the buffer via Completion ring.  On Rx, device will take
>> a buffer form a pre-populated Fill ring, write the packet data into
>> it and place the buffer into Rx ring.
>>
>> AF_XDP network backend takes on the communication with the host
>> kernel and the network interface and forwards packets to/from the
>> peer device in QEMU.
>>
>> Usage example:
>>
>>   -device virtio-net-pci,netdev=guest1,mac=00:16:35:AF:AA:5C
>>   -netdev af-xdp,ifname=ens6f1np1,id=guest1,mode=native,queues=1
>>
>> XDP program bridges the socket with a network interface.  It can be
>> attached to the interface in 2 different modes:
>>
>> 1. skb - this mode should work for any interface and doesn't require
>>  driver support.  With a caveat of lower performance.
>>
>> 2. native - this does require support from the driver and allows to
>> bypass skb allocation in the kernel and potentially use
>> zero-copy while getting packets in/out userspace.
>>
>> By default, QEMU will try to use native mode and fall back to skb.
>> Mode can be forced via 'mode' option.  To force 'copy' even in native
>> mode, use 'force-copy=on' option.  This might be useful if there is
>> some issue with the driver.
>>
>> Option 'queues=N' allows to specify how many device queues should
>> be open.  Note that all the queues that are not open are still
>> functional and can receive traffic, but it will not be delivered to
>> QEMU.  So, the number of device queues should generally match the
>> QEMU configuration, unless the device is shared with something
>> else and the traffic re-direction to appropriate queues is correctly
>> configured on a device level (e.g. with ethtool -N).
>> 'start-queue=M' option can be used to specify from which queue id
>> QEMU should start configuring 'N' queues.  It might also be necessary
>> to use this option with certain NICs, e.g. MLX5 NICs.  See the docs
>> for examples.
>>
>> In a general case QEMU will need CAP_NET_ADMIN and CAP_SYS_ADMIN
>> or CAP_BPF capabilities in order to load default XSK/XDP programs to
>> the network interface and configure BPF maps.  It is possible, however,
>> to run with no capabilities.  For that to work, an external process
>> with enough capabilities will need to pre-load default XSK program,
>> create AF_XDP sockets and pass their file descriptors to QEMU process
>> on startup via 'sock-fds' option.  Network backend will need to be
>> configured with 'inhibit=on' to avoid loading of the program.
>> QEMU will need 32 MB of locked memory (RLIMIT_MEMLOCK) per queue
>> or CAP_IPC_LOCK.
>>
>> There are few performance challenges with the current network backends.
>>
>> First is that they do not support IO threads.  This means that data
>> path is handled by the main thread in QEMU and may slow down other
>> work or may be slowed down by some other work.  This also means that
>> taking advantage of multi-queue is generally not possible today.
>>
>> Another thing is that data path is going through the device emulation
>> code, which is not really optimized for performance.  The fastest
>> "frontend" device is virtio-net.  But it's not optimized for heavy
>> traffic either, because it expects such use-cases to be handled via
>>

Re: [PULL 00/17] Net patches

2023-09-08 Thread Ilya Maximets
On 9/8/23 13:49, Daniel P. Berrangé wrote:
> On Fri, Sep 08, 2023 at 01:34:54PM +0200, Ilya Maximets wrote:
>> On 9/8/23 13:19, Stefan Hajnoczi wrote:
>>> Hi Ilya and Jason,
>>> There is a CI failure related to a missing Debian libxdp-dev package:
>>> https://gitlab.com/qemu-project/qemu/-/jobs/5046139967
>>>
>>> I think the issue is that the debian-amd64 container image that QEMU
>>> uses for testing is based on Debian 11 ("bullseye" aka "oldstable")
>>> and libxdp is not available on that release:
>>> https://packages.debian.org/search?keywords=libxdp&searchon=names&suite=oldstable&section=all
>>
>> Hmm.  Sorry about that.
>>
>>>
>>> If we need to support Debian 11 CI then either XDP could be disabled
>>> for that distro or libxdp could be compiled from source.
>>
>> I'd suggest we just remove the attempt to install the package for now,
>> building libxdp from sources may be a little painful to maintain.
>>
>> Can be re-added later once distributions with libxdp 1.4+ will be more
>> widely available, i.e. when fedora dockerfile will be updated to 39,
>> for example.  That should be soon-ish, right?
> 
> If you follow the process in docs/devel/testing.rst for adding
> libxdp in libvirt-ci, then lcitool will "do the right thing"
> when we move the auto-generated dockerfiles to new distro versions.

Thanks!  I'll prepare changes for libvirt-ci.

In the meantime, none of the currently tested images will have the required
version of libxdp anyway, so I'm suggesting we just drop this one dockerfile
modification from the patch.  What do you think?

Best regards, Ilya Maximets.



Re: [PULL 00/17] Net patches

2023-09-08 Thread Ilya Maximets
On 9/8/23 14:15, Daniel P. Berrangé wrote:
> On Fri, Sep 08, 2023 at 02:00:47PM +0200, Ilya Maximets wrote:
>> On 9/8/23 13:49, Daniel P. Berrangé wrote:
>>> On Fri, Sep 08, 2023 at 01:34:54PM +0200, Ilya Maximets wrote:
>>>> On 9/8/23 13:19, Stefan Hajnoczi wrote:
>>>>> Hi Ilya and Jason,
>>>>> There is a CI failure related to a missing Debian libxdp-dev package:
>>>>> https://gitlab.com/qemu-project/qemu/-/jobs/5046139967
>>>>>
>>>>> I think the issue is that the debian-amd64 container image that QEMU
>>>>> uses for testing is based on Debian 11 ("bullseye" aka "oldstable")
>>>>> and libxdp is not available on that release:
>>>>> https://packages.debian.org/search?keywords=libxdp&searchon=names&suite=oldstable&section=all
>>>>
>>>> Hmm.  Sorry about that.
>>>>
>>>>>
>>>>> If we need to support Debian 11 CI then either XDP could be disabled
>>>>> for that distro or libxdp could be compiled from source.
>>>>
>>>> I'd suggest we just remove the attempt to install the package for now,
>>>> building libxdp from sources may be a little painful to maintain.
>>>>
>>>> Can be re-added later once distributions with libxdp 1.4+ will be more
>>>> widely available, i.e. when fedora dockerfile will be updated to 39,
>>>> for example.  That should be soon-ish, right?
>>>
>>> If you follow the process in docs/devel/testing.rst for adding
>>> libxdp in libvirt-ci, then lcitool will "do the right thing"
>>> when we move the auto-generated dockerfiles to new distro versions.
>>
>> Thanks!  I'll prepare changes for libvirt-ci.
>>
>> In the meantime, none of the currently tested images will have a required
>> version of libxdp anyway, so I'm suggesting to just drop this one dockerfile
>> modification from the patch.  What do you think?
> 
> Sure, if none of the distros have it, then lcitool won't emit the
> dockerfile changes until we update the inherited distro version.
> So it is sufficient to just update libvirt-ci.git with the mappings.yml
> info for libxdp, and add 'libxdp' to the tests/lcitool/projects/qemu.yml
> file in qemu.git. It will then 'just work' when someone updates the
> distro versions later.

I posted an MR for libvirt-ci adding libxdp:
  https://gitlab.com/libvirt/libvirt-ci/-/merge_requests/429

Please, take a look.

The docs say that CI will try to build containers with the MR changes,
but I don't think anything except sanity checks is actually tested on the MR.
Sorry if I missed something; I've never used GitLab pipelines before.

Note that with this update we will be installing an older version of libxdp
in many containers, even though it will not be used by QEMU unless
it is newer than 1.4.0.

tests/lcitool/projects/qemu.yml in qemu.git cannot be updated without
updating a submodule after the MR merge.

Best regards, Ilya Maximets.



[PATCH v4 0/2] net: add initial support for AF_XDP network backend

2023-09-13 Thread Ilya Maximets
See the feature description in patch #2.

Version 4:

  - Fixed image builds with libvirt-ci by adding a new commit that
bumps a submodule version and then using that new version for
libxdp support in the main qemu build.
Based on discussion in the PULL request:
  https://lists.gnu.org/archive/html/qemu-devel/2023-09/msg01730.html

Version 3:

  - Bump requirements to libxdp 1.4.0+.  Having that, removed all
the conditional compilation parts, since all the needed APIs
are available in this version of libxdp.

  - Also removed the ability to pass the xsks map fd, since the ability
to just pass socket fds is now always available and doesn't require
any capabilities, unlike manipulations with BPF maps.

  - Updated documentation to not call out specific vendors, memory
numbers or specific required capabilities.

  - Changed logic of returning peeked at but unused descriptors.

  - Minor cleanups.


Version 2:

  - Added support for running with no capabilities by passing
pre-created AF_XDP socket file descriptors via the 'sock-fds' option.
Conditionally compiled, because it requires the unreleased libxdp 1.4.0.
The remaining restriction is needing 32 MB of RLIMIT_MEMLOCK per queue.

  - Refined and extended documentation.


Ilya Maximets (2):
  tests: bump libvirt-ci for libasan and libxdp
  net: add initial support for AF_XDP network backend

 MAINTAINERS   |   4 +
 hmp-commands.hx   |   3 +
 meson.build   |   9 +
 meson_options.txt |   2 +
 net/af-xdp.c  | 526 ++
 net/clients.h |   5 +
 net/meson.build   |   3 +
 net/net.c |   6 +
 qapi/net.json |  58 ++
 qemu-options.hx   |  70 ++-
 .../ci/org.centos/stream/8/x86_64/configure   |   1 +
 scripts/meson-buildoptions.sh |   3 +
 tests/docker/dockerfiles/alpine.docker|   1 +
 tests/docker/dockerfiles/centos8.docker   |   1 +
 .../dockerfiles/debian-amd64-cross.docker |   2 +-
 tests/docker/dockerfiles/debian-amd64.docker  |   2 +-
 .../dockerfiles/debian-arm64-cross.docker |   2 +-
 .../dockerfiles/debian-armel-cross.docker |   2 +-
 .../dockerfiles/debian-armhf-cross.docker |   2 +-
 .../dockerfiles/debian-ppc64el-cross.docker   |   2 +-
 .../dockerfiles/debian-s390x-cross.docker |   2 +-
 tests/docker/dockerfiles/fedora.docker|   1 +
 tests/docker/dockerfiles/opensuse-leap.docker |   2 +-
 tests/docker/dockerfiles/ubuntu2004.docker|   2 +-
 tests/docker/dockerfiles/ubuntu2204.docker|   2 +-
 tests/lcitool/libvirt-ci  |   2 +-
 tests/lcitool/projects/qemu.yml   |   1 +
 27 files changed, 704 insertions(+), 12 deletions(-)
 create mode 100644 net/af-xdp.c

-- 
2.41.0




[PATCH v4 1/2] tests: bump libvirt-ci for libasan and libxdp

2023-09-13 Thread Ilya Maximets
This pulls in the fixes for libasan version as well as support for
libxdp that will be used for af-xdp netdev in the next commits.

Signed-off-by: Ilya Maximets 
---
 tests/docker/dockerfiles/debian-amd64-cross.docker   | 2 +-
 tests/docker/dockerfiles/debian-amd64.docker | 2 +-
 tests/docker/dockerfiles/debian-arm64-cross.docker   | 2 +-
 tests/docker/dockerfiles/debian-armel-cross.docker   | 2 +-
 tests/docker/dockerfiles/debian-armhf-cross.docker   | 2 +-
 tests/docker/dockerfiles/debian-ppc64el-cross.docker | 2 +-
 tests/docker/dockerfiles/debian-s390x-cross.docker   | 2 +-
 tests/docker/dockerfiles/opensuse-leap.docker| 2 +-
 tests/docker/dockerfiles/ubuntu2004.docker   | 2 +-
 tests/docker/dockerfiles/ubuntu2204.docker   | 2 +-
 tests/lcitool/libvirt-ci | 2 +-
 11 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/tests/docker/dockerfiles/debian-amd64-cross.docker 
b/tests/docker/dockerfiles/debian-amd64-cross.docker
index b66b9cc191..0cf3ba6d60 100644
--- a/tests/docker/dockerfiles/debian-amd64-cross.docker
+++ b/tests/docker/dockerfiles/debian-amd64-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
   g++-x86-64-linux-gnu \
   gcc-x86-64-linux-gnu \
   libaio-dev:amd64 \
-  libasan5:amd64 \
+  libasan6:amd64 \
   libasound2-dev:amd64 \
   libattr1-dev:amd64 \
   libbpf-dev:amd64 \
diff --git a/tests/docker/dockerfiles/debian-amd64.docker 
b/tests/docker/dockerfiles/debian-amd64.docker
index 02262bc70e..e3e1de25dd 100644
--- a/tests/docker/dockerfiles/debian-amd64.docker
+++ b/tests/docker/dockerfiles/debian-amd64.docker
@@ -32,7 +32,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
   git \
   hostname \
   libaio-dev \
-  libasan5 \
+  libasan6 \
   libasound2-dev \
   libattr1-dev \
   libbpf-dev \
diff --git a/tests/docker/dockerfiles/debian-arm64-cross.docker 
b/tests/docker/dockerfiles/debian-arm64-cross.docker
index a0a968b8c6..d8cd4f87b6 100644
--- a/tests/docker/dockerfiles/debian-arm64-cross.docker
+++ b/tests/docker/dockerfiles/debian-arm64-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
   g++-aarch64-linux-gnu \
   gcc-aarch64-linux-gnu \
   libaio-dev:arm64 \
-  libasan5:arm64 \
+  libasan6:arm64 \
   libasound2-dev:arm64 \
   libattr1-dev:arm64 \
   libbpf-dev:arm64 \
diff --git a/tests/docker/dockerfiles/debian-armel-cross.docker 
b/tests/docker/dockerfiles/debian-armel-cross.docker
index f1fc34a28a..75342c09b0 100644
--- a/tests/docker/dockerfiles/debian-armel-cross.docker
+++ b/tests/docker/dockerfiles/debian-armel-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
   g++-arm-linux-gnueabi \
   gcc-arm-linux-gnueabi \
   libaio-dev:armel \
-  libasan5:armel \
+  libasan6:armel \
   libasound2-dev:armel \
   libattr1-dev:armel \
   libbpf-dev:armel \
diff --git a/tests/docker/dockerfiles/debian-armhf-cross.docker 
b/tests/docker/dockerfiles/debian-armhf-cross.docker
index a278578211..f45cfedd3f 100644
--- a/tests/docker/dockerfiles/debian-armhf-cross.docker
+++ b/tests/docker/dockerfiles/debian-armhf-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
   g++-arm-linux-gnueabihf \
   gcc-arm-linux-gnueabihf \
   libaio-dev:armhf \
-  libasan5:armhf \
+  libasan6:armhf \
   libasound2-dev:armhf \
   libattr1-dev:armhf \
   libbpf-dev:armhf \
diff --git a/tests/docker/dockerfiles/debian-ppc64el-cross.docker 
b/tests/docker/dockerfiles/debian-ppc64el-cross.docker
index 30e5efa986..52f8c34814 100644
--- a/tests/docker/dockerfiles/debian-ppc64el-cross.docker
+++ b/tests/docker/dockerfiles/debian-ppc64el-cross.docker
@@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
   g++-powerpc64le-linux-gnu \
   gcc-powerpc64le-linux-gnu \
   libaio-dev:ppc64el \
-  libasan5:ppc64el \
+  libasan6:ppc64el \
   libasound2-dev:ppc64el \
   libattr1-dev:ppc64el \
   libbpf-dev:ppc64el \
diff --git a/tests/docker/docke

[PATCH v4 2/2] net: add initial support for AF_XDP network backend

2023-09-13 Thread Ilya Maximets
on virtual
interfaces.

However, keeping all of these challenges in mind, the current implementation
of the AF_XDP backend shows decent performance while running on top
of a physical NIC with zero-copy support.

Test setup:

2 VMs running on 2 physical hosts connected via ConnectX6-Dx card.
Network backend is configured to open the NIC directly in native mode.
The driver supports zero-copy.  NIC is configured to use 1 queue.

Inside a VM - iperf3 for basic TCP performance testing and dpdk-testpmd
for PPS testing.

iperf3 result:
 TCP stream  : 19.1 Gbps

dpdk-testpmd (single queue, single CPU core, 64 B packets) results:
 Tx only : 3.4 Mpps
 Rx only : 2.0 Mpps
 L2 FWD Loopback : 1.5 Mpps

In skb mode the same setup shows much lower performance, similar to
a setup where the pair of physical NICs is replaced with a veth pair:

iperf3 result:
  TCP stream  : 9 Gbps

dpdk-testpmd (single queue, single CPU core, 64 B packets) results:
  Tx only : 1.2 Mpps
  Rx only : 1.0 Mpps
  L2 FWD Loopback : 0.7 Mpps

Results in skb mode or over the veth pair are close to those of a tap
backend with vhost=on and segmentation offloading disabled, bridged
with a NIC.

Signed-off-by: Ilya Maximets 
---
 MAINTAINERS   |   4 +
 hmp-commands.hx   |   3 +
 meson.build   |   9 +
 meson_options.txt |   2 +
 net/af-xdp.c  | 526 ++
 net/clients.h |   5 +
 net/meson.build   |   3 +
 net/net.c |   6 +
 qapi/net.json |  58 ++
 qemu-options.hx   |  70 ++-
 .../ci/org.centos/stream/8/x86_64/configure   |   1 +
 scripts/meson-buildoptions.sh |   3 +
 tests/docker/dockerfiles/alpine.docker|   1 +
 tests/docker/dockerfiles/centos8.docker   |   1 +
 tests/docker/dockerfiles/fedora.docker|   1 +
 tests/lcitool/projects/qemu.yml   |   1 +
 16 files changed, 693 insertions(+), 1 deletion(-)
 create mode 100644 net/af-xdp.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 00562f924f..67cefaa6f2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2957,6 +2957,10 @@ W: http://info.iet.unipi.it/~luigi/netmap/
 S: Maintained
 F: net/netmap.c
 
+AF_XDP network backend
+R: Ilya Maximets 
+F: net/af-xdp.c
+
 Host Memory Backends
 M: David Hildenbrand 
 M: Igor Mammedov 
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 2cbd0f77a0..63eac22734 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1296,6 +1296,9 @@ ERST
 .name   = "netdev_add",
 .args_type  = "netdev:O",
 .params = 
"[user|tap|socket|stream|dgram|vde|bridge|hubport|netmap|vhost-user"
+#ifdef CONFIG_AF_XDP
+  "|af-xdp"
+#endif
 #ifdef CONFIG_VMNET
   "|vmnet-host|vmnet-shared|vmnet-bridged"
 #endif
diff --git a/meson.build b/meson.build
index 0e31bdfabf..8e445d180a 100644
--- a/meson.build
+++ b/meson.build
@@ -1873,6 +1873,13 @@ if libbpf.found() and not cc.links('''
   endif
 endif
 
+# libxdp
+libxdp = not_found
+if not get_option('af_xdp').auto() or have_system
+libxdp = dependency('libxdp', required: get_option('af_xdp'),
+version: '>=1.4.0', method: 'pkg-config')
+endif
+
 # libdw
 libdw = not_found
 if not get_option('libdw').auto() or \
@@ -2099,6 +2106,7 @@ config_host_data.set('CONFIG_HEXAGON_IDEF_PARSER', 
get_option('hexagon_idef_pars
 config_host_data.set('CONFIG_LIBATTR', have_old_libattr)
 config_host_data.set('CONFIG_LIBCAP_NG', libcap_ng.found())
 config_host_data.set('CONFIG_EBPF', libbpf.found())
+config_host_data.set('CONFIG_AF_XDP', libxdp.found())
 config_host_data.set('CONFIG_LIBDAXCTL', libdaxctl.found())
 config_host_data.set('CONFIG_LIBISCSI', libiscsi.found())
 config_host_data.set('CONFIG_LIBNFS', libnfs.found())
@@ -4270,6 +4278,7 @@ summary_info = {}
 if targetos == 'darwin'
   summary_info += {'vmnet.framework support': vmnet}
 endif
+summary_info += {'AF_XDP support':libxdp}
 summary_info += {'slirp support': slirp}
 summary_info += {'vde support':   vde}
 summary_info += {'netmap support':have_netmap}
diff --git a/meson_options.txt b/meson_options.txt
index f82d88b7c6..2ca40f22e9 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -122,6 +122,8 @@ option('avx512bw', type: 'feature', value: 'auto',
 option('keyring', type: 'feature', value: 'auto',
description: 'Linux keyring support')
 
+option('

Re: [PATCH v3 2/4] tests/lcitool: Refresh generated files

2024-01-02 Thread Ilya Maximets
On 1/2/24 19:19, Philippe Mathieu-Daudé wrote:
> On 12/7/23 15:07, Daniel P. Berrangé wrote:
>> On Wed, Jul 12, 2023 at 02:55:10PM +0200, Philippe Mathieu-Daudé wrote:
>>> On 12/7/23 13:12, Daniel P. Berrangé wrote:
>>>> On Wed, Jul 12, 2023 at 01:00:38PM +0200, Philippe Mathieu-Daudé wrote:
>>>>> On 11/7/23 19:58, Daniel P. Berrangé wrote:
>>>>>> On Tue, Jul 11, 2023 at 04:49:20PM +0200, Philippe Mathieu-Daudé wrote:
>>>>>>> Refresh the generated files by running:
>>>>>>>
>>>>>>>  $ make lcitool-refresh
>>>>>>>
>>>>>>> Signed-off-by: Philippe Mathieu-Daudé 
>>>>>>> ---
>>>>>>> tests/docker/dockerfiles/debian-amd64.docker |  2 -
>>>>>>> tests/docker/dockerfiles/ubuntu2004.docker   |  2 -
>>>>>>> tests/docker/dockerfiles/ubuntu2204.docker   |  2 -
>>>>>>
>>>>>> I don't know why this would be removing xen/pmem from these files. If
>>>>>> I run 'lcitool-refresh' on current git master that doesn't happen
> 
> FTR since commit cb039ef3d9 libxdp-devel is also being changed on my
> host, similarly to libpmem-devel, so I suppose it also has some host
> specific restriction.

Yeah, many distributions are not building libxdp for non-x86
architectures today.  There is no real reason not to; they just
don't, because the package is relatively new.  It will likely
become available in the future.

I see this thread is a bit old.  Was a solution found for the
pmem/xen issue?  I.e., do I need to do something specific for libxdp
at this point?

Best regards, Ilya Maximets.

> 
>>>>>> Do you have any other local changes on top ?
>>>
>>> (I just noticed manually updating the libvirt-ci submodule is
>>>   pointless because the 'lcitool-refresh' rule does that)
>>>
>>>>> diff --git a/tests/docker/dockerfiles/ubuntu2204.docker
>>>>> b/tests/docker/dockerfiles/ubuntu2204.docker
>>>>> index 1d442cdfe6..5162927016 100644
>>>>> --- a/tests/docker/dockerfiles/ubuntu2204.docker
>>>>> +++ b/tests/docker/dockerfiles/ubuntu2204.docker
>>>>> @@ -73 +72,0 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
>>>>> -  libpmem-dev \
>>>>> @@ -99 +97,0 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
>>>>> -  libxen-dev \
>>>>
>>>> What architecture are you running from? I'm suspecting it is a non-
>>>> x86_64 architecture, as these packages are both arch-conditionalized.
>>>
>>> My host is Darwin aarch64, but how is this relevant to what we
>>> generate for guests? Are we missing specifying the target arch
>>> somewhere? (This is not the '--cross-arch' argument, because we
>>> don't want to cross-build). These dockerfiles always targeted x86_64,
>>> in particular the debian-amd64.docker one...
>>
>> This is an unfortunate design choice of lcitool.
>>
>> If you give '--cross-arch' it will always spit out the dockerfile
>> suitable for cross-compiling to that arch.
>>
>> If you omit '--cross-arch' it will always spit out the dockerfile
>> suitable for compiling on the *current* host arch.
>>
>> When we're committing the dockerfiles to qemu (or any libvirt project
>> using lcitool), we've got an implicit assumption that the non-cross
>> dockerfiles were for x86_64, and hence an implicit assumption that
>> the person who ran 'lcitool' was also on x86_64.
>>
>> This is clearly a bad design choice in retrospect, as people
>> are increasingly likely to be using aarch64 for day-to-day dev work.
>>
>> So I guess we need to come up with a more explicit way to say give
>> me a native container for x86_64.
>>
>> With regards,
>> Daniel
> 




[PATCH] virtio: remove unnecessary thread fence while reading next descriptor

2023-08-25 Thread Ilya Maximets
This was supposed to be a compiler barrier, and it was a compiler barrier
(initially called 'wmb', it seems) when virtio core support was introduced.
Later all the instances of 'wmb' were switched to smp_wmb to fix memory
ordering issues on non-x86 platforms.  However, this one doesn't need to be
an actual memory barrier.  It's enough for it to stay a compiler barrier,
as its only purpose is to ensure that the value is not read twice.

There is no counterpart read barrier in the drivers, AFAICT.  And even
if we needed an actual barrier, it shouldn't have been a write barrier.

Signed-off-by: Ilya Maximets 
---
 hw/virtio/virtio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 309038fd46..6eb8586858 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1051,7 +1051,7 @@ static int virtqueue_split_read_next_desc(VirtIODevice 
*vdev, VRingDesc *desc,
 /* Check they're not leading us off end of descriptors. */
 *next = desc->next;
 /* Make sure compiler knows to grab that: we don't want it changing! */
-smp_wmb();
+barrier();
 
 if (*next >= max) {
 virtio_error(vdev, "Desc next is %u", *next);
-- 
2.40.1
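
As an illustration of the reasoning above, here is a minimal stand-alone
sketch (not QEMU code; barrier() is assumed to be the usual empty asm
statement with a "memory" clobber) of why a compiler barrier is enough
here: the goal is only to make the bounds check operate on a single local
copy of a value read from guest-writable memory, not to order the access
against other CPUs.

#include <stdint.h>

/* Assumption: equivalent of QEMU's barrier(), a compiler-only fence. */
#define barrier() __asm__ __volatile__("" ::: "memory")

/* 'shared_next' points into memory the guest may modify concurrently. */
static int read_next(const uint16_t *shared_next, unsigned int max)
{
    uint16_t next = *shared_next;   /* intended: exactly one load */

    /*
     * Without the barrier the compiler may discard the local copy and
     * reload *shared_next after the check below, reopening a
     * time-of-check/time-of-use window.  No CPU fence is needed for
     * that, hence barrier() rather than smp_wmb().
     */
    barrier();

    if (next >= max) {
        return -1;                  /* check acts on the copied value */
    }
    return next;                    /* use the same checked value */
}

int main(void)
{
    uint16_t guest_next = 3;
    return read_next(&guest_next, 8) == 3 ? 0 : 1;
}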




[PATCH] virtio: use shadow_avail_idx while checking number of heads

2023-08-25 Thread Ilya Maximets
We do not need the most up-to-date number of heads; we only want to
know if there is at least one.

Use the shadow variable as long as it is not equal to the last available
index checked.  This avoids the expensive qatomic dereference of the
RCU-protected memory region cache as well as the memory access itself
and the subsequent memory barrier.

The change improves performance of the af-xdp network backend by 2-3%.

Signed-off-by: Ilya Maximets 
---
 hw/virtio/virtio.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 309038fd46..04bf7cc977 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -999,7 +999,15 @@ void virtqueue_push(VirtQueue *vq, const VirtQueueElement 
*elem,
 /* Called within rcu_read_lock().  */
 static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx)
 {
-uint16_t num_heads = vring_avail_idx(vq) - idx;
+uint16_t num_heads;
+
+if (vq->shadow_avail_idx != idx) {
+num_heads = vq->shadow_avail_idx - idx;
+
+return num_heads;
+}
+
+num_heads = vring_avail_idx(vq) - idx;
 
 /* Check it isn't doing very strange things with descriptor numbers. */
 if (num_heads > vq->vring.num) {
-- 
2.40.1
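
For readers less familiar with the shadow index, here is a hedged
stand-alone sketch of the caching pattern the patch applies (illustrative
names, not QEMU's): the expensive read of the shared index is only
performed when the locally cached copy cannot already prove that at least
one entry is pending.

#include <stdint.h>
#include <stdio.h>
#include <stdatomic.h>

typedef struct {
    uint16_t shadow_avail_idx;    /* cached copy of the producer index */
    uint16_t last_avail_idx;      /* last entry this side consumed */
} RingView;

/* Stands in for vring_avail_idx(): an ordered read of guest-shared memory. */
static uint16_t refresh_avail_idx(RingView *rv, _Atomic uint16_t *shared)
{
    rv->shadow_avail_idx = atomic_load_explicit(shared, memory_order_acquire);
    return rv->shadow_avail_idx;
}

static uint16_t num_heads(RingView *rv, _Atomic uint16_t *shared)
{
    if (rv->shadow_avail_idx != rv->last_avail_idx) {
        /* The cached index already proves there is at least one entry. */
        return rv->shadow_avail_idx - rv->last_avail_idx;
    }
    /* Cache says "possibly empty": pay for the shared-memory read. */
    return refresh_avail_idx(rv, shared) - rv->last_avail_idx;
}

int main(void)
{
    _Atomic uint16_t guest_avail_idx = 5;
    RingView rv = { .shadow_avail_idx = 5, .last_avail_idx = 3 };

    /* Prints 2 without touching the (simulated) shared memory. */
    printf("%u heads\n", (unsigned int)num_heads(&rv, &guest_avail_idx));
    return 0;
}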




Re: [Qemu-devel] [v5, 09/31] vhost: fix calling vhost_dev_cleanup() after vhost_dev_init()

2016-07-25 Thread Ilya Maximets
On 21.07.2016 11:57, Marc-André Lureau wrote:
> From: Marc-André Lureau 
> 
> vhost_net_init() calls vhost_dev_init() and in case of failure, calls
> vhost_dev_cleanup() directly. However, the structure is already
> partially cleaned on error. Calling vhost_dev_cleanup() again will call
> vhost_virtqueue_cleanup() on already clean queues, and causing potential
> double-close. Instead, adjust dev->nvqs and simplify vhost_dev_init()
> code to not call vhost_virtqueue_cleanup() but vhost_dev_cleanup()
> instead.
> 
> Signed-off-by: Marc-André Lureau 
> ---
>  hw/virtio/vhost.c | 13 -
>  1 file changed, 4 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index 9400b47..c61302a 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -1047,7 +1047,8 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
>  for (i = 0; i < hdev->nvqs; ++i) {
>  r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
>  if (r < 0) {
> -goto fail_vq;
> +hdev->nvqs = i;
> +goto fail;
>  }
>  }
>  
> @@ -1104,19 +1105,13 @@ int vhost_dev_init(struct vhost_dev *hdev, void 
> *opaque,
>  memory_listener_register(&hdev->memory_listener, &address_space_memory);
>  QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
>  return 0;
> +
>  fail_busyloop:
>  while (--i >= 0) {
>  vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
>  }
> -i = hdev->nvqs;
> -fail_vq:
> -while (--i >= 0) {
> -vhost_virtqueue_cleanup(hdev->vqs + i);
> -}
>  fail:
> -r = -errno;
> -hdev->vhost_ops->vhost_backend_cleanup(hdev);
> -QLIST_REMOVE(hdev, entry);
> +vhost_dev_cleanup(hdev);
>  return r;
>  }
>  
> 

This patch introduces closing of fd 0 on backend init failure, or on any
other error before the virtqueue_init loop, because
'vhost_virtqueue_cleanup()' gets called on not-yet-initialized virtqueues.

I'm suggesting the following fixup:

--
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 6175d8b..d7428c5 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1038,8 +1038,9 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
VhostBackendType backend_type, uint32_t busyloop_timeout)
 {
 uint64_t features;
-int i, r;
+int i, r, n_initialized_vqs;
 
+n_initialized_vqs = 0;
 hdev->migration_blocker = NULL;
 
 r = vhost_set_backend_type(hdev, backend_type);

@@ -1069,10 +1071,9 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
 goto fail;
 }
 
-for (i = 0; i < hdev->nvqs; ++i) {
+for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
 r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
 if (r < 0) {
-hdev->nvqs = i;
 goto fail;
 }
 }
@@ -1136,6 +1137,7 @@ fail_busyloop:
     vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
 }
 fail:
+hdev->nvqs = n_initialized_vqs;
 vhost_dev_cleanup(hdev);
 return r;
 }
--

Best regards, Ilya Maximets.
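
For readers following the fixup above, here is a generic, self-contained
sketch of the pattern it relies on (hypothetical names, not the QEMU code):
count how many sub-objects were actually initialized and let the single
shared cleanup path act on that count, instead of patching the count at
every failure site.

#include <stddef.h>
#include <stdio.h>

enum { NUM_VQS = 4 };

struct vq { int initialized; };

struct dev {
    struct vq vqs[NUM_VQS];
    size_t nvqs;                  /* how many entries cleanup may touch */
};

static int vq_init(struct vq *q, size_t i)
{
    if (i == 2) {                 /* simulate a failure on the third vq */
        return -1;
    }
    q->initialized = 1;
    return 0;
}

static void dev_cleanup(struct dev *d)
{
    /* Only ever touches virtqueues that were really initialized. */
    for (size_t i = 0; i < d->nvqs; i++) {
        d->vqs[i].initialized = 0;
    }
    d->nvqs = 0;
}

static int dev_init(struct dev *d)
{
    size_t n_initialized_vqs = 0;
    int r = 0;

    for (size_t i = 0; i < NUM_VQS; i++, n_initialized_vqs++) {
        r = vq_init(&d->vqs[i], i);
        if (r < 0) {
            break;
        }
    }

    /* Record how far we got; the error path below stays trivial. */
    d->nvqs = n_initialized_vqs;
    if (r < 0) {
        dev_cleanup(d);
        return r;
    }
    return 0;
}

int main(void)
{
    struct dev d = { .nvqs = 0 };
    int r = dev_init(&d);

    printf("dev_init: %d, nvqs after cleanup: %zu\n", r, d.nvqs);
    return 0;
}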




Re: [Qemu-devel] [v5, 17/31] vhost-user: keep vhost_net after a disconnection

2016-07-25 Thread Ilya Maximets
;
> +}
> +vhost_user_stop(i, ncs);
>  return -1;
>  }
>  
> @@ -150,6 +145,7 @@ static void vhost_user_cleanup(NetClientState *nc)
>  
>  if (s->vhost_net) {
>  vhost_net_cleanup(s->vhost_net);
> +g_free(s->vhost_net);
>  s->vhost_net = NULL;
>  }
>  if (s->chr) {
> 

Patch number 7 of this patch set introduced a 'memset()' inside
'vhost_dev_cleanup()' which clears the 'acked_features' field of the
'vhost_dev' structure. As a result, this patch ends up using an
already-zeroed value of that field when restoring features after
reconnection.

You should either not remove 'acked_features' from 'struct VhostUserState',
or avoid clearing this field in 'vhost_dev'.

I'm suggesting the following fixup (mainly just a partial revert):
--
diff --git a/net/vhost-user.c b/net/vhost-user.c
index 60fab9a..3100a5e 100644
--- a/net/vhost-user.c
+++ b/net/vhost-user.c
@@ -23,6 +23,7 @@ typedef struct VhostUserState {
 CharDriverState *chr;
 VHostNetState *vhost_net;
 guint watch;
+uint64_t acked_features;
 } VhostUserState;
 
 typedef struct VhostUserChardevProps {
@@ -41,7 +42,7 @@ uint64_t vhost_user_get_acked_features(NetClientState *nc)
 {
 VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc);
 assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_USER);
-return s->vhost_net ? vhost_net_get_acked_features(s->vhost_net) : 0;
+return s->acked_features;
 }
 
 static void vhost_user_stop(int queues, NetClientState *ncs[])
@@ -55,6 +56,11 @@ static void vhost_user_stop(int queues, NetClientState 
*ncs[])
 s = DO_UPCAST(VhostUserState, nc, ncs[i]);
 
 if (s->vhost_net) {
+/* save acked features */
+uint64_t features = vhost_net_get_acked_features(s->vhost_net);
+if (features) {
+s->acked_features = features;
+}
 vhost_net_cleanup(s->vhost_net);
 }
 }
--

Best regards, Ilya Maximets.
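
A generic sketch of the idea behind this fixup (hypothetical names, not the
QEMU code): any state that must survive the backend's cleanup has to be
snapshotted into the longer-lived object before the cleanup zeroes the
short-lived one.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct backend {                  /* zeroed/freed on disconnect */
    uint64_t acked_features;
};

struct client {                   /* lives across reconnections */
    struct backend *be;
    uint64_t acked_features;      /* saved copy used for later restore */
};

static void client_disconnect(struct client *c)
{
    if (c->be) {
        /* Snapshot before cleanup: after the memset below the value
         * kept in the backend is gone for good. */
        if (c->be->acked_features) {
            c->acked_features = c->be->acked_features;
        }
        memset(c->be, 0, sizeof(*c->be)); /* stands in for vhost_dev_cleanup() */
        c->be = NULL;
    }
}

int main(void)
{
    struct backend be = { .acked_features = 0x3 };
    struct client c = { .be = &be };

    client_disconnect(&c);
    printf("features to restore: 0x%llx\n",
           (unsigned long long)c.acked_features);
    return 0;
}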




Re: [Qemu-devel] [v5, 09/31] vhost: fix calling vhost_dev_cleanup() after vhost_dev_init()

2016-07-25 Thread Ilya Maximets
On 25.07.2016 15:45, Marc-André Lureau wrote:
> Hi
> 
> - Original Message -
>> On 21.07.2016 11:57, Marc-André Lureau wrote:
>>> From: Marc-André Lureau 
>>>
>>> vhost_net_init() calls vhost_dev_init() and in case of failure, calls
>>> vhost_dev_cleanup() directly. However, the structure is already
>>> partially cleaned on error. Calling vhost_dev_cleanup() again will call
>>> vhost_virtqueue_cleanup() on already clean queues, and causing potential
>>> double-close. Instead, adjust dev->nvqs and simplify vhost_dev_init()
>>> code to not call vhost_virtqueue_cleanup() but vhost_dev_cleanup()
>>> instead.
>>>
>>> Signed-off-by: Marc-André Lureau 
>>> ---
>>>  hw/virtio/vhost.c | 13 -
>>>  1 file changed, 4 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>>> index 9400b47..c61302a 100644
>>> --- a/hw/virtio/vhost.c
>>> +++ b/hw/virtio/vhost.c
>>> @@ -1047,7 +1047,8 @@ int vhost_dev_init(struct vhost_dev *hdev, void
>>> *opaque,
>>>  for (i = 0; i < hdev->nvqs; ++i) {
>>>  r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
>>>  if (r < 0) {
>>> -goto fail_vq;
>>> +hdev->nvqs = i;
>>> +goto fail;
>>>  }
>>>  }
>>>  
>>> @@ -1104,19 +1105,13 @@ int vhost_dev_init(struct vhost_dev *hdev, void
>>> *opaque,
>>>  memory_listener_register(&hdev->memory_listener,
>>>  &address_space_memory);
>>>  QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
>>>  return 0;
>>> +
>>>  fail_busyloop:
>>>  while (--i >= 0) {
>>>  vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
>>>  }
>>> -i = hdev->nvqs;
>>> -fail_vq:
>>> -while (--i >= 0) {
>>> -vhost_virtqueue_cleanup(hdev->vqs + i);
>>> -}
>>>  fail:
>>> -r = -errno;
>>> -hdev->vhost_ops->vhost_backend_cleanup(hdev);
>>> -QLIST_REMOVE(hdev, entry);
>>> +vhost_dev_cleanup(hdev);
>>>  return r;
>>>  }
>>>  
>>>
>>
>> This patch introduces closing of zero fd on backend init failure or any
>> other error before virtqueue_init loop because of calling
>> 'vhost_virtqueue_cleanup()' on not initialized virtqueues.
>>
>> I'm suggesting following fixup:
>>
>> --
>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>> index 6175d8b..d7428c5 100644
>> --- a/hw/virtio/vhost.c
>> +++ b/hw/virtio/vhost.c
>> @@ -1038,8 +1038,9 @@ int vhost_dev_init(struct vhost_dev *hdev, void
>> *opaque,
>> VhostBackendType backend_type, uint32_t busyloop_timeout)
>>  {
>>  uint64_t features;
>> -int i, r;
>> +int i, r, n_initialized_vqs;
>>  
>> +n_initialized_vqs = 0;
>>  hdev->migration_blocker = NULL;
>>  
>>  r = vhost_set_backend_type(hdev, backend_type);
>>
>> @@ -1069,10 +1071,9 @@ int vhost_dev_init(struct vhost_dev *hdev, void
>> *opaque,
>>  goto fail;
>>  }
>>  
>> -for (i = 0; i < hdev->nvqs; ++i) {
>> +for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
>>  r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
>>  if (r < 0) {
>> -hdev->nvqs = i;
> 
> Isn't that assignment doing the same thing?

Yes.
But an assignment to zero (hdev->nvqs = 0) would be required before all
previous 'goto fail;' instructions. I don't think that's a clean solution.

> btw, thanks for the review
> 
>>  goto fail;
>>  }
>>  }
>> @@ -1136,6 +1137,7 @@ fail_busyloop:
>>  vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
>>  }
>>  fail:
>> +hdev->nvqs = n_initialized_vqs;
>>  vhost_dev_cleanup(hdev);
>>  return r;
>>  }
>> --
>>
>> Best regards, Ilya Maximets.
>>
> 
> 




Re: [Qemu-devel] [v5, 09/31] vhost: fix calling vhost_dev_cleanup() after vhost_dev_init()

2016-07-25 Thread Ilya Maximets
On 25.07.2016 16:05, Marc-André Lureau wrote:
> Hi
> 
> - Original Message -
>> On 25.07.2016 15:45, Marc-André Lureau wrote:
>>> Hi
>>>
>>> - Original Message -
>>>> On 21.07.2016 11:57, Marc-André Lureau wrote:
>>>>> From: Marc-André Lureau 
>>>>>
>>>>> vhost_net_init() calls vhost_dev_init() and in case of failure, calls
>>>>> vhost_dev_cleanup() directly. However, the structure is already
>>>>> partially cleaned on error. Calling vhost_dev_cleanup() again will call
>>>>> vhost_virtqueue_cleanup() on already clean queues, and causing potential
>>>>> double-close. Instead, adjust dev->nvqs and simplify vhost_dev_init()
>>>>> code to not call vhost_virtqueue_cleanup() but vhost_dev_cleanup()
>>>>> instead.
>>>>>
>>>>> Signed-off-by: Marc-André Lureau 
>>>>> ---
>>>>>  hw/virtio/vhost.c | 13 -
>>>>>  1 file changed, 4 insertions(+), 9 deletions(-)
>>>>>
>>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>>>>> index 9400b47..c61302a 100644
>>>>> --- a/hw/virtio/vhost.c
>>>>> +++ b/hw/virtio/vhost.c
>>>>> @@ -1047,7 +1047,8 @@ int vhost_dev_init(struct vhost_dev *hdev, void
>>>>> *opaque,
>>>>>  for (i = 0; i < hdev->nvqs; ++i) {
>>>>>  r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index +
>>>>>  i);
>>>>>  if (r < 0) {
>>>>> -goto fail_vq;
>>>>> +hdev->nvqs = i;
>>>>> +goto fail;
>>>>>  }
>>>>>  }
>>>>>  
>>>>> @@ -1104,19 +1105,13 @@ int vhost_dev_init(struct vhost_dev *hdev, void
>>>>> *opaque,
>>>>>  memory_listener_register(&hdev->memory_listener,
>>>>>  &address_space_memory);
>>>>>  QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
>>>>>  return 0;
>>>>> +
>>>>>  fail_busyloop:
>>>>>  while (--i >= 0) {
>>>>>  vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
>>>>>  0);
>>>>>  }
>>>>> -i = hdev->nvqs;
>>>>> -fail_vq:
>>>>> -while (--i >= 0) {
>>>>> -vhost_virtqueue_cleanup(hdev->vqs + i);
>>>>> -}
>>>>>  fail:
>>>>> -r = -errno;
>>>>> -hdev->vhost_ops->vhost_backend_cleanup(hdev);
>>>>> -QLIST_REMOVE(hdev, entry);
>>>>> +vhost_dev_cleanup(hdev);
>>>>>  return r;
>>>>>  }
>>>>>  
>>>>>
>>>>
>>>> This patch introduces closing of zero fd on backend init failure or any
>>>> other error before virtqueue_init loop because of calling
>>>> 'vhost_virtqueue_cleanup()' on not initialized virtqueues.
>>>>
>>>> I'm suggesting following fixup:
>>>>
>>>> --
>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>>>> index 6175d8b..d7428c5 100644
>>>> --- a/hw/virtio/vhost.c
>>>> +++ b/hw/virtio/vhost.c
>>>> @@ -1038,8 +1038,9 @@ int vhost_dev_init(struct vhost_dev *hdev, void
>>>> *opaque,
>>>> VhostBackendType backend_type, uint32_t
>>>> busyloop_timeout)
>>>>  {
>>>>  uint64_t features;
>>>> -int i, r;
>>>> +int i, r, n_initialized_vqs;
>>>>  
>>>> +n_initialized_vqs = 0;
>>>>  hdev->migration_blocker = NULL;
>>>>  
>>>>  r = vhost_set_backend_type(hdev, backend_type);
>>>>
>>>> @@ -1069,10 +1071,9 @@ int vhost_dev_init(struct vhost_dev *hdev, void
>>>> *opaque,
>>>>  goto fail;
>>>>  }
>>>>  
>>>> -for (i = 0; i < hdev->nvqs; ++i) {
>>>> +for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
>>>>  r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index +
>>>>  i);
>>>>  if (r < 0) {
>>>> -hdev->nvqs = i;
>>>
>>> Isn't that assignment doing the same thing?
>>
>> Yes.
>> But assignment to zero (hdev->nvqs = 0) required before all previous
>> 'goto fail;' instructions. I think, it's not a clean solution.
>>
> 
> Good point, I'll squash your change,

Thanks for fixing it.

> should I add your sign-off-by?

I don't mind if you want to.

Best regards, Ilya Maximets.




[Qemu-devel] [PATCH] vhost: check for vhost_ops before using.

2016-08-02 Thread Ilya Maximets
'vhost_set_vring_enable()' tries to call a function through the
'vhost_ops' pointer, which may already have been zeroed by
'vhost_dev_cleanup()' during vhost disconnection.

Fix that by checking 'vhost_ops' before using it. This fixes a QEMU crash
when calling 'ethtool -L eth0 combined 2' while vhost is disconnected.

Signed-off-by: Ilya Maximets 
---
 hw/net/vhost_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index dc61dc1..f2d49ad 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -428,7 +428,7 @@ int vhost_set_vring_enable(NetClientState *nc, int enable)
 
 nc->vring_enable = enable;
 
-if (vhost_ops->vhost_set_vring_enable) {
+if (vhost_ops && vhost_ops->vhost_set_vring_enable) {
 return vhost_ops->vhost_set_vring_enable(&net->dev, enable);
 }
 
-- 
2.7.4




[Qemu-devel] [PATCH] vhost: fix crash on virtio_error while device stop

2017-12-06 Thread Ilya Maximets
If a virtio error occurs after vhost_dev_close(), QEMU will crash in the
nested cleanup while checking the IOMMU flag, because dev->vdev is already
set to zero and resources are already freed.

Example:

Program received signal SIGSEGV, Segmentation fault.
vhost_virtqueue_stop at hw/virtio/vhost.c:1155

#0  vhost_virtqueue_stop at hw/virtio/vhost.c:1155
#1  vhost_dev_stop at hw/virtio/vhost.c:1594
#2  vhost_net_stop_one at hw/net/vhost_net.c:289
#3  vhost_net_stop at hw/net/vhost_net.c:368

Nested call to vhost_net_stop(). First time was at #14.

#4  virtio_net_vhost_status at hw/net/virtio-net.c:180
#5  virtio_net_set_status (status=79) at hw/net/virtio-net.c:254
#6  virtio_set_status at hw/virtio/virtio.c:1146
#7  virtio_error at hw/virtio/virtio.c:2455

virtqueue_get_head() failed here.

#8  virtqueue_get_head at hw/virtio/virtio.c:543
#9  virtqueue_drop_all at hw/virtio/virtio.c:984
#10 virtio_net_drop_tx_queue_data at hw/net/virtio-net.c:240
#11 virtio_bus_set_host_notifier at hw/virtio/virtio-bus.c:297
#12 vhost_dev_disable_notifiers at hw/virtio/vhost.c:1431

vhost_dev_stop() was executed here. dev->vdev == NULL now.

#13 vhost_net_stop_one at hw/net/vhost_net.c:290
#14 vhost_net_stop at hw/net/vhost_net.c:368
#15 virtio_net_vhost_status at hw/net/virtio-net.c:180
#16 virtio_net_set_status (status=15) at hw/net/virtio-net.c:254
#17 qmp_set_link ("hostnet0", up=false) at net/net.c:1430
#18 chr_closed_bh at net/vhost-user.c:214
#19 aio_bh_call at util/async.c:90
#20 aio_bh_poll at util/async.c:118
#21 aio_dispatch at util/aio-posix.c:429
#22 aio_ctx_dispatch at util/async.c:261
#23 g_main_context_dispatch
#24 glib_pollfds_poll at util/main-loop.c:213
#25 os_host_main_loop_wait at util/main-loop.c:261
#26 main_loop_wait at util/main-loop.c:515
#27 main_loop () at vl.c:1917
#28 main at vl.c:4795

Above backtrace captured from qemu crash on vhost disconnect while
virtio driver in guest was in failed state.

We could just add a check for 'vdev' in 'vhost_dev_has_iommu()', but it
would then assert further down while trying to free already-freed
ioeventfds. The real problem is that we're allowing nested calls to
'vhost_net_stop'.

This patch forbids nested calls to 'vhost_net_stop' by setting the
'vhost_started' flag to zero prior to the 'vhost_net_stop' call, avoiding
possible double frees and segmentation faults due to use of already-freed
resources.

Signed-off-by: Ilya Maximets 
---

This issue was already addressed more than a year ago by the following
patch: https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06732.html
but it was dropped without review because vhost-user re-connection was not
yet implemented. The re-connection implementation later fixed most of the
nested calls, but a few of them still exist. For example, the above backtrace
was captured after a 'virtqueue_get_head()' failure on vhost-user
disconnection.

 hw/net/virtio-net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 38674b0..4d95a18 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -177,8 +177,8 @@ static void virtio_net_vhost_status(VirtIONet *n, uint8_t 
status)
 n->vhost_started = 0;
 }
 } else {
-vhost_net_stop(vdev, n->nic->ncs, queues);
 n->vhost_started = 0;
+vhost_net_stop(vdev, n->nic->ncs, queues);
 }
 }
 
-- 
2.7.4
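
As a side note, here is a minimal sketch of the re-entrancy guard this
one-line move implements (generic names, not the QEMU code): the running
flag is flipped before teardown starts, so a nested status update triggered
from inside the teardown path sees the device as already stopped and
returns early.

#include <stdbool.h>
#include <stdio.h>

struct netdev {
    bool started;
    int  stop_calls;
};

static void net_stop(struct netdev *d);

static void set_running(struct netdev *d, bool run)
{
    if (run == d->started) {
        return;                   /* nested call: already in target state */
    }
    if (run) {
        d->started = true;        /* start path omitted in this sketch */
    } else {
        d->started = false;       /* flip the flag first ...            */
        net_stop(d);              /* ... so re-entry becomes a no-op    */
    }
}

static void net_stop(struct netdev *d)
{
    d->stop_calls++;
    /* Teardown may bounce back into set_running(d, false), e.g. via an
     * error path; with the flag already cleared that call returns early. */
    set_running(d, false);
}

int main(void)
{
    struct netdev d = { .started = true };

    set_running(&d, false);
    printf("net_stop called %d time(s)\n", d.stop_calls);  /* prints 1 */
    return 0;
}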




Re: [Qemu-devel] [PATCH] vhost: fix crash on virtio_error while device stop

2017-12-06 Thread Ilya Maximets
On 06.12.2017 19:45, Michael S. Tsirkin wrote:
> On Wed, Dec 06, 2017 at 04:06:18PM +0300, Ilya Maximets wrote:
>> In case virtio error occured after vhost_dev_close(), qemu will crash
>> in nested cleanup while checking IOMMU flag because dev->vdev already
>> set to zero and resources are already freed.
>>
>> Example:
>>
>> Program received signal SIGSEGV, Segmentation fault.
>> vhost_virtqueue_stop at hw/virtio/vhost.c:1155
>>
>> #0  vhost_virtqueue_stop at hw/virtio/vhost.c:1155
>> #1  vhost_dev_stop at hw/virtio/vhost.c:1594
>> #2  vhost_net_stop_one at hw/net/vhost_net.c:289
>> #3  vhost_net_stop at hw/net/vhost_net.c:368
>>
>> Nested call to vhost_net_stop(). First time was at #14.
>>
>> #4  virtio_net_vhost_status at hw/net/virtio-net.c:180
>> #5  virtio_net_set_status (status=79) at hw/net/virtio-net.c:254
>> #6  virtio_set_status at hw/virtio/virtio.c:1146
>> #7  virtio_error at hw/virtio/virtio.c:2455
>>
>> virtqueue_get_head() failed here.
>>
>> #8  virtqueue_get_head at hw/virtio/virtio.c:543
>> #9  virtqueue_drop_all at hw/virtio/virtio.c:984
>> #10 virtio_net_drop_tx_queue_data at hw/net/virtio-net.c:240
>> #11 virtio_bus_set_host_notifier at hw/virtio/virtio-bus.c:297
>> #12 vhost_dev_disable_notifiers at hw/virtio/vhost.c:1431
>>
>> vhost_dev_stop() was executed here. dev->vdev == NULL now.
>>
>> #13 vhost_net_stop_one at hw/net/vhost_net.c:290
>> #14 vhost_net_stop at hw/net/vhost_net.c:368
>> #15 virtio_net_vhost_status at hw/net/virtio-net.c:180
>> #16 virtio_net_set_status (status=15) at hw/net/virtio-net.c:254
>> #17 qmp_set_link ("hostnet0", up=false) at net/net.c:1430
>> #18 chr_closed_bh at net/vhost-user.c:214
>> #19 aio_bh_call at util/async.c:90
>> #20 aio_bh_poll at util/async.c:118
>> #21 aio_dispatch at util/aio-posix.c:429
>> #22 aio_ctx_dispatch at util/async.c:261
>> #23 g_main_context_dispatch
>> #24 glib_pollfds_poll at util/main-loop.c:213
>> #25 os_host_main_loop_wait at util/main-loop.c:261
>> #26 main_loop_wait at util/main-loop.c:515
>> #27 main_loop () at vl.c:1917
>> #28 main at vl.c:4795
>>
>> Above backtrace captured from qemu crash on vhost disconnect while
>> virtio driver in guest was in failed state.
>>
>> We can just add checking for 'vdev' in 'vhost_dev_has_iommu()' but
>> it will assert further trying to free already freed ioeventfds. The
>> real problem is that we're allowing nested calls to 'vhost_net_stop'.
>>
>> This patch is aimed to forbid nested calls to 'vhost_net_stop' to avoid
>> any possible double frees and segmentation faults doue to using of
>> already freed resources by setting 'vhost_started' flag to zero prior
>> to 'vhost_net_stop' call.
>>
>> Signed-off-by: Ilya Maximets 
>> ---
>>
>> This issue was already addressed more than a year ago by the following
>> patch: https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06732.html
>> but it was dropped without review due to not yet implemented re-connection
>> of vhost-user. Re-connection implementation lately fixed most of the
>> nested calls, but few of them still exists. For example, above backtrace
>> captured after 'virtqueue_get_head()' failure on vhost-user disconnection.
>>
>>  hw/net/virtio-net.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
>> index 38674b0..4d95a18 100644
>> --- a/hw/net/virtio-net.c
>> +++ b/hw/net/virtio-net.c
>> @@ -177,8 +177,8 @@ static void virtio_net_vhost_status(VirtIONet *n, 
>> uint8_t status)
>>  n->vhost_started = 0;
>>  }
>>  } else {
>> -vhost_net_stop(vdev, n->nic->ncs, queues);
>>  n->vhost_started = 0;
>> +vhost_net_stop(vdev, n->nic->ncs, queues);
>>  }
>>  }
> 
> Well the wider context is
> 
> 
> n->vhost_started = 1;
> r = vhost_net_start(vdev, n->nic->ncs, queues);
> if (r < 0) {
> error_report("unable to start vhost net: %d: "
>  "falling back on userspace virtio", -r);
> n->vhost_started = 0;
> }
> } else {
> vhost_net_stop(vdev, n->nic->ncs, queues);
> n->vhost_started = 0;
> 
> So we set it to 1 before start, we should clear after stop.

OK. I agree that clearing it afterwards is a bit safer. But in that case we
need a separate flag or some other way to detect that we're already inside
'vhost_net_stop()'.

What do you think about that old patch:
https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06732.html  ?

It implements the same thing but introduces an additional flag. It might
even still be applicable.



Re: [Qemu-devel] [PATCH] vhost: fix crash on virtio_error while device stop

2017-12-08 Thread Ilya Maximets
On 07.12.2017 20:27, Michael S. Tsirkin wrote:
> On Thu, Dec 07, 2017 at 09:39:36AM +0300, Ilya Maximets wrote:
>> On 06.12.2017 19:45, Michael S. Tsirkin wrote:
>>> On Wed, Dec 06, 2017 at 04:06:18PM +0300, Ilya Maximets wrote:
>>>> In case virtio error occured after vhost_dev_close(), qemu will crash
>>>> in nested cleanup while checking IOMMU flag because dev->vdev already
>>>> set to zero and resources are already freed.
>>>>
>>>> Example:
>>>>
>>>> Program received signal SIGSEGV, Segmentation fault.
>>>> vhost_virtqueue_stop at hw/virtio/vhost.c:1155
>>>>
>>>> #0  vhost_virtqueue_stop at hw/virtio/vhost.c:1155
>>>> #1  vhost_dev_stop at hw/virtio/vhost.c:1594
>>>> #2  vhost_net_stop_one at hw/net/vhost_net.c:289
>>>> #3  vhost_net_stop at hw/net/vhost_net.c:368
>>>>
>>>> Nested call to vhost_net_stop(). First time was at #14.
>>>>
>>>> #4  virtio_net_vhost_status at hw/net/virtio-net.c:180
>>>> #5  virtio_net_set_status (status=79) at hw/net/virtio-net.c:254
>>>> #6  virtio_set_status at hw/virtio/virtio.c:1146
>>>> #7  virtio_error at hw/virtio/virtio.c:2455
>>>>
>>>> virtqueue_get_head() failed here.
>>>>
>>>> #8  virtqueue_get_head at hw/virtio/virtio.c:543
>>>> #9  virtqueue_drop_all at hw/virtio/virtio.c:984
>>>> #10 virtio_net_drop_tx_queue_data at hw/net/virtio-net.c:240
>>>> #11 virtio_bus_set_host_notifier at hw/virtio/virtio-bus.c:297
>>>> #12 vhost_dev_disable_notifiers at hw/virtio/vhost.c:1431
>>>>
>>>> vhost_dev_stop() was executed here. dev->vdev == NULL now.
>>>>
>>>> #13 vhost_net_stop_one at hw/net/vhost_net.c:290
>>>> #14 vhost_net_stop at hw/net/vhost_net.c:368
>>>> #15 virtio_net_vhost_status at hw/net/virtio-net.c:180
>>>> #16 virtio_net_set_status (status=15) at hw/net/virtio-net.c:254
>>>> #17 qmp_set_link ("hostnet0", up=false) at net/net.c:1430
>>>> #18 chr_closed_bh at net/vhost-user.c:214
>>>> #19 aio_bh_call at util/async.c:90
>>>> #20 aio_bh_poll at util/async.c:118
>>>> #21 aio_dispatch at util/aio-posix.c:429
>>>> #22 aio_ctx_dispatch at util/async.c:261
>>>> #23 g_main_context_dispatch
>>>> #24 glib_pollfds_poll at util/main-loop.c:213
>>>> #25 os_host_main_loop_wait at util/main-loop.c:261
>>>> #26 main_loop_wait at util/main-loop.c:515
>>>> #27 main_loop () at vl.c:1917
>>>> #28 main at vl.c:4795
>>>>
>>>> Above backtrace captured from qemu crash on vhost disconnect while
>>>> virtio driver in guest was in failed state.
>>>>
>>>> We can just add checking for 'vdev' in 'vhost_dev_has_iommu()' but
>>>> it will assert further trying to free already freed ioeventfds. The
>>>> real problem is that we're allowing nested calls to 'vhost_net_stop'.
>>>>
>>>> This patch is aimed to forbid nested calls to 'vhost_net_stop' to avoid
>>>> any possible double frees and segmentation faults doue to using of
>>>> already freed resources by setting 'vhost_started' flag to zero prior
>>>> to 'vhost_net_stop' call.
>>>>
>>>> Signed-off-by: Ilya Maximets 
>>>> ---
>>>>
>>>> This issue was already addressed more than a year ago by the following
>>>> patch: https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06732.html
>>>> but it was dropped without review due to not yet implemented re-connection
>>>> of vhost-user. Re-connection implementation lately fixed most of the
>>>> nested calls, but few of them still exists. For example, above backtrace
>>>> captured after 'virtqueue_get_head()' failure on vhost-user disconnection.
>>>>
>>>>  hw/net/virtio-net.c | 2 +-
>>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>>
>>>> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
>>>> index 38674b0..4d95a18 100644
>>>> --- a/hw/net/virtio-net.c
>>>> +++ b/hw/net/virtio-net.c
>>>> @@ -177,8 +177,8 @@ static void virtio_net_vhost_status(VirtIONet *n, 
>>>> uint8_t status)
>>>>  n->

Re: [Qemu-devel] [PATCH] vhost: fix crash on virtio_error while device stop

2017-12-13 Thread Ilya Maximets
On 11.12.2017 07:35, Michael S. Tsirkin wrote:
> On Fri, Dec 08, 2017 at 05:54:18PM +0300, Ilya Maximets wrote:
>> On 07.12.2017 20:27, Michael S. Tsirkin wrote:
>>> On Thu, Dec 07, 2017 at 09:39:36AM +0300, Ilya Maximets wrote:
>>>> On 06.12.2017 19:45, Michael S. Tsirkin wrote:
>>>>> On Wed, Dec 06, 2017 at 04:06:18PM +0300, Ilya Maximets wrote:
>>>>>> In case virtio error occured after vhost_dev_close(), qemu will crash
>>>>>> in nested cleanup while checking IOMMU flag because dev->vdev already
>>>>>> set to zero and resources are already freed.
>>>>>>
>>>>>> Example:
>>>>>>
>>>>>> Program received signal SIGSEGV, Segmentation fault.
>>>>>> vhost_virtqueue_stop at hw/virtio/vhost.c:1155
>>>>>>
>>>>>> #0  vhost_virtqueue_stop at hw/virtio/vhost.c:1155
>>>>>> #1  vhost_dev_stop at hw/virtio/vhost.c:1594
>>>>>> #2  vhost_net_stop_one at hw/net/vhost_net.c:289
>>>>>> #3  vhost_net_stop at hw/net/vhost_net.c:368
>>>>>>
>>>>>> Nested call to vhost_net_stop(). First time was at #14.
>>>>>>
>>>>>> #4  virtio_net_vhost_status at hw/net/virtio-net.c:180
>>>>>> #5  virtio_net_set_status (status=79) at hw/net/virtio-net.c:254
>>>>>> #6  virtio_set_status at hw/virtio/virtio.c:1146
>>>>>> #7  virtio_error at hw/virtio/virtio.c:2455
>>>>>>
>>>>>> virtqueue_get_head() failed here.
>>>>>>
>>>>>> #8  virtqueue_get_head at hw/virtio/virtio.c:543
>>>>>> #9  virtqueue_drop_all at hw/virtio/virtio.c:984
>>>>>> #10 virtio_net_drop_tx_queue_data at hw/net/virtio-net.c:240
>>>>>> #11 virtio_bus_set_host_notifier at hw/virtio/virtio-bus.c:297
>>>>>> #12 vhost_dev_disable_notifiers at hw/virtio/vhost.c:1431
>>>>>>
>>>>>> vhost_dev_stop() was executed here. dev->vdev == NULL now.
>>>>>>
>>>>>> #13 vhost_net_stop_one at hw/net/vhost_net.c:290
>>>>>> #14 vhost_net_stop at hw/net/vhost_net.c:368
>>>>>> #15 virtio_net_vhost_status at hw/net/virtio-net.c:180
>>>>>> #16 virtio_net_set_status (status=15) at hw/net/virtio-net.c:254
>>>>>> #17 qmp_set_link ("hostnet0", up=false) at net/net.c:1430
>>>>>> #18 chr_closed_bh at net/vhost-user.c:214
>>>>>> #19 aio_bh_call at util/async.c:90
>>>>>> #20 aio_bh_poll at util/async.c:118
>>>>>> #21 aio_dispatch at util/aio-posix.c:429
>>>>>> #22 aio_ctx_dispatch at util/async.c:261
>>>>>> #23 g_main_context_dispatch
>>>>>> #24 glib_pollfds_poll at util/main-loop.c:213
>>>>>> #25 os_host_main_loop_wait at util/main-loop.c:261
>>>>>> #26 main_loop_wait at util/main-loop.c:515
>>>>>> #27 main_loop () at vl.c:1917
>>>>>> #28 main at vl.c:4795
>>>>>>
>>>>>> Above backtrace captured from qemu crash on vhost disconnect while
>>>>>> virtio driver in guest was in failed state.
>>>>>>
>>>>>> We can just add checking for 'vdev' in 'vhost_dev_has_iommu()' but
>>>>>> it will assert further trying to free already freed ioeventfds. The
>>>>>> real problem is that we're allowing nested calls to 'vhost_net_stop'.
>>>>>>
>>>>>> This patch is aimed to forbid nested calls to 'vhost_net_stop' to avoid
>>>>>> any possible double frees and segmentation faults doue to using of
>>>>>> already freed resources by setting 'vhost_started' flag to zero prior
>>>>>> to 'vhost_net_stop' call.
>>>>>>
>>>>>> Signed-off-by: Ilya Maximets 
>>>>>> ---
>>>>>>
>>>>>> This issue was already addressed more than a year ago by the following
>>>>>> patch: 
>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06732.html
>>>>>> but it was dropped without review due to not yet implemented 
>>>>>> re-connection
>>>>>> of vhost-user. Re-connectio

Re: [Qemu-devel] [PATCH] vhost: fix crash on virtio_error while device stop

2017-12-13 Thread Ilya Maximets
On 13.12.2017 22:48, Michael S. Tsirkin wrote:
> On Wed, Dec 13, 2017 at 04:45:20PM +0300, Ilya Maximets wrote:
>>>> That
>>>> looks very strange. Some of the functions gets 'old_status', others
>>>> the 'new_status'. I'm a bit confused.
>>>
>>> OK, fair enough. Fixed - let's pass old status everywhere,
>>> users that need the new one can get it from the vdev.
>>>
>>>> And it's not functional in current state:
>>>>
>>>> hw/net/virtio-net.c:264:28: error: ‘status’ undeclared
>>>
>>> Fixed too. new version below.
>>
>> This doesn't fix the segmentation fault.
> 
> Hmm you are right. Looking into it.
> 
>> I have exactly same crash stacktrace:
>>
>> #0  vhost_memory_unmap hw/virtio/vhost.c:446
>> #1  vhost_virtqueue_stop hw/virtio/vhost.c:1155
>> #2  vhost_dev_stop hw/virtio/vhost.c:1594
>> #3  vhost_net_stop_one hw/net/vhost_net.c:289
>> #4  vhost_net_stop hw/net/vhost_net.c:368
>> #5  virtio_net_vhost_status (old_status=15 '\017', n=0x5625f3901100) at 
>> hw/net/virtio-net.c:180
>> #6  virtio_net_set_status (vdev=0x5625f3901100, old_status=<optimized out>)
>> at hw/net/virtio-net.c:254
>> #7  virtio_set_status (vdev=vdev@entry=0x5625f3901100, val=<optimized out>)
>> at hw/virtio/virtio.c:1152
>> #8  virtio_error (vdev=0x5625f3901100, fmt=fmt@entry=0x5625f014f688 "Guest 
>> says index %u is available") at hw/virtio/virtio.c:2460
> 
> BTW what is causing this? Why is guest avail index corrupted?

My testing environment for the issue:

* QEMU 2.10.1
* testpmd from a bit outdated DPDK 16.07.0 in guest with uio_pci_generic
* OVS 2.8 with DPDK 17.05.2 on the host.
* 2 vhost-user ports in VM in server mode.
* Testpmd just forwards packets from one port to another with --forward-mode=mac

testpmd with the virtio-net driver sometimes crashes after killing OVS if
heavy traffic is flowing. After restarting OVS and stopping it again, QEMU
crashes on vhost disconnect with a virtqueue_get_head() failure.

So, the sequence is as follows:

1. Start OVS, Start QEMU, start testpmd, start external packet generator.
2. pkill -11 ovs-vswitchd
3. If testpmd not crashed in guest goto step #1.
4. Start OVS (testpmd with virtio driver still in down state)
5. pkill -11 ovs-vswitchd
6. Observe QEMU crash

I suspect that the virtio-net driver from DPDK 16.07.0 corrupts the vrings
just before the crash at step #2.

I actually didn't try to investigate the virtio driver crash because it's a
bit out of my scope, I don't have enough time, and the driver is slightly
outdated.  But the stability of QEMU itself is a really important thing.

One interesting thing is that I cannot reproduce the virtio driver crash with
"virtio: rework set_status callbacks" applied.  I had to break the vrings
manually to reproduce the original nested call of vhost_net_stop().

> 
>> #9  virtqueue_get_head at hw/virtio/virtio.c:543
>> #10 virtqueue_drop_all hw/virtio/virtio.c:984
>> #11 virtio_net_drop_tx_queue_data hw/net/virtio-net.c:240
>> #12 virtio_bus_set_host_notifier at hw/virtio/virtio-bus.c:297
>> #13 vhost_dev_disable_notifiers at hw/virtio/vhost.c:1431
>> #14 vhost_net_stop_one at hw/net/vhost_net.c:290
>> #15 vhost_net_stop at hw/net/vhost_net.c:368
>> #16 virtio_net_vhost_status (old_status=15 '\017', n=0x5625f3901100) at 
>> hw/net/virtio-net.c:180
>> #17 virtio_net_set_status (vdev=0x5625f3901100, old_status=<optimized out>)
>> at hw/net/virtio-net.c:254
>> #18 qmp_set_link at net/net.c:1430
>> #19 chr_closed_bh at net/vhost-user.c:214
>> #20 aio_bh_call at util/async.c:90
>> #21 aio_bh_poll at util/async.c:118
>> #22 aio_dispatch at util/aio-posix.c:429
>> #23 aio_ctx_dispatch at util/async.c:261
>> #24 g_main_context_dispatch () from /lib64/libglib-2.0.so.0
>> #25 glib_pollfds_poll () at util/main-loop.c:213
>> #26 os_host_main_loop_wait at util/main-loop.c:261
>> #27 main_loop_wait at util/main-loop.c:515
>> #28 main_loop () at vl.c:1917
>> #29 main
>>
>>
>> Actually the logic doesn't change. In function  virtio_net_vhost_status():
>>
>> -if ((virtio_net_started(n, status) && !nc->peer->link_down) ==
>> +if ((virtio_net_started(n, vdev->status) && !nc->peer->link_down) ==
>>  !!n->vhost_started) {
>>  return;
>>  }
>>
>> Previously the new 'status' was checked, and now 'vdev->status' is checked
>> instead. It's the same condition, and it doesn't work because the
>> vhost_started flag is still set to 1.
>> Anyway, nc->peer->link_down is true in our case, so there is no differenc

Re: [Qemu-devel] [PATCH] vhost: fix crash on virtio_error while device stop

2017-12-14 Thread Ilya Maximets
One update for the testing scenario:

No need to kill OVS. The issue is reproducible with a simple 'del-port'
and 'add-port'. The virtio driver in the guest can crash on both operations.
Most times, in my case, it crashes on 'add-port' after deletion.

Hi Maxime,
I had already seen the patches below and the original Linux kernel virtio
issue, but had no time to test them until now. I have now tested the patches
below and they fix the virtio driver crash. Thanks for the suggestion.

Michael,
I tested "[PATCH] virtio_error: don't invoke status callbacks "
and it fixes the QEMU crash in the case of a broken guest index.
Thanks.
Thanks.

Best regards, Ilya Maximets.

P.S. Previously I mentioned that I could not reproduce the virtio driver
 crash with "[PATCH] virtio_error: don't invoke status callbacks"
 applied. I was wrong. I can reproduce it now; my system was misconfigured.
 Sorry.


On 14.12.2017 12:01, Maxime Coquelin wrote:
> Hi Ilya,
> 
> On 12/14/2017 08:06 AM, Ilya Maximets wrote:
>> On 13.12.2017 22:48, Michael S. Tsirkin wrote:
>>> On Wed, Dec 13, 2017 at 04:45:20PM +0300, Ilya Maximets wrote:
>>>>>> That
>>>>>> looks very strange. Some of the functions gets 'old_status', others
>>>>>> the 'new_status'. I'm a bit confused.
>>>>>
>>>>> OK, fair enough. Fixed - let's pass old status everywhere,
>>>>> users that need the new one can get it from the vdev.
>>>>>
>>>>>> And it's not functional in current state:
>>>>>>
>>>>>> hw/net/virtio-net.c:264:28: error: ‘status’ undeclared
>>>>>
>>>>> Fixed too. new version below.
>>>>
>>>> This doesn't fix the segmentation fault.
>>>
>>> Hmm you are right. Looking into it.
>>>
>>>> I have exactly same crash stacktrace:
>>>>
>>>> #0  vhost_memory_unmap hw/virtio/vhost.c:446
>>>> #1  vhost_virtqueue_stop hw/virtio/vhost.c:1155
>>>> #2  vhost_dev_stop hw/virtio/vhost.c:1594
>>>> #3  vhost_net_stop_one hw/net/vhost_net.c:289
>>>> #4  vhost_net_stop hw/net/vhost_net.c:368
>>>> #5  virtio_net_vhost_status (old_status=15 '\017', n=0x5625f3901100) at 
>>>> hw/net/virtio-net.c:180
>>>> #6  virtio_net_set_status (vdev=0x5625f3901100, old_status=<optimized out>) at hw/net/virtio-net.c:254
>>>> #7  virtio_set_status (vdev=vdev@entry=0x5625f3901100, val=<optimized out>) at hw/virtio/virtio.c:1152
>>>> #8  virtio_error (vdev=0x5625f3901100, fmt=fmt@entry=0x5625f014f688 "Guest 
>>>> says index %u is available") at hw/virtio/virtio.c:2460
>>>
>>> BTW what is causing this? Why is guest avail index corrupted?
>>
>> My testing environment for the issue:
>>
>> * QEMU 2.10.1
> 
> Could you try to backport below patch and try again killing OVS?
> 
> commit 2ae39a113af311cb56a0c35b7f212dafcef15303
> Author: Maxime Coquelin 
> Date:   Thu Nov 16 19:48:35 2017 +0100
> 
>     vhost: restore avail index from vring used index on disconnection
> 
>     vhost_virtqueue_stop() gets avail index value from the backend,
>     except if the backend is not responding.
> 
>     It happens when the backend crashes, and in this case, internal
>     state of the virtio queue is inconsistent, making packets
>     to corrupt the vring state.
> 
>     With a Linux guest, it results in following error message on
>     backend reconnection:
> 
>     [   22.444905] virtio_net virtio0: output.0:id 0 is not a head!
>     [   22.446746] net enp0s3: Unexpected TXQ (0) queue failure: -5
>     [   22.476360] net enp0s3: Unexpected TXQ (0) queue failure: -5
> 
>     Fixes: 283e2c2adcb8 ("net: virtio-net discards TX data after link down")
>     Cc: qemu-sta...@nongnu.org
>     Signed-off-by: Maxime Coquelin 
>     Reviewed-by: Michael S. Tsirkin 
>     Signed-off-by: Michael S. Tsirkin 
> 
> commit 2d4ba6cc741df15df6fbb4feaa706a02e103083a
> Author: Maxime Coquelin 
> Date:   Thu Nov 16 19:48:34 2017 +0100
> 
>     virtio: Add queue interface to restore avail index from vring used index
> 
>     In case of backend crash, it is not possible to restore internal
>     avail index from the backend value as vhost_get_vring_base
>     callback fails.
> 
>     This patch provides a new interface to restore internal avail index
>     from the vring used index, as done by some vhost-user backend on
>     reconnection.
> 
>     Signed-off-by: Maxime Coquelin 
>     Reviewed-by: Michael S. Tsirkin 
>     Signed-off-by: Michael S. Tsirkin 
> 
> 
> Cheers,
> Maxime
> 
> 
> 



Re: [Qemu-devel] [PATCH] vhost: fix crash on virtio_error while device stop

2017-12-14 Thread Ilya Maximets


On 14.12.2017 17:31, Ilya Maximets wrote:
> One update for the testing scenario:
> 
> No need to kill OVS. The issue reproducible with simple 'del-port'
> and 'add-port'. virtio driver in guest could crash on both operations.
> Most times it crashes in my case on 'add-port' after deletion.
> 
> Hi Maxime,
> I already saw below patches and original linux kernel virtio issue.
> Just had no enough time to test them.
> Now I tested below patches and they fixes virtio driver crash.
> Thanks for suggestion.
> 
> Michael,
> I tested "[PATCH] virtio_error: don't invoke status callbacks "
> and it fixes the QEMU crash in case of broken guest index.
> Thanks.
> 
> Best regards, Ilya Maximets.
> 
> P.S. Previously I mentioned that I can not reproduce virtio driver
>  crash with "[PATCH] virtio_error: don't invoke status callbacks"

It should be "[PATCH dontapply] virtio: rework set_status callbacks".
Sorry again.

>  applied. I was wrong. I can reproduce now. System was misconfigured.
>  Sorry.
> 
> 
> On 14.12.2017 12:01, Maxime Coquelin wrote:
>> Hi Ilya,
>>
>> On 12/14/2017 08:06 AM, Ilya Maximets wrote:
>>> On 13.12.2017 22:48, Michael S. Tsirkin wrote:
>>>> On Wed, Dec 13, 2017 at 04:45:20PM +0300, Ilya Maximets wrote:
>>>>>>> That
>>>>>>> looks very strange. Some of the functions gets 'old_status', others
>>>>>>> the 'new_status'. I'm a bit confused.
>>>>>>
>>>>>> OK, fair enough. Fixed - let's pass old status everywhere,
>>>>>> users that need the new one can get it from the vdev.
>>>>>>
>>>>>>> And it's not functional in current state:
>>>>>>>
>>>>>>> hw/net/virtio-net.c:264:28: error: ‘status’ undeclared
>>>>>>
>>>>>> Fixed too. new version below.
>>>>>
>>>>> This doesn't fix the segmentation fault.
>>>>
>>>> Hmm you are right. Looking into it.
>>>>
>>>>> I have exactly same crash stacktrace:
>>>>>
>>>>> #0  vhost_memory_unmap hw/virtio/vhost.c:446
>>>>> #1  vhost_virtqueue_stop hw/virtio/vhost.c:1155
>>>>> #2  vhost_dev_stop hw/virtio/vhost.c:1594
>>>>> #3  vhost_net_stop_one hw/net/vhost_net.c:289
>>>>> #4  vhost_net_stop hw/net/vhost_net.c:368
>>>>> #5  virtio_net_vhost_status (old_status=15 '\017', n=0x5625f3901100) at 
>>>>> hw/net/virtio-net.c:180
>>>>> #6  virtio_net_set_status (vdev=0x5625f3901100, old_status=<optimized out>) at hw/net/virtio-net.c:254
>>>>> #7  virtio_set_status (vdev=vdev@entry=0x5625f3901100, val=<optimized out>) at hw/virtio/virtio.c:1152
>>>>> #8  virtio_error (vdev=0x5625f3901100, fmt=fmt@entry=0x5625f014f688 
>>>>> "Guest says index %u is available") at hw/virtio/virtio.c:2460
>>>>
>>>> BTW what is causing this? Why is guest avail index corrupted?
>>>
>>> My testing environment for the issue:
>>>
>>> * QEMU 2.10.1
>>
>> Could you try to backport below patch and try again killing OVS?
>>
>> commit 2ae39a113af311cb56a0c35b7f212dafcef15303
>> Author: Maxime Coquelin 
>> Date:   Thu Nov 16 19:48:35 2017 +0100
>>
>>     vhost: restore avail index from vring used index on disconnection
>>
>>     vhost_virtqueue_stop() gets avail index value from the backend,
>>     except if the backend is not responding.
>>
>>     It happens when the backend crashes, and in this case, internal
>>     state of the virtio queue is inconsistent, making packets
>>     to corrupt the vring state.
>>
>>     With a Linux guest, it results in following error message on
>>     backend reconnection:
>>
>>     [   22.444905] virtio_net virtio0: output.0:id 0 is not a head!
>>     [   22.446746] net enp0s3: Unexpected TXQ (0) queue failure: -5
>>     [   22.476360] net enp0s3: Unexpected TXQ (0) queue failure: -5
>>
>>     Fixes: 283e2c2adcb8 ("net: virtio-net discards TX data after link down")
>>     Cc: qemu-sta...@nongnu.org
>>     Signed-off-by: Maxime Coquelin 
>>     Reviewed-by: Michael S. Tsirkin 
>>     Signed-off-by: Michael S. Tsirkin 
>>
>> commit 2d4ba6cc741df15df6fbb4feaa706a02e103083a
>> Author: Maxime Coquelin 
>> Date:   Thu Nov 16 19:48:34 2017 +0100
>>
>>     virtio: Add queue interface to restore avail index from vring used index
>>
>>     In case of backend crash, it is not possible to restore internal
>>     avail index from the backend value as vhost_get_vring_base
>>     callback fails.
>>
>>     This patch provides a new interface to restore internal avail index
>>     from the vring used index, as done by some vhost-user backend on
>>     reconnection.
>>
>>     Signed-off-by: Maxime Coquelin 
>>     Reviewed-by: Michael S. Tsirkin 
>>     Signed-off-by: Michael S. Tsirkin 
>>
>>
>> Cheers,
>> Maxime
>>
>>
>>
> 
> 



Re: [Qemu-devel] [PATCH v2 1/4] hostmem-memfd: disable for systems wihtout sealing support

2018-12-11 Thread Ilya Maximets
On 10.12.2018 19:18, Igor Mammedov wrote:
> On Tue, 27 Nov 2018 16:50:27 +0300
> Ilya Maximets  wrote:
> 
> s/wihtout/without/ in subj
> 
>> If seals are not supported, memfd_create() will fail.
>> Furthermore, there is no way to disable it in this case because
>> '.seal' property is not registered.
>>
>> This issue leads to vhost-user-test failures on RHEL 7.2:
>>
>>   qemu-system-x86_64: -object memory-backend-memfd,id=mem,size=2M,: \
>>   failed to create memfd: Invalid argument
>>
>> and actually breaks the feature on such systems.
>>
>> Let's restrict memfd backend to systems with sealing support.
>>
>> Signed-off-by: Ilya Maximets 
>> ---
>>  backends/hostmem-memfd.c | 18 --
>>  tests/vhost-user-test.c  |  6 +++---
>>  2 files changed, 11 insertions(+), 13 deletions(-)
>>
>> diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
>> index b6836b28e5..a3455da9c9 100644
>> --- a/backends/hostmem-memfd.c
>> +++ b/backends/hostmem-memfd.c
>> @@ -156,15 +156,13 @@ memfd_backend_class_init(ObjectClass *oc, void *data)
>>"Huge pages size (ex: 2M, 
>> 1G)",
>>&error_abort);
>>  }
>> -if (qemu_memfd_check(MFD_ALLOW_SEALING)) {
>> -object_class_property_add_bool(oc, "seal",
>> -   memfd_backend_get_seal,
>> -   memfd_backend_set_seal,
>> -   &error_abort);
>> -object_class_property_set_description(oc, "seal",
>> -  "Seal growing & shrinking",
>> -  &error_abort);
>> -}
>> +object_class_property_add_bool(oc, "seal",
>> +   memfd_backend_get_seal,
>> +   memfd_backend_set_seal,
>> +   &error_abort);
>> +object_class_property_set_description(oc, "seal",
>> +  "Seal growing & shrinking",
>> +  &error_abort);
>>  }
>>  
>>  static const TypeInfo memfd_backend_info = {
>> @@ -177,7 +175,7 @@ static const TypeInfo memfd_backend_info = {
>>  
>>  static void register_types(void)
>>  {
>> -if (qemu_memfd_check(0)) {
>> +if (qemu_memfd_check(MFD_ALLOW_SEALING)) {
>>  type_register_static(&memfd_backend_info);
> that would just lead to an unclear error that the type doesn't exist.
> It could be better to report a sensible error from memfd_backend_memory_alloc()
> if the feature is required but not supported by the host.

I'm not sure, but this could break libvirt's capability discovery.

The current patch changes the behaviour, most likely, only for RHEL/CentOS 7.2;
all other systems are not affected. Do you think we need to change the
behaviour on all systems?
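
For illustration, reporting the error from the allocation path, as Igor
suggests, might look roughly like this (a hypothetical sketch against the
2018-era memfd_backend_memory_alloc() signature; not part of the posted
series):

static void
memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
{
    HostMemoryBackendMemfd *m = MEMORY_BACKEND_MEMFD(backend);

    if (m->seal && !qemu_memfd_check(MFD_ALLOW_SEALING)) {
        /* Fail with a clear message instead of a cryptic EINVAL later. */
        error_setg(errp, "memfd sealing requested, but the host kernel "
                   "does not support MFD_ALLOW_SEALING");
        return;
    }

    /* ... existing memfd allocation path ... */
}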

> 
>>  }
>>  }
>> diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c
>> index 45d58d8ea2..e3e9a33580 100644
>> --- a/tests/vhost-user-test.c
>> +++ b/tests/vhost-user-test.c
>> @@ -169,7 +169,7 @@ static char *get_qemu_cmd(TestServer *s,
>>int mem, enum test_memfd memfd, const char 
>> *mem_path,
>>const char *chr_opts, const char *extra)
>>  {
>> -if (memfd == TEST_MEMFD_AUTO && qemu_memfd_check(0)) {
>> +if (memfd == TEST_MEMFD_AUTO && qemu_memfd_check(MFD_ALLOW_SEALING)) {
>>  memfd = TEST_MEMFD_YES;
>>  }
>>  
>> @@ -903,7 +903,7 @@ static void test_multiqueue(void)
>>  s->queues = 2;
>>  test_server_listen(s);
>>  
>> -if (qemu_memfd_check(0)) {
>> +if (qemu_memfd_check(MFD_ALLOW_SEALING)) {
>>  cmd = g_strdup_printf(
>>  QEMU_CMD_MEMFD QEMU_CMD_CHR QEMU_CMD_NETDEV ",queues=%d "
>>  "-device virtio-net-pci,netdev=net0,mq=on,vectors=%d",
>> @@ -963,7 +963,7 @@ int main(int argc, char **argv)
>>  /* run the main loop thread so the chardev may operate */
>>  thread = g_thread_new(NULL, thread_function, loop);
>>  
>> -if (qemu_memfd_check(0)) {
>> +if (qemu_memfd_check(MFD_ALLOW_SEALING)) {
>>  qtest_add_data_func("/vhost-user/read-guest-mem/memfd",
>>  GINT_TO_POINTER(TEST_MEMFD_YES),
>>  test_read_guest_mem);
> 
> 
> 



Re: [Qemu-devel] [PATCH v2 1/4] hostmem-memfd: disable for systems wihtout sealing support

2018-12-11 Thread Ilya Maximets
On 11.12.2018 13:53, Daniel P. Berrangé wrote:
> On Tue, Nov 27, 2018 at 04:50:27PM +0300, Ilya Maximets wrote:
>> If seals are not supported, memfd_create() will fail.
>> Furthermore, there is no way to disable it in this case because
>> '.seal' property is not registered.
> 
> Isn't the real problem here that memfd_backend_instance_init() has
> unconditionally set  "m->seal = true"
> 
> Surely, if we don't register the '.seal' property, we should default
> that flag to false.
> 
>>
>> This issue leads to vhost-user-test failures on RHEL 7.2:
>>
>>   qemu-system-x86_64: -object memory-backend-memfd,id=mem,size=2M,: \
>>   failed to create memfd: Invalid argument
>>
>> and actually breaks the feature on such systems.
>>
>> Let's restrict memfd backend to systems with sealing support.
> 
> I don't think we need to do that - sealing is optional in the QEMU code,
> we simply have it set to the wrong default when sealing is not available.

That is literally what I fixed in v1:
https://lists.nongnu.org/archive/html/qemu-devel/2018-11/msg05483.html

but two people suggested disabling memfd entirely in this case.
Do you think I should bring the patch from v1 back?

Gerd, Marc-André, what do you think?

> 
>> Signed-off-by: Ilya Maximets 
> 
> 
> Regards,
> Daniel
> 



[Qemu-devel] Are FreeBSD guest images working?

2018-11-15 Thread Ilya Maximets
> Hi, the list,
> 
> I am trying to boot a FreeBSD guest but failed.  It hangs at the
> kernel booting phase:
> 
> /boot/kernel/kernel text=0x14ed860 data=0x132538+0x4baa68 syms=[0x8+0x159ee8+0x8
> Booting...
> (nothing more)
> 
> It's just as simple as downloading the image and booting it, so I can't think
> of anything strange in my procedure so far:
> 
> https://wiki.qemu.org/Hosts/BSD#FreeBSD
> 
> I also tried the latest image here:
> 
> https://download.freebsd.org/ftp/releases/VM-IMAGES/11.2-RELEASE/amd64/Latest/FreeBSD-11.2-RELEASE-amd64.qcow2.xz
> 
> but it's having the same problem as 11.0.
> 
> I am downloading a fresh ISO, but before I continue I'm just curious
> whether anyone is using these images and whether there are quick answers
> to what I have encountered.
> 
> Thanks in advance,
> 
> -- 
> Peter Xu

Hi,
I have a VM with the FreeBSD-11.2-RELEASE-amd64.qcow2 image and it works
fine under QEMU 2.12.1, but it's controlled by libvirt + virt-manager,
so it has a few more command-line arguments than the wiki shows.

In general, those images have the serial console disabled by default.

Creating the following file in the VM's filesystem:
[root@freebsd ~]# cat /boot/loader.conf 
boot_multicons="YES"
boot_serial="YES"
comconsole_speed="115200"
console="comconsole,vidconsole"

allows me to successfully boot with:
# qemu-system-x86_64 -machine accel=kvm -m 2048  \
  -cpu host -enable-kvm -nographic -smp 2 \
  -drive if=virtio,file=./FreeBSD-11.2-RELEASE-amd64.qcow2,format=qcow2

Best regards, Ilya Maximets.



[Qemu-devel] [RFC 0/2] vhost+postcopy fixes

2018-10-08 Thread Ilya Maximets
Sending as RFC because it's not fully tested yet.

Ilya Maximets (2):
  migration: Stop postcopy fault thread before notifying
  vhost-user: Fix userfaultfd leak

 hw/virtio/vhost-user.c   |  7 +++
 migration/postcopy-ram.c | 11 ++-
 2 files changed, 13 insertions(+), 5 deletions(-)

-- 
2.17.1




[Qemu-devel] [RFC 2/2] vhost-user: Fix userfaultfd leak

2018-10-08 Thread Ilya Maximets
The 'fd' received from the vhost side is never freed.
Also, everything (including the 'postcopy_listen' state) should be
cleaned up on vhost cleanup.

Fixes: 46343570c06e ("vhost+postcopy: Wire up POSTCOPY_END notify")
Fixes: f82c11165ffa ("vhost+postcopy: Register shared ufd with postcopy")
Cc: qemu-sta...@nongnu.org
Signed-off-by: Ilya Maximets 
---
 hw/virtio/vhost-user.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index c442daa562..e09bed0e4a 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -1280,6 +1280,7 @@ static int vhost_user_postcopy_end(struct vhost_dev *dev, Error **errp)
 return ret;
 }
 postcopy_unregister_shared_ufd(&u->postcopy_fd);
+close(u->postcopy_fd.fd);
 u->postcopy_fd.handler = NULL;
 
 trace_vhost_user_postcopy_end_exit();
@@ -1419,6 +1420,12 @@ static int vhost_user_backend_cleanup(struct vhost_dev *dev)
 postcopy_remove_notifier(&u->postcopy_notifier);
 u->postcopy_notifier.notify = NULL;
 }
+u->postcopy_listen = false;
+if (u->postcopy_fd.handler) {
+postcopy_unregister_shared_ufd(&u->postcopy_fd);
+close(u->postcopy_fd.fd);
+u->postcopy_fd.handler = NULL;
+}
 if (u->slave_fd >= 0) {
 qemu_set_fd_handler(u->slave_fd, NULL, NULL, NULL);
 close(u->slave_fd);
-- 
2.17.1




[Qemu-devel] [RFC 1/2] migration: Stop postcopy fault thread before notifying

2018-10-08 Thread Ilya Maximets
POSTCOPY_NOTIFY_INBOUND_END handlers will remove userfault fds
from the postcopy_remote_fds array, which could still be in use
by the fault thread. Let's stop the thread before notifying to
avoid possibly accessing freed memory.

Fixes: 46343570c06e ("vhost+postcopy: Wire up POSTCOPY_END notify")
Cc: qemu-sta...@nongnu.org
Signed-off-by: Ilya Maximets 
---
 migration/postcopy-ram.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 853d8b32ca..e5c02a32c5 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -533,6 +533,12 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
 if (mis->have_fault_thread) {
 Error *local_err = NULL;
 
+/* Let the fault thread quit */
+atomic_set(&mis->fault_thread_quit, 1);
+postcopy_fault_thread_notify(mis);
+trace_postcopy_ram_incoming_cleanup_join();
+qemu_thread_join(&mis->fault_thread);
+
 if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
 error_report_err(local_err);
 return -1;
@@ -541,11 +547,6 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
 if (qemu_ram_foreach_migratable_block(cleanup_range, mis)) {
 return -1;
 }
-/* Let the fault thread quit */
-atomic_set(&mis->fault_thread_quit, 1);
-postcopy_fault_thread_notify(mis);
-trace_postcopy_ram_incoming_cleanup_join();
-qemu_thread_join(&mis->fault_thread);
 
 trace_postcopy_ram_incoming_cleanup_closeuf();
 close(mis->userfault_fd);
-- 
2.17.1




[Qemu-devel] [PATCH] vhost-user: Don't ask for reply on postcopy mem table set

2018-10-02 Thread Ilya Maximets
According to the documentation, NEED_REPLY_MASK should not be set
for the VHOST_USER_SET_MEM_TABLE request in postcopy mode.
This restriction was mistakenly applied to the 'reply_supported'
variable, which is local and used only in the non-postcopy case.

CC: Dr. David Alan Gilbert 
Fixes: 9bb38019942c ("vhost+postcopy: Send address back to qemu")
Signed-off-by: Ilya Maximets 
---
 hw/virtio/vhost-user.c | 13 +
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index b041343632..c442daa562 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -374,8 +374,6 @@ static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev,
 int fds[VHOST_MEMORY_MAX_NREGIONS];
 int i, fd;
 size_t fd_num = 0;
-bool reply_supported = virtio_has_feature(dev->protocol_features,
-  VHOST_USER_PROTOCOL_F_REPLY_ACK);
 VhostUserMsg msg_reply;
 int region_i, msg_i;
 
@@ -384,10 +382,6 @@ static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev,
 .hdr.flags = VHOST_USER_VERSION,
 };
 
-if (reply_supported) {
-msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
-}
-
 if (u->region_rb_len < dev->mem->nregions) {
 u->region_rb = g_renew(RAMBlock*, u->region_rb, dev->mem->nregions);
 u->region_rb_offset = g_renew(ram_addr_t, u->region_rb_offset,
@@ -503,10 +497,6 @@ static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev,
 return -1;
 }
 
-if (reply_supported) {
-return process_message_reply(dev, &msg);
-}
-
 return 0;
 }
 
@@ -519,8 +509,7 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev,
 size_t fd_num = 0;
 bool do_postcopy = u->postcopy_listen && u->postcopy_fd.handler;
 bool reply_supported = virtio_has_feature(dev->protocol_features,
-  VHOST_USER_PROTOCOL_F_REPLY_ACK) &&
-  !do_postcopy;
+  VHOST_USER_PROTOCOL_F_REPLY_ACK);
 
 if (do_postcopy) {
 /* Postcopy has enough differences that it's best done in it's own
-- 
2.17.1




[Qemu-devel] Have multiple virtio-net devices, but only one of them receives all traffic

2018-10-02 Thread Ilya Maximets
> Hi,
> 
> I'm using QEMU 3.0.0 and Linux kernel 4.15.0 on x86 machines. I'm
> observing pretty weird behavior when I have multiple virtio-net
> devices. My KVM VM has two virtio-net devices (vhost=off) and I'm
> using a Linux bridge in the host. The two devices have different
> MAC/IP addresses.
> 
> When I tried to access the VM using two different IP addresses (e.g
> ping or ssh), I found that only one device in VM gets all incoming
> network traffic while I expected that two devices get traffic for
> their own IP addresses.
> 
> I checked it in the several ways.
> 
> 1) I did ping with two IP addresses from the host/other physical
> machines in the same subnet, and only one device's interrupt count is
> increased.
> 2) I checked the ARP table from the ping sources, and two different IP
> addresses have the same MAC address. In fact, I dumped ARP messages
> using tcpdump, and the VM (or the bridge?) replied with the same MAC
> address for two different IP addresses as attached below.
> 3) I monitored the host bridge (# bridge monitor) and found that only
> one device's MAC address is registered.
> 
> It looks like one device's IP/MAC address is not advertised properly,
> but I'm not really sure. When I turned off the device getting all the
> traffic, then the other device starts getting incoming packets; the
> device's MAC address is registered in the host bridge. The active
> device only gets traffic for its own IP address, of course.
> 
> Here's the tcpdump result. IP 10.10.1.100 and 10.10.1.221 are VM's IP
> addresses. IP 10.10.1.221 is assigned to a device having
> 52:54:00:12:34:58, but the log shows it is advertised as having
> ...:57.

I assume that both devices are in the same subnet. In this case the kernel
inside the guest is confused about which interface packets for this subnet
should go out of. Look at the routing table (route -n) and you'll see that
only one interface is used for sending packets to that subnet. Without extra
routing configuration you can't usefully have two interfaces in the same
subnet in Linux or any other OS. There is nothing QEMU-specific here.

> 
> 23:24:10.983700 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has
> 10.10.1.100 tell kvm-dest-link-1, length 46
> 23:24:10.983771 ARP, Ethernet (len 6), IPv4 (len 4), Reply 10.10.1.100
> is-at 52:54:00:12:34:57 (oui Unknown), length 28
> 23:24:17.794811 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has
> 10.10.1.211 tell kvm-dest-link-1, length 46
> 23:24:17.794869 ARP, Ethernet (len 6), IPv4 (len 4), Reply 10.10.1.211
> is-at 52:54:00:12:34:57 (oui Unknown), length 28
> 
> I would appreciate any help!
> 
> Thanks,
> Jintack



[Qemu-devel] [PATCH 1/4] hostmem-memfd: enable seals only if supported

2018-11-27 Thread Ilya Maximets
If seals are not supported, memfd_create() will fail.
Furthermore, there is no way to disable sealing in this case because
the '.seal' property is not registered.

This issue leads to vhost-user-test failures on RHEL 7.2:

  qemu-system-x86_64: -object memory-backend-memfd,id=mem,size=2M,: \
  failed to create memfd: Invalid argument

Signed-off-by: Ilya Maximets 
---
 backends/hostmem-memfd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
index b6836b28e5..ee39bdbde6 100644
--- a/backends/hostmem-memfd.c
+++ b/backends/hostmem-memfd.c
@@ -129,8 +129,8 @@ memfd_backend_instance_init(Object *obj)
 {
 HostMemoryBackendMemfd *m = MEMORY_BACKEND_MEMFD(obj);
 
-/* default to sealed file */
-m->seal = true;
+/* default to sealed file if supported */
+m->seal = qemu_memfd_check(MFD_ALLOW_SEALING);
 }
 
 static void
-- 
2.17.1




[Qemu-devel] [PATCH 0/4] memfd fixes.

2018-11-27 Thread Ilya Maximets
Ilya Maximets (4):
  hostmem-memfd: enable seals only if supported
  memfd: always check for MFD_CLOEXEC
  memfd: set up correct errno if not supported
  memfd: improve error messages

 backends/hostmem-memfd.c |  4 ++--
 util/memfd.c | 10 --
 2 files changed, 10 insertions(+), 4 deletions(-)

-- 
2.17.1




[Qemu-devel] [PATCH 3/4] memfd: set up correct errno if not supported

2018-11-27 Thread Ilya Maximets
qemu_memfd_create() prints the value of 'errno', which is not
set in this case.

Signed-off-by: Ilya Maximets 
---
 util/memfd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/util/memfd.c b/util/memfd.c
index d74ce4d793..393d23da96 100644
--- a/util/memfd.c
+++ b/util/memfd.c
@@ -40,6 +40,7 @@ static int memfd_create(const char *name, unsigned int flags)
 #ifdef __NR_memfd_create
 return syscall(__NR_memfd_create, name, flags);
 #else
+errno = ENOSYS;
 return -1;
 #endif
 }
-- 
2.17.1




[Qemu-devel] [PATCH 2/4] memfd: always check for MFD_CLOEXEC

2018-11-27 Thread Ilya Maximets
QEMU sets this flag unconditionally, so we need to
check whether it is supported by the host.

Signed-off-by: Ilya Maximets 
---
 util/memfd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util/memfd.c b/util/memfd.c
index 8debd0d037..d74ce4d793 100644
--- a/util/memfd.c
+++ b/util/memfd.c
@@ -188,7 +188,7 @@ bool qemu_memfd_alloc_check(void)
 bool qemu_memfd_check(unsigned int flags)
 {
 #ifdef CONFIG_LINUX
-int mfd = memfd_create("test", flags);
+int mfd = memfd_create("test", flags | MFD_CLOEXEC);
 
 if (mfd >= 0) {
 close(mfd);
-- 
2.17.1




[Qemu-devel] [PATCH 4/4] memfd: improve error messages

2018-11-27 Thread Ilya Maximets
This gives more information about the failure.
Additionally, 'ENOSYS' is returned on non-Linux platforms instead of
'errno', which is not initialized in this case.

Signed-off-by: Ilya Maximets 
---
 util/memfd.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/util/memfd.c b/util/memfd.c
index 393d23da96..00334e5b21 100644
--- a/util/memfd.c
+++ b/util/memfd.c
@@ -71,14 +71,18 @@ int qemu_memfd_create(const char *name, size_t size, bool hugetlb,
 }
 mfd = memfd_create(name, flags);
 if (mfd < 0) {
+error_setg_errno(errp, errno,
+ "failed to create memfd with flags 0x%x", flags);
 goto err;
 }
 
 if (ftruncate(mfd, size) == -1) {
+error_setg_errno(errp, errno, "failed to resize memfd to %zu", size);
 goto err;
 }
 
 if (seals && fcntl(mfd, F_ADD_SEALS, seals) == -1) {
+error_setg_errno(errp, errno, "failed to add seals 0x%x", seals);
 goto err;
 }
 
@@ -88,8 +92,9 @@ err:
 if (mfd >= 0) {
 close(mfd);
 }
+#else
+error_setg_errno(errp, ENOSYS, "failed to create memfd");
 #endif
-error_setg_errno(errp, errno, "failed to create memfd");
 return -1;
 }
 
-- 
2.17.1




Re: [Qemu-devel] [PATCH 1/4] hostmem-memfd: enable seals only if supported

2018-11-27 Thread Ilya Maximets
On 27.11.2018 14:49, Marc-André Lureau wrote:
> Hi
> On Tue, Nov 27, 2018 at 3:11 PM Ilya Maximets  wrote:
>>
>> If seals are not supported, memfd_create() will fail.
>> Furthermore, there is no way to disable it in this case because
>> '.seal' property is not registered.
>>
>> This issue leads to vhost-user-test failures on RHEL 7.2:
>>
>>   qemu-system-x86_64: -object memory-backend-memfd,id=mem,size=2M,: \
>>       failed to create memfd: Invalid argument
>>
>> Signed-off-by: Ilya Maximets 
> 
> 
> This will change the default behaviour of memfd backend, and may thus
> me considered a break.

This will change the default behaviour only on systems without sealing
support, but the current implementation is broken there anyway and does
not work.

> 
> Instead, memfd vhost-user-test should skipped (or tuned with
> sealed=false if unsupported)

vhost-user-test is just an example here. In fact, memfd cannot be
used at all on a system without sealing support, and there is no way
to disable seals.
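
To make the failure mode concrete, here is a minimal standalone probe
(my own sketch, mirroring the syscall fallback in util/memfd.c; not QEMU
code) showing what such a kernel does with MFD_ALLOW_SEALING:

/* probe-memfd-sealing.c: check whether the host kernel accepts
 * MFD_ALLOW_SEALING.  On RHEL 7.2's backported 3.10 memfd this is
 * expected to fail with EINVAL -- the "Invalid argument" above. */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef MFD_CLOEXEC
#define MFD_CLOEXEC       0x0001U
#endif
#ifndef MFD_ALLOW_SEALING
#define MFD_ALLOW_SEALING 0x0002U
#endif

int main(void)
{
    int fd = syscall(__NR_memfd_create, "probe",
                     MFD_CLOEXEC | MFD_ALLOW_SEALING);

    if (fd < 0) {
        perror("memfd_create with MFD_ALLOW_SEALING");
        return 1;
    }
    printf("sealing supported\n");
    close(fd);
    return 0;
}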

> 
>> ---
>>  backends/hostmem-memfd.c | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
>> index b6836b28e5..ee39bdbde6 100644
>> --- a/backends/hostmem-memfd.c
>> +++ b/backends/hostmem-memfd.c
>> @@ -129,8 +129,8 @@ memfd_backend_instance_init(Object *obj)
>>  {
>>  HostMemoryBackendMemfd *m = MEMORY_BACKEND_MEMFD(obj);
>>
>> -/* default to sealed file */
>> -m->seal = true;
>> +/* default to sealed file if supported */
>> +m->seal = qemu_memfd_check(MFD_ALLOW_SEALING);
>>  }
>>
>>  static void
>> --
>> 2.17.1
>>
> 
> 



Re: [Qemu-devel] [PATCH 1/4] hostmem-memfd: enable seals only if supported

2018-11-27 Thread Ilya Maximets
On 27.11.2018 15:00, Marc-André Lureau wrote:
> Hi
> On Tue, Nov 27, 2018 at 3:56 PM Ilya Maximets  wrote:
>>
>> On 27.11.2018 14:49, Marc-André Lureau wrote:
>>> Hi
>>> On Tue, Nov 27, 2018 at 3:11 PM Ilya Maximets  
>>> wrote:
>>>>
>>>> If seals are not supported, memfd_create() will fail.
>>>> Furthermore, there is no way to disable it in this case because
>>>> '.seal' property is not registered.
>>>>
>>>> This issue leads to vhost-user-test failures on RHEL 7.2:
>>>>
>>>>   qemu-system-x86_64: -object memory-backend-memfd,id=mem,size=2M,: \
>>>>   failed to create memfd: Invalid argument
>>>>
>>>> Signed-off-by: Ilya Maximets 
>>>
>>>
>>> This will change the default behaviour of memfd backend, and may thus
>>> me considered a break.
>>
>> This will change the default behaviour only on systems without sealing
>> support. But current implementation is broken there anyway and does not
>> work.
>>
>>>
>>> Instead, memfd vhost-user-test should skipped (or tuned with
>>> sealed=false if unsupported)
>>
>> vhost-user-test is just an example here. In fact memfd could not be
>> used at all on system without sealing support. And there is no way
>> to disable seals.
> 
> which system supports memfd without sealing?

RHEL 7.2. kernel version 3.10.0-327.el7.x86_64

> 
>>
>>>
>>>> ---
>>>>  backends/hostmem-memfd.c | 4 ++--
>>>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
>>>> index b6836b28e5..ee39bdbde6 100644
>>>> --- a/backends/hostmem-memfd.c
>>>> +++ b/backends/hostmem-memfd.c
>>>> @@ -129,8 +129,8 @@ memfd_backend_instance_init(Object *obj)
>>>>  {
>>>>  HostMemoryBackendMemfd *m = MEMORY_BACKEND_MEMFD(obj);
>>>>
>>>> -/* default to sealed file */
>>>> -m->seal = true;
>>>> +/* default to sealed file if supported */
>>>> +m->seal = qemu_memfd_check(MFD_ALLOW_SEALING);
>>>>  }
>>>>
>>>>  static void
>>>> --
>>>> 2.17.1
>>>>
>>>
>>>
>>
> 
> 



Re: [Qemu-devel] [PATCH 1/4] hostmem-memfd: enable seals only if supported

2018-11-27 Thread Ilya Maximets
On 27.11.2018 15:29, Marc-André Lureau wrote:
> Hi
> 
> On Tue, Nov 27, 2018 at 4:02 PM Ilya Maximets  wrote:
>>
>> On 27.11.2018 15:00, Marc-André Lureau wrote:
>>> Hi
>>> On Tue, Nov 27, 2018 at 3:56 PM Ilya Maximets  
>>> wrote:
>>>>
>>>> On 27.11.2018 14:49, Marc-André Lureau wrote:
>>>>> Hi
>>>>> On Tue, Nov 27, 2018 at 3:11 PM Ilya Maximets  
>>>>> wrote:
>>>>>>
>>>>>> If seals are not supported, memfd_create() will fail.
>>>>>> Furthermore, there is no way to disable it in this case because
>>>>>> '.seal' property is not registered.
>>>>>>
>>>>>> This issue leads to vhost-user-test failures on RHEL 7.2:
>>>>>>
>>>>>>   qemu-system-x86_64: -object memory-backend-memfd,id=mem,size=2M,: \
>>>>>>   failed to create memfd: Invalid argument
>>>>>>
>>>>>> Signed-off-by: Ilya Maximets 
>>>>>
>>>>>
>>>>> This will change the default behaviour of memfd backend, and may thus
>>>>> me considered a break.
>>>>
>>>> This will change the default behaviour only on systems without sealing
>>>> support. But current implementation is broken there anyway and does not
>>>> work.
>>>>
>>>>>
>>>>> Instead, memfd vhost-user-test should skipped (or tuned with
>>>>> sealed=false if unsupported)
>>>>
>>>> vhost-user-test is just an example here. In fact memfd could not be
>>>> used at all on system without sealing support. And there is no way
>>>> to disable seals.
>>>
>>> which system supports memfd without sealing?
>>
>> RHEL 7.2. kernel version 3.10.0-327.el7.x86_64
> 
> Correct, it was backported without sealing for some reason.
> 
> I would rather have an explicit seal=off argument on such system
> (because sealing is expected to be available with memfd in general)
> 

But registering the '.seal' property is guarded by
'qemu_memfd_check(MFD_ALLOW_SEALING)', so you cannot disable it:

qemu-system-x86_64: -object memory-backend-memfd,seal=off,id=mem,size=2M,: 
Property '.seal' not found

Enabling this option on a system that does not support sealing will
probably break libvirt feature discovery or something similar.

What about adding a warning to 'memfd_backend_instance_init' before
disabling sealing there, if it is not supported?
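
Something along these lines, maybe (an untested sketch on top of the v1
change, assuming warn_report() is fine to use here):

static void
memfd_backend_instance_init(Object *obj)
{
    HostMemoryBackendMemfd *m = MEMORY_BACKEND_MEMFD(obj);

    /* Default to a sealed file only where the host can actually seal,
     * and make the silent downgrade visible in the logs. */
    m->seal = qemu_memfd_check(MFD_ALLOW_SEALING);
    if (!m->seal) {
        warn_report("memfd sealing not supported by the host kernel, "
                    "'seal' defaults to off");
    }
}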

>>
>>>
>>>>
>>>>>
>>>>>> ---
>>>>>>  backends/hostmem-memfd.c | 4 ++--
>>>>>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
>>>>>> index b6836b28e5..ee39bdbde6 100644
>>>>>> --- a/backends/hostmem-memfd.c
>>>>>> +++ b/backends/hostmem-memfd.c
>>>>>> @@ -129,8 +129,8 @@ memfd_backend_instance_init(Object *obj)
>>>>>>  {
>>>>>>  HostMemoryBackendMemfd *m = MEMORY_BACKEND_MEMFD(obj);
>>>>>>
>>>>>> -/* default to sealed file */
>>>>>> -m->seal = true;
>>>>>> +/* default to sealed file if supported */
>>>>>> +m->seal = qemu_memfd_check(MFD_ALLOW_SEALING);
>>>>>>  }
>>>>>>
>>>>>>  static void
>>>>>> --
>>>>>> 2.17.1
>>>>>>
>>>>>
>>>>>
>>>>
>>>
>>>
> 
> 
> 



Re: [Qemu-devel] [PATCH 1/4] hostmem-memfd: enable seals only if supported

2018-11-27 Thread Ilya Maximets
On 27.11.2018 15:56, Marc-André Lureau wrote:
> Hi
> 
> On Tue, Nov 27, 2018 at 4:37 PM Ilya Maximets  wrote:
>>
>> On 27.11.2018 15:29, Marc-André Lureau wrote:
>>> Hi
>>>
>>> On Tue, Nov 27, 2018 at 4:02 PM Ilya Maximets  
>>> wrote:
>>>>
>>>> On 27.11.2018 15:00, Marc-André Lureau wrote:
>>>>> Hi
>>>>> On Tue, Nov 27, 2018 at 3:56 PM Ilya Maximets  
>>>>> wrote:
>>>>>>
>>>>>> On 27.11.2018 14:49, Marc-André Lureau wrote:
>>>>>>> Hi
>>>>>>> On Tue, Nov 27, 2018 at 3:11 PM Ilya Maximets  
>>>>>>> wrote:
>>>>>>>>
>>>>>>>> If seals are not supported, memfd_create() will fail.
>>>>>>>> Furthermore, there is no way to disable it in this case because
>>>>>>>> '.seal' property is not registered.
>>>>>>>>
>>>>>>>> This issue leads to vhost-user-test failures on RHEL 7.2:
>>>>>>>>
>>>>>>>>   qemu-system-x86_64: -object memory-backend-memfd,id=mem,size=2M,: \
>>>>>>>>   failed to create memfd: Invalid argument
>>>>>>>>
>>>>>>>> Signed-off-by: Ilya Maximets 
>>>>>>>
>>>>>>>
>>>>>>> This will change the default behaviour of memfd backend, and may thus
>>>>>>> me considered a break.
>>>>>>
>>>>>> This will change the default behaviour only on systems without sealing
>>>>>> support. But current implementation is broken there anyway and does not
>>>>>> work.
>>>>>>
>>>>>>>
>>>>>>> Instead, memfd vhost-user-test should skipped (or tuned with
>>>>>>> sealed=false if unsupported)
>>>>>>
>>>>>> vhost-user-test is just an example here. In fact memfd could not be
>>>>>> used at all on system without sealing support. And there is no way
>>>>>> to disable seals.
>>>>>
>>>>> which system supports memfd without sealing?
>>>>
>>>> RHEL 7.2. kernel version 3.10.0-327.el7.x86_64
>>>
>>> Correct, it was backported without sealing for some reason.
>>>
>>> I would rather have an explicit seal=off argument on such system
>>> (because sealing is expected to be available with memfd in general)
>>>
>>
>> But '.seal' property registering is guarded by 
>> 'qemu_memfd_check(MFD_ALLOW_SEALING)'.
>> And you can not disable it:
>>
>> qemu-system-x86_64: -object memory-backend-memfd,seal=off,id=mem,size=2M,: 
>> Property '.seal' not found
> 
> Right
> 
>>
>> Enabling this option on system that does not support sealing will
>> probably break some libvirt feature discovering or something similar.
>>
>> What about adding some warning to 'memfd_backend_instance_init' if
>> sealing not supported before disabling it ?
> 
> What do you think of Gerd suggestion, and disable memfd backend if
> sealing is not available? (the sealing property check can be removed
> then).

In general that's OK with me.
What about something like this:

---
diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
index ee39bdbde6..a3455da9c9 100644
--- a/backends/hostmem-memfd.c
+++ b/backends/hostmem-memfd.c
@@ -156,15 +156,13 @@ memfd_backend_class_init(ObjectClass *oc, void *data)
   "Huge pages size (ex: 2M, 1G)",
   &error_abort);
 }
-if (qemu_memfd_check(MFD_ALLOW_SEALING)) {
-object_class_property_add_bool(oc, "seal",
-   memfd_backend_get_seal,
-   memfd_backend_set_seal,
-   &error_abort);
-object_class_property_set_description(oc, "seal",
-  "Seal growing & shrinking",
-  &error_abort);
-}
+object_class_property_add_bool(oc, "seal",
+   memfd_backend_get_seal,
+   memfd_backend_set_seal,
+   &error_abort);
+object_class_property_set_description(oc, "seal",
+  "S

[Qemu-devel] [PATCH v2 2/4] memfd: always check for MFD_CLOEXEC

2018-11-27 Thread Ilya Maximets
QEMU sets this flag unconditionally, so we need to
check whether it is supported by the host.

Signed-off-by: Ilya Maximets 
Reviewed-by: Marc-André Lureau 
---
 util/memfd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util/memfd.c b/util/memfd.c
index 8debd0d037..d74ce4d793 100644
--- a/util/memfd.c
+++ b/util/memfd.c
@@ -188,7 +188,7 @@ bool qemu_memfd_alloc_check(void)
 bool qemu_memfd_check(unsigned int flags)
 {
 #ifdef CONFIG_LINUX
-int mfd = memfd_create("test", flags);
+int mfd = memfd_create("test", flags | MFD_CLOEXEC);
 
 if (mfd >= 0) {
 close(mfd);
-- 
2.17.1




[Qemu-devel] [PATCH v2 0/4] memfd fixes.

2018-11-27 Thread Ilya Maximets
Version 2:
* First patch changed to just drop the memfd backend
  if seals are not supported.

Ilya Maximets (4):
  hostmem-memfd: disable for systems wihtout sealing support
  memfd: always check for MFD_CLOEXEC
  memfd: set up correct errno if not supported
  memfd: improve error messages

 backends/hostmem-memfd.c | 18 --
 tests/vhost-user-test.c  |  6 +++---
 util/memfd.c | 10 --
 3 files changed, 19 insertions(+), 15 deletions(-)

-- 
2.17.1




[Qemu-devel] [PATCH v2 1/4] hostmem-memfd: disable for systems wihtout sealing support

2018-11-27 Thread Ilya Maximets
If seals are not supported, memfd_create() will fail.
Furthermore, there is no way to disable sealing in this case because
the '.seal' property is not registered.

This issue leads to vhost-user-test failures on RHEL 7.2:

  qemu-system-x86_64: -object memory-backend-memfd,id=mem,size=2M,: \
  failed to create memfd: Invalid argument

and actually breaks the feature on such systems.

Let's restrict memfd backend to systems with sealing support.

Signed-off-by: Ilya Maximets 
---
 backends/hostmem-memfd.c | 18 --
 tests/vhost-user-test.c  |  6 +++---
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
index b6836b28e5..a3455da9c9 100644
--- a/backends/hostmem-memfd.c
+++ b/backends/hostmem-memfd.c
@@ -156,15 +156,13 @@ memfd_backend_class_init(ObjectClass *oc, void *data)
   "Huge pages size (ex: 2M, 1G)",
   &error_abort);
 }
-if (qemu_memfd_check(MFD_ALLOW_SEALING)) {
-object_class_property_add_bool(oc, "seal",
-   memfd_backend_get_seal,
-   memfd_backend_set_seal,
-   &error_abort);
-object_class_property_set_description(oc, "seal",
-  "Seal growing & shrinking",
-  &error_abort);
-}
+object_class_property_add_bool(oc, "seal",
+   memfd_backend_get_seal,
+   memfd_backend_set_seal,
+   &error_abort);
+object_class_property_set_description(oc, "seal",
+  "Seal growing & shrinking",
+  &error_abort);
 }
 
 static const TypeInfo memfd_backend_info = {
@@ -177,7 +175,7 @@ static const TypeInfo memfd_backend_info = {
 
 static void register_types(void)
 {
-if (qemu_memfd_check(0)) {
+if (qemu_memfd_check(MFD_ALLOW_SEALING)) {
 type_register_static(&memfd_backend_info);
 }
 }
diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c
index 45d58d8ea2..e3e9a33580 100644
--- a/tests/vhost-user-test.c
+++ b/tests/vhost-user-test.c
@@ -169,7 +169,7 @@ static char *get_qemu_cmd(TestServer *s,
   int mem, enum test_memfd memfd, const char *mem_path,
   const char *chr_opts, const char *extra)
 {
-if (memfd == TEST_MEMFD_AUTO && qemu_memfd_check(0)) {
+if (memfd == TEST_MEMFD_AUTO && qemu_memfd_check(MFD_ALLOW_SEALING)) {
 memfd = TEST_MEMFD_YES;
 }
 
@@ -903,7 +903,7 @@ static void test_multiqueue(void)
 s->queues = 2;
 test_server_listen(s);
 
-if (qemu_memfd_check(0)) {
+if (qemu_memfd_check(MFD_ALLOW_SEALING)) {
 cmd = g_strdup_printf(
 QEMU_CMD_MEMFD QEMU_CMD_CHR QEMU_CMD_NETDEV ",queues=%d "
 "-device virtio-net-pci,netdev=net0,mq=on,vectors=%d",
@@ -963,7 +963,7 @@ int main(int argc, char **argv)
 /* run the main loop thread so the chardev may operate */
 thread = g_thread_new(NULL, thread_function, loop);
 
-if (qemu_memfd_check(0)) {
+if (qemu_memfd_check(MFD_ALLOW_SEALING)) {
 qtest_add_data_func("/vhost-user/read-guest-mem/memfd",
 GINT_TO_POINTER(TEST_MEMFD_YES),
 test_read_guest_mem);
-- 
2.17.1




[Qemu-devel] [PATCH v2 3/4] memfd: set up correct errno if not supported

2018-11-27 Thread Ilya Maximets
qemu_memfd_create() prints the value of 'errno', which is not
set in this case.

Signed-off-by: Ilya Maximets 
Reviewed-by: Marc-André Lureau 
---
 util/memfd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/util/memfd.c b/util/memfd.c
index d74ce4d793..393d23da96 100644
--- a/util/memfd.c
+++ b/util/memfd.c
@@ -40,6 +40,7 @@ static int memfd_create(const char *name, unsigned int flags)
 #ifdef __NR_memfd_create
 return syscall(__NR_memfd_create, name, flags);
 #else
+errno = ENOSYS;
 return -1;
 #endif
 }
-- 
2.17.1




[Qemu-devel] [PATCH v2 4/4] memfd: improve error messages

2018-11-27 Thread Ilya Maximets
This gives more information about the failure.
Additionally, 'ENOSYS' is returned on non-Linux platforms instead of
'errno', which is not initialized in this case.

Signed-off-by: Ilya Maximets 
Reviewed-by: Marc-André Lureau 
---
 util/memfd.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/util/memfd.c b/util/memfd.c
index 393d23da96..00334e5b21 100644
--- a/util/memfd.c
+++ b/util/memfd.c
@@ -71,14 +71,18 @@ int qemu_memfd_create(const char *name, size_t size, bool hugetlb,
 }
 mfd = memfd_create(name, flags);
 if (mfd < 0) {
+error_setg_errno(errp, errno,
+ "failed to create memfd with flags 0x%x", flags);
 goto err;
 }
 
 if (ftruncate(mfd, size) == -1) {
+error_setg_errno(errp, errno, "failed to resize memfd to %zu", size);
 goto err;
 }
 
 if (seals && fcntl(mfd, F_ADD_SEALS, seals) == -1) {
+error_setg_errno(errp, errno, "failed to add seals 0x%x", seals);
 goto err;
 }
 
@@ -88,8 +92,9 @@ err:
 if (mfd >= 0) {
 close(mfd);
 }
+#else
+error_setg_errno(errp, ENOSYS, "failed to create memfd");
 #endif
-error_setg_errno(errp, errno, "failed to create memfd");
 return -1;
 }
 
-- 
2.17.1



