Re: [RFC PATCH v7 09/25] vhost: move descriptor translation to vhost_svq_vring_write_descs

2022-04-13 Thread Jason Wang



On 2022/4/14 00:31, Eugenio Pérez wrote:

It's done for both in and out descriptors, so it's better placed here.

Signed-off-by: Eugenio Pérez 
---
  hw/virtio/vhost-shadow-virtqueue.c | 26 +++---
  1 file changed, 15 insertions(+), 11 deletions(-)



Acked-by: Jason Wang 




diff --git a/hw/virtio/vhost-shadow-virtqueue.c 
b/hw/virtio/vhost-shadow-virtqueue.c
index a2531d5874..f874374651 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -122,17 +122,23 @@ static bool vhost_svq_translate_addr(const 
VhostShadowVirtqueue *svq,
  return true;
  }
  
-static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,

-const struct iovec *iovec, size_t num,
-bool more_descs, bool write)
+static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
+const struct iovec *iovec, size_t num,
+bool more_descs, bool write)
  {
  uint16_t i = svq->free_head, last = svq->free_head;
  unsigned n;
  uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
  vring_desc_t *descs = svq->vring.desc;
+bool ok;
  
  if (num == 0) {

-return;
+return true;
+}
+
+ok = vhost_svq_translate_addr(svq, sg, iovec, num);
+if (unlikely(!ok)) {
+return false;
  }
  
  for (n = 0; n < num; n++) {

@@ -150,6 +156,7 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue 
*svq, hwaddr *sg,
  }
  
  svq->free_head = le16_to_cpu(svq->desc_next[last]);

+return true;
  }
  
  static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,

@@ -169,21 +176,18 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
  return false;
  }
  
-ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);

+ok = vhost_svq_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
+ elem->in_num > 0, false);
  if (unlikely(!ok)) {
  return false;
  }
-vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
-elem->in_num > 0, false);
  
-

-ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
+ok = vhost_svq_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, 
false,
+ true);
  if (unlikely(!ok)) {
  return false;
  }
  
-vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);

-
  /*
   * Put the entry in the available array (but don't update avail->idx until
   * they do sync).





Re: [RFC PATCH v7 08/25] vdpa: Add x-svq to NetdevVhostVDPAOptions

2022-04-13 Thread Jason Wang



On 2022/4/14 00:31, Eugenio Pérez wrote:

Finally offering the possibility to enable SVQ from the command line.
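For illustration, enabling SVQ from the command line could then look like
this (the vhost-vdpa character device path and the netdev/device IDs are
made-up examples, not part of the patch):

    qemu-system-x86_64 ... \
        -netdev vhost-vdpa,id=vdpa0,vhostdev=/dev/vhost-vdpa-0,x-svq=on \
        -device virtio-net-pci,netdev=vdpa0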

Signed-off-by: Eugenio Pérez 
---
  qapi/net.json|  9 -
  net/vhost-vdpa.c | 48 
  2 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/qapi/net.json b/qapi/net.json
index b92f3f5fb4..92848e4362 100644
--- a/qapi/net.json
+++ b/qapi/net.json
@@ -445,12 +445,19 @@
  # @queues: number of queues to be created for multiqueue vhost-vdpa
  #  (default: 1)
  #
+# @x-svq: Start device with (experimental) shadow virtqueue. (Since 7.1)
+# (default: false)
+#
+# Features:
+# @unstable: Member @x-svq is experimental.
+#
  # Since: 5.1
  ##
  { 'struct': 'NetdevVhostVDPAOptions',
'data': {
  '*vhostdev': 'str',
-'*queues':   'int' } }
+'*queues':   'int',
+'*x-svq':{'type': 'bool', 'features' : [ 'unstable'] } } }
  
  ##

  # @NetClientDriver:
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 1e9fe47c03..9261101af2 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -128,6 +128,7 @@ static void vhost_vdpa_cleanup(NetClientState *nc)
  {
  VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
  
+g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_release);

  if (s->vhost_net) {
  vhost_net_cleanup(s->vhost_net);
  g_free(s->vhost_net);
@@ -187,13 +188,23 @@ static NetClientInfo net_vhost_vdpa_info = {
  .check_peer_type = vhost_vdpa_check_peer_type,
  };
  
+static int vhost_vdpa_get_iova_range(int fd,

+ struct vhost_vdpa_iova_range *iova_range)
+{
+int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range);
+
+return ret < 0 ? -errno : 0;
+}
+
  static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
-   const char *device,
-   const char *name,
-   int vdpa_device_fd,
-   int queue_pair_index,
-   int nvqs,
-   bool is_datapath)
+   const char *device,
+   const char *name,
+   int vdpa_device_fd,
+   int queue_pair_index,
+   int nvqs,
+   bool is_datapath,



It's better not to mix style changes in here (especially as the existing style looks correct).



+   bool svq,
+   VhostIOVATree *iova_tree)
  {
  NetClientState *nc = NULL;
  VhostVDPAState *s;
@@ -211,8 +222,14 @@ static NetClientState *net_vhost_vdpa_init(NetClientState 
*peer,
  
  s->vhost_vdpa.device_fd = vdpa_device_fd;

  s->vhost_vdpa.index = queue_pair_index;
+s->vhost_vdpa.shadow_vqs_enabled = svq;
+s->vhost_vdpa.iova_tree = iova_tree ? vhost_iova_tree_acquire(iova_tree) :
+  NULL;
  ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
  if (ret) {
+if (iova_tree) {
+vhost_iova_tree_release(iova_tree);
+}
  qemu_del_net_client(nc);
  return NULL;
  }
@@ -266,6 +283,7 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char 
*name,
  g_autofree NetClientState **ncs = NULL;
  NetClientState *nc;
  int queue_pairs, i, has_cvq = 0;
+g_autoptr(VhostIOVATree) iova_tree = NULL;
  
  assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);

  opts = &netdev->u.vhost_vdpa;
@@ -285,19 +303,31 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char 
*name,
  qemu_close(vdpa_device_fd);
  return queue_pairs;
  }
+if (opts->x_svq) {
+struct vhost_vdpa_iova_range iova_range;
+
+if (has_cvq) {
+error_setg(errp, "vdpa svq does not work with cvq");
+goto err_svq;
+}
+vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
+iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last);
+}
  
  ncs = g_malloc0(sizeof(*ncs) * queue_pairs);
  
  for (i = 0; i < queue_pairs; i++) {

  ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
- vdpa_device_fd, i, 2, true);
+ vdpa_device_fd, i, 2, true, opts->x_svq,
+ iova_tree);
  if (!ncs[i])
  goto err;
  }
  
  if (has_cvq) {

  nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
- vdpa_device_fd, i, 1, false);
+ vdpa_device_fd, i, 1, false, opts->x_svq,
+ iova_tree);



So we had at most 1 iova_tree 

Re: [RFC PATCH v7 07/25] vhost: Add reference counting to vhost_iova_tree

2022-04-13 Thread Jason Wang



On 2022/4/14 00:31, Eugenio Pérez wrote:

Now that different vqs can have different ASIDs, it's easier to track
them using reference counters.

QEMU's glib version still does not have them, so we've copied g_rc_box;
the implementation can be converted to glib's own when the minimum
version is raised.

Signed-off-by: Eugenio Pérez 
---



I'm not sure if it's too early to introduce things like this since we 
have at most 2 ASIDs. This is probably only needed when we want to 
expose ASIDs to the guest.


Let's see how it goes for the following patch anyhow.

Thanks



  hw/virtio/vhost-iova-tree.h |  5 +++--
  hw/virtio/vhost-iova-tree.c | 21 +++--
  2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h
index 6a4f24e0f9..2fc825d7b1 100644
--- a/hw/virtio/vhost-iova-tree.h
+++ b/hw/virtio/vhost-iova-tree.h
@@ -16,8 +16,9 @@
  typedef struct VhostIOVATree VhostIOVATree;
  
  VhostIOVATree *vhost_iova_tree_new(uint64_t iova_first, uint64_t iova_last);

-void vhost_iova_tree_delete(VhostIOVATree *iova_tree);
-G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete);
+VhostIOVATree *vhost_iova_tree_acquire(VhostIOVATree *iova_tree);
+void vhost_iova_tree_release(VhostIOVATree *iova_tree);
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_release);
  
  const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree,

  const DMAMap *map);
diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c
index 55fed1fefb..31445cbdfc 100644
--- a/hw/virtio/vhost-iova-tree.c
+++ b/hw/virtio/vhost-iova-tree.c
@@ -28,6 +28,9 @@ struct VhostIOVATree {
  
  /* IOVA address to qemu memory maps. */

  IOVATree *iova_taddr_map;
+
+/* Reference count */
+size_t refcnt;
  };
  
  /**

@@ -44,14 +47,28 @@ VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, 
hwaddr iova_last)
  tree->iova_last = iova_last;
  
  tree->iova_taddr_map = iova_tree_new();

+tree->refcnt = 1;
  return tree;
  }
  
  /**

- * Delete an iova tree
+ * Increases the reference count of the iova tree
+ */
+VhostIOVATree *vhost_iova_tree_acquire(VhostIOVATree *iova_tree)
+{
+++iova_tree->refcnt;
+return iova_tree;
+}
+
+/**
+ * Decrease reference counter of iova tree, freeing if it reaches 0
   */
-void vhost_iova_tree_delete(VhostIOVATree *iova_tree)
+void vhost_iova_tree_release(VhostIOVATree *iova_tree)
  {
+if (--iova_tree->refcnt) {
+return;
+}
+
  iova_tree_destroy(iova_tree->iova_taddr_map);
  g_free(iova_tree);
  }
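As the commit message notes, this open-codes what GLib's g_rc_box API
(available since GLib 2.58) provides. A sketch of what the conversion could
look like once the minimum GLib version is raised (hypothetical; it assumes
a vhost_iova_tree_free() helper holding the two destroy calls above):

    /* Hypothetical g_rc_box-based version; allocation would then use
     * g_rc_box_new0(VhostIOVATree), which keeps the counter in a hidden
     * header in front of the struct, so the explicit refcnt field goes
     * away. */
    VhostIOVATree *vhost_iova_tree_acquire(VhostIOVATree *iova_tree)
    {
        return g_rc_box_acquire(iova_tree);
    }

    void vhost_iova_tree_release(VhostIOVATree *iova_tree)
    {
        /* Runs vhost_iova_tree_free() only when the count reaches 0. */
        g_rc_box_release_full(iova_tree, (GDestroyNotify)vhost_iova_tree_free);
    }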





Re: [PATCH] docs: Correct the default thread-pool-size

2022-04-13 Thread liuyd.f...@fujitsu.com
[+cc vgo...@redhat.com]

On 4/13/22 12:20 PM, Liu Yiding wrote:
> Refer to 26ec190964 virtiofsd: Do not use a thread pool by default
>
> Signed-off-by: Liu Yiding 
> ---
>   docs/tools/virtiofsd.rst | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/docs/tools/virtiofsd.rst b/docs/tools/virtiofsd.rst
> index 0c0560203c..33fed08c6f 100644
> --- a/docs/tools/virtiofsd.rst
> +++ b/docs/tools/virtiofsd.rst
> @@ -127,7 +127,7 @@ Options
>   .. option:: --thread-pool-size=NUM
>   
> Restrict the number of worker threads per request queue to NUM.  The 
> default
> -  is 64.
> +  is 0.
>   
>   .. option:: --cache=none|auto|always
>   
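With the new default of 0, a deployment that still wants a worker pool has
to opt in explicitly, e.g. (socket path and shared directory are
illustrative):

    virtiofsd --socket-path=/tmp/vhostqemu -o source=/srv/share --thread-pool-size=64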

-- 
Best Regards.
Yiding Liu


Re: [RFC PATCH v7 06/25] vdpa: Send all updates in memory listener commit

2022-04-13 Thread Jason Wang



On 2022/4/14 00:31, Eugenio Pérez wrote:

With the introduction of multiple ASIDs, it can happen that many changes
on different listeners come before the commit call.



I think we have at most one listener even for the case of MQ/CVQ?



  Since kernel vhost-vdpa
still does not support it, send it all in one shot.

This also has one extra advantage: if there is no update to notify, we
save the iotlb_{begin,end} calls.

Signed-off-by: Eugenio Pérez 
---
  include/hw/virtio/vhost-vdpa.h |  2 +-
  hw/virtio/vhost-vdpa.c | 69 +-
  2 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index a29dbb3f53..4961acea8b 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -27,7 +27,7 @@ typedef struct vhost_vdpa {
  int device_fd;
  int index;
  uint32_t msg_type;
-bool iotlb_batch_begin_sent;
+GArray *iotlb_updates;
  MemoryListener listener;
  struct vhost_vdpa_iova_range iova_range;
  uint64_t acked_features;
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 1f229ff4cb..27ee678dc9 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -85,6 +85,11 @@ static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr 
iova, hwaddr size,
  msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
  msg.iotlb.type = VHOST_IOTLB_UPDATE;
  
+if (v->dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_IOTLB_BATCH)) {

+g_array_append_val(v->iotlb_updates, msg);
+return 0;
+}



I think it's better to use a consistent path for the !batch and batch
cases (e.g. we can do this even for a backend that doesn't support
batching?)


Otherwise the code is hard to maintain.
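For context, the v->iotlb_updates array that the hunks above append to has
to be created before the memory listener runs; a plausible initialization
(the allocation hunk itself is not shown in this excerpt) would be:

    /* One element per pending VHOST_IOTLB_UPDATE / INVALIDATE message. */
    v->iotlb_updates = g_array_new(false, false, sizeof(struct vhost_msg_v2));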



+
 trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
  msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);
  
@@ -109,6 +114,11 @@ static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova,

  msg.iotlb.size = size;
  msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
  
+if (v->dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_IOTLB_BATCH)) {

+g_array_append_val(v->iotlb_updates, msg);
+return 0;
+}
+
  trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
 msg.iotlb.size, msg.iotlb.type);
  
@@ -121,56 +131,47 @@ static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova,

  return ret;
  }
  
-static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)

-{
-int fd = v->device_fd;
-struct vhost_msg_v2 msg = {
-.type = v->msg_type,
-.iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
-};
-
-trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
-if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
-error_report("failed to write, fd=%d, errno=%d (%s)",
- fd, errno, strerror(errno));
-}
-}
-
-static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
-{
-if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
-!v->iotlb_batch_begin_sent) {
-vhost_vdpa_listener_begin_batch(v);
-}
-
-v->iotlb_batch_begin_sent = true;
-}
-
  static void vhost_vdpa_listener_commit(MemoryListener *listener)
  {
  struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, 
listener);
-struct vhost_dev *dev = v->dev;
  struct vhost_msg_v2 msg = {};
  int fd = v->device_fd;
+size_t num = v->iotlb_updates->len;
  
-if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {

+if (!num) {
  return;
  }
  
-if (!v->iotlb_batch_begin_sent) {

-return;
+msg.type = v->msg_type;
+msg.iotlb.type = VHOST_IOTLB_BATCH_BEGIN;
+trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
+if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {



We need to check whether the vhost-vDPA backend supports batching first?



+error_report("failed to write BEGIN_BATCH, fd=%d, errno=%d (%s)",
+ fd, errno, strerror(errno));
+goto done;
  }
  
-msg.type = v->msg_type;

-msg.iotlb.type = VHOST_IOTLB_BATCH_END;
+for (size_t i = 0; i < num; ++i) {
+struct vhost_msg_v2 *update = &g_array_index(v->iotlb_updates,
+ struct vhost_msg_v2, i);
+if (write(fd, update, sizeof(*update)) != sizeof(*update)) {
+error_report("failed to write dma update, fd=%d, errno=%d (%s)",
+ fd, errno, strerror(errno));
+goto done;



Maybe it's time to introduce v3 to allow a batch of messages to be 
passed to vhost-vDPA in a single system call.


Thanks



+}
+}
  
+msg.iotlb.type = VHOST_IOTLB_BATCH_END;

  trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
  if (write(fd, &msg, sizeof(msg)) != 

Re: [PATCH v4 3/3] multifd: Implement zerocopy write in multifd migration (multifd-zerocopy)

2022-04-13 Thread Leonardo Bras Soares Passos
Hello Juan,

Sorry to go back this early in the discussion, but I was reviewing for v9
and I am not sure if I am unable to recall the reason, or if I missed an
argument here.
Could you please help me with this?

On Tue, Nov 2, 2021 at 9:32 AM Juan Quintela  wrote:
>
> Leonardo Bras  wrote:
> > Implement zerocopy on nocomp_send_write(), by making use of QIOChannel
> > zerocopy interface.
> >
> > Change multifd_send_sync_main() so it can distinguish the last sync from
> > the setup and per-iteration ones, so a flush_zerocopy() can be called
> > at the last sync in order to make sure all RAM is sent before finishing
> > the migration.
>
> You need to do this after each iteration.  Otherwise it can happen that:
>
> channel 1:   channel 2:
>
>send page 11
>
> next iteration
>  send page 11
>
>  this page arrives
>
> now arrives this old copy.
>
> After each iteration, one needs to be sure that no ram is inflight.
>
> This means that I think you don't need the last_sync parameter at all,
> as you have to do the flush() in every iteration.

The flush command is used to guarantee that every packet queued before
the flush is actually sent before flush returns.
I mean, flushing every iteration will not help with the situation
above, where the pages are sent in order but arrive at the target in a
different order.

There is a chance that in the above text you meant 'send page' as
"queue page for sending", and 'page arrives' as "actually send the
queued page".
If that is correct, then syncing every iteration should not be necessary:
- On page queue, Linux saves the page address and size for sending.
- On actual send, Linux transmits whatever data the page contains at that
moment.

So, in this example, if page 11 from iteration 'i' happens to be
'actually sent' after page 11 from iteration 'i+1', it would not be an
issue:
###
channel 1:   channel 2:
Iteration i

queue page 11 (i)

iteration i+1
  queue page 11 (i+1)
  actually send page 11 (i+1)

actually send page 11 (i)
###

That's because page 11 (i) will contain a newer version compared to
page 11 (i+1).

tl;dr:
- The page content always depends on the send time, instead of queue time.
- The iteration count describes the queue time.
(on non-zerocopy it's the opposite: the content depends on queue time,
because the memory is copied during enqueue)
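A minimal sketch of the mechanism being described (illustrative, not QEMU
code; it assumes SO_ZEROCOPY has already been set on the socket, and a Linux
host where MSG_ZEROCOPY is defined):

    #include <sys/socket.h>
    #include <sys/uio.h>

    /* 'Queue page for sending': sendmsg() returns once the page is pinned
     * and queued. The kernel reads the page contents later, at actual
     * transmit time, and reports completion on the socket error queue. */
    static ssize_t queue_page_zerocopy(int fd, void *page, size_t len)
    {
        struct iovec iov = { .iov_base = page, .iov_len = len };
        struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };

        return sendmsg(fd, &msg, MSG_ZEROCOPY);
    }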

>
[...]

Juan, could you please help me understand if I am missing a part of
your argument up there?
Also, is syncing every iteration still necessary/recommended?

Best regards,
Leo




Re: [RFC PATCH v7 05/25] hw/virtio: Replace g_memdup() by g_memdup2()

2022-04-13 Thread Jason Wang



On 2022/4/14 00:31, Eugenio Pérez wrote:

From: Philippe Mathieu-Daudé 

Per 
https://discourse.gnome.org/t/port-your-module-from-g-memdup-to-g-memdup2-now/5538

   The old API took the size of the memory to duplicate as a guint,
   whereas most memory functions take memory sizes as a gsize. This
   made it easy to accidentally pass a gsize to g_memdup(). For large
   values, that would lead to a silent truncation of the size from 64
   to 32 bits, and result in a heap area being returned which is
   significantly smaller than what the caller expects. This can likely
   be exploited in various modules to cause a heap buffer overflow.

Replace g_memdup() by the safer g_memdup2() wrapper.
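For reference, the difference is only the type of the size argument
(signatures as documented by GLib; g_memdup2() needs GLib >= 2.68 or a
compat wrapper on older versions):

    gpointer g_memdup  (gconstpointer mem, guint byte_size);  /* 32-bit size */
    gpointer g_memdup2 (gconstpointer mem, gsize byte_size);  /* 64-bit size */

With g_memdup(), a gsize of 0x100000010 (4 GiB + 16) is silently truncated
to 0x10, so the caller gets a 16-byte heap area while believing it owns far
more.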

Signed-off-by: Philippe Mathieu-Daudé 
---



Acked-by: Jason Wang 



  hw/net/virtio-net.c   | 3 ++-
  hw/virtio/virtio-crypto.c | 6 +++---
  2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 1067e72b39..e4748a7e6c 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1443,7 +1443,8 @@ static void virtio_net_handle_ctrl(VirtIODevice *vdev, 
VirtQueue *vq)
  }
  
  iov_cnt = elem->out_num;

-iov2 = iov = g_memdup(elem->out_sg, sizeof(struct iovec) * 
elem->out_num);
+iov2 = iov = g_memdup2(elem->out_sg,
+   sizeof(struct iovec) * elem->out_num);
  s = iov_to_buf(iov, iov_cnt, 0, &ctrl, sizeof(ctrl));
  iov_discard_front(&iov, &iov_cnt, sizeof(ctrl));
  if (s != sizeof(ctrl)) {
diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
index dcd80b904d..0e31e3cc04 100644
--- a/hw/virtio/virtio-crypto.c
+++ b/hw/virtio/virtio-crypto.c
@@ -242,7 +242,7 @@ static void virtio_crypto_handle_ctrl(VirtIODevice *vdev, 
VirtQueue *vq)
  }
  
  out_num = elem->out_num;

-out_iov_copy = g_memdup(elem->out_sg, sizeof(out_iov[0]) * out_num);
+out_iov_copy = g_memdup2(elem->out_sg, sizeof(out_iov[0]) * out_num);
  out_iov = out_iov_copy;
  
  in_num = elem->in_num;

@@ -605,11 +605,11 @@ virtio_crypto_handle_request(VirtIOCryptoReq *request)
  }
  
  out_num = elem->out_num;

-out_iov_copy = g_memdup(elem->out_sg, sizeof(out_iov[0]) * out_num);
+out_iov_copy = g_memdup2(elem->out_sg, sizeof(out_iov[0]) * out_num);
  out_iov = out_iov_copy;
  
  in_num = elem->in_num;

-in_iov_copy = g_memdup(elem->in_sg, sizeof(in_iov[0]) * in_num);
+in_iov_copy = g_memdup2(elem->in_sg, sizeof(in_iov[0]) * in_num);
  in_iov = in_iov_copy;
  
  if (unlikely(iov_to_buf(out_iov, out_num, 0, &req, sizeof(req))





Re: [RFC PATCH v7 03/25] vdpa: Fix bad index calculus at vhost_vdpa_get_vring_base

2022-04-13 Thread Jason Wang



On 2022/4/14 00:31, Eugenio Pérez wrote:

Fixes: 6d0b222666 ("vdpa: Adapt vhost_vdpa_get_vring_base to SVQ")

Signed-off-by: Eugenio Pérez 



Acked-by: Jason Wang 



---
  hw/virtio/vhost-vdpa.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 9e5fe15d03..1f229ff4cb 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -1172,11 +1172,11 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev 
*dev,
 struct vhost_vring_state *ring)
  {
  struct vhost_vdpa *v = dev->opaque;
+int vdpa_idx = ring->index - dev->vq_index;
  int ret;
  
  if (v->shadow_vqs_enabled) {

-VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
-  ring->index);
+VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
  
  /*

   * Setting base as last used idx, so destination will see as available
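A quick sanity check of the index math (made-up values, not from the patch):

    /* A vhost_dev covering VQ pair 1, i.e. global VQs 2 and 3: */
    dev->vq_index = 2;
    ring->index   = 3;       /* global index of the queried VQ      */
    vdpa_idx      = 3 - 2;   /* == 1, the right slot in shadow_vqs  */
    /* The old code indexed shadow_vqs[ring->index] == shadow_vqs[3],
     * past the end of this device's two-entry array. */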





Re: [RFC PATCH v7 04/25] util: Return void on iova_tree_remove

2022-04-13 Thread Jason Wang



On 2022/4/14 00:31, Eugenio Pérez wrote:

It always returns IOVA_OK, so nobody uses the return value.

Signed-off-by: Eugenio Pérez 
---



Acked-by: Jason Wang 



  include/qemu/iova-tree.h | 4 +---
  util/iova-tree.c | 4 +---
  2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
index c938fb0793..16bbfdf5f8 100644
--- a/include/qemu/iova-tree.h
+++ b/include/qemu/iova-tree.h
@@ -72,10 +72,8 @@ int iova_tree_insert(IOVATree *tree, const DMAMap *map);
   * provided.  The range does not need to be exactly what has inserted,
   * all the mappings that are included in the provided range will be
   * removed from the tree.  Here map->translated_addr is meaningless.
- *
- * Return: 0 if succeeded, or <0 if error.
   */
-int iova_tree_remove(IOVATree *tree, const DMAMap *map);
+void iova_tree_remove(IOVATree *tree, const DMAMap *map);
  
  /**

   * iova_tree_find:
diff --git a/util/iova-tree.c b/util/iova-tree.c
index 6dff29c1f6..fee530a579 100644
--- a/util/iova-tree.c
+++ b/util/iova-tree.c
@@ -164,15 +164,13 @@ void iova_tree_foreach(IOVATree *tree, iova_tree_iterator 
iterator)
  g_tree_foreach(tree->tree, iova_tree_traverse, iterator);
  }
  
-int iova_tree_remove(IOVATree *tree, const DMAMap *map)

+void iova_tree_remove(IOVATree *tree, const DMAMap *map)
  {
  const DMAMap *overlap;
  
  while ((overlap = iova_tree_find(tree, map))) {

  g_tree_remove(tree->tree, overlap);
  }
-
-return IOVA_OK;
  }
  
  /**





Re: [RFC PATCH v7 02/25] vdpa: Add missing tracing to batch mapping functions

2022-04-13 Thread Jason Wang



On 2022/4/14 00:31, Eugenio Pérez wrote:

These functions were not traced properly.

Signed-off-by: Eugenio Pérez 
---



Acked-by: Jason Wang 



  hw/virtio/vhost-vdpa.c | 2 ++
  hw/virtio/trace-events | 2 ++
  2 files changed, 4 insertions(+)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 8adf7c0b92..9e5fe15d03 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -129,6 +129,7 @@ static void vhost_vdpa_listener_begin_batch(struct 
vhost_vdpa *v)
  .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
  };
  
+trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);

  if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
  error_report("failed to write, fd=%d, errno=%d (%s)",
   fd, errno, strerror(errno));
@@ -163,6 +164,7 @@ static void vhost_vdpa_listener_commit(MemoryListener 
*listener)
  msg.type = v->msg_type;
  msg.iotlb.type = VHOST_IOTLB_BATCH_END;
  
+trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);

  if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
  error_report("failed to write, fd=%d, errno=%d (%s)",
   fd, errno, strerror(errno));
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index a5102eac9e..48d9d5 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -25,6 +25,8 @@ vhost_user_postcopy_waker_nomatch(const char *rb, uint64_t rb_offset) 
"%s + 0x%"
  # vhost-vdpa.c
  vhost_vdpa_dma_map(void *vdpa, int fd, uint32_t msg_type, uint64_t iova, uint64_t size, uint64_t uaddr, uint8_t perm, uint8_t type) 
"vdpa:%p fd: %d msg_type: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" uaddr: 0x%"PRIx64" 
perm: 0x%"PRIx8" type: %"PRIu8
  vhost_vdpa_dma_unmap(void *vdpa, int fd, uint32_t msg_type, uint64_t iova, uint64_t size, uint8_t type) "vdpa:%p 
fd: %d msg_type: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" type: %"PRIu8
+vhost_vdpa_listener_begin_batch(void *v, int fd, uint32_t msg_type, uint8_t type)  "vdpa:%p 
fd: %d msg_type: %"PRIu32" type: %"PRIu8
+vhost_vdpa_listener_commit(void *v, int fd, uint32_t msg_type, uint8_t type)  "vdpa:%p fd: %d 
msg_type: %"PRIu32" type: %"PRIu8
  vhost_vdpa_listener_region_add(void *vdpa, uint64_t iova, uint64_t llend, void *vaddr, bool readonly) 
"vdpa: %p iova 0x%"PRIx64" llend 0x%"PRIx64" vaddr: %p read-only: %d"
  vhost_vdpa_listener_region_del(void *vdpa, uint64_t iova, uint64_t llend) "vdpa: %p iova 
0x%"PRIx64" llend 0x%"PRIx64
  vhost_vdpa_add_status(void *dev, uint8_t status) "dev: %p status: 0x%"PRIx8
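Once merged, the new events can be enabled at runtime like any other trace
point, e.g. (invocation is illustrative):

    qemu-system-x86_64 ... -trace 'vhost_vdpa_listener_*'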





Re: [RFC PATCH v7 01/25] vhost: Track descriptor chain in private at SVQ

2022-04-13 Thread Jason Wang



On 2022/4/14 00:31, Eugenio Pérez wrote:

Only the first one of them was properly enqueued back.



I wonder if it's better to use two patches:

1) using private chain

2) fix the chain issue

The patch itself looks good.

Thanks




While we're at it, harden SVQ: the device could have access to modify
them, and it definitely has access when we implement packed vq. Harden
SVQ by maintaining a private copy of the descriptor chain. Other fields
like buffer addresses are already maintained separately.

Fixes: 100890f7ca ("vhost: Shadow virtqueue buffers forwarding")

Signed-off-by: Eugenio Pérez 
---
  hw/virtio/vhost-shadow-virtqueue.h |  6 ++
  hw/virtio/vhost-shadow-virtqueue.c | 27 +--
  2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.h 
b/hw/virtio/vhost-shadow-virtqueue.h
index e5e24c536d..c132c994e9 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -53,6 +53,12 @@ typedef struct VhostShadowVirtqueue {
  /* Next VirtQueue element that guest made available */
  VirtQueueElement *next_guest_avail_elem;
  
+/*

+ * Backup next field for each descriptor so we can recover securely, not
+ * needing to trust the device access.
+ */
+uint16_t *desc_next;
+
  /* Next head to expose to the device */
  uint16_t shadow_avail_idx;
  
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c

index b232803d1b..a2531d5874 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -138,6 +138,7 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue 
*svq, hwaddr *sg,
  for (n = 0; n < num; n++) {
  if (more_descs || (n + 1 < num)) {
  descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
+descs[i].next = cpu_to_le16(svq->desc_next[i]);
  } else {
  descs[i].flags = flags;
  }
@@ -145,10 +146,10 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue 
*svq, hwaddr *sg,
  descs[i].len = cpu_to_le32(iovec[n].iov_len);
  
  last = i;

-i = cpu_to_le16(descs[i].next);
+i = cpu_to_le16(svq->desc_next[i]);
  }
  
-svq->free_head = le16_to_cpu(descs[last].next);

+svq->free_head = le16_to_cpu(svq->desc_next[last]);
  }
  
  static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,

@@ -333,13 +334,22 @@ static void 
vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
  svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
  }
  
+static uint16_t vhost_svq_last_desc_of_chain(const VhostShadowVirtqueue *svq,

+ uint16_t num, uint16_t i)
+{
+for (uint16_t j = 0; j < num; ++j) {
+i = le16_to_cpu(svq->desc_next[i]);
+}
+
+return i;
+}
+
  static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
 uint32_t *len)
  {
-vring_desc_t *descs = svq->vring.desc;
  const vring_used_t *used = svq->vring.used;
  vring_used_elem_t used_elem;
-uint16_t last_used;
+uint16_t last_used, last_used_chain, num;
  
  if (!vhost_svq_more_used(svq)) {

  return NULL;
@@ -365,7 +375,10 @@ static VirtQueueElement 
*vhost_svq_get_buf(VhostShadowVirtqueue *svq,
  return NULL;
  }
  
-descs[used_elem.id].next = svq->free_head;

+num = svq->ring_id_maps[used_elem.id]->in_num +
+  svq->ring_id_maps[used_elem.id]->out_num;
+last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id);
+svq->desc_next[last_used_chain] = svq->free_head;
  svq->free_head = used_elem.id;
  
  *len = used_elem.len;

@@ -540,8 +553,9 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, 
VirtIODevice *vdev,
  svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
  memset(svq->vring.used, 0, device_size);
  svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num);
+svq->desc_next = g_new0(uint16_t, svq->vring.num);
  for (unsigned i = 0; i < svq->vring.num - 1; i++) {
-svq->vring.desc[i].next = cpu_to_le16(i + 1);
+svq->desc_next[i] = cpu_to_le16(i + 1);
  }
  }
  
@@ -574,6 +588,7 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)

  virtqueue_detach_element(svq->vq, next_avail_elem, 0);
  }
  svq->vq = NULL;
+g_free(svq->desc_next);
  g_free(svq->ring_id_maps);
  qemu_vfree(svq->vring.desc);
  qemu_vfree(svq->vring.used);





Re: [RFC PATCH v5 06/23] vdpa: Add x-svq to NetdevVhostVDPAOptions

2022-04-13 Thread Jason Wang



On 2022/4/8 21:33, Eugenio Pérez wrote:

Finally offering the possibility to enable SVQ from the command line.

Signed-off-by: Eugenio Pérez 
---
  qapi/net.json|  9 -
  net/vhost-vdpa.c | 48 
  2 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/qapi/net.json b/qapi/net.json
index b92f3f5fb4..92848e4362 100644
--- a/qapi/net.json
+++ b/qapi/net.json
@@ -445,12 +445,19 @@
  # @queues: number of queues to be created for multiqueue vhost-vdpa
  #  (default: 1)
  #
+# @x-svq: Start device with (experimental) shadow virtqueue. (Since 7.1)
+# (default: false)
+#
+# Features:
+# @unstable: Member @x-svq is experimental.
+#
  # Since: 5.1
  ##
  { 'struct': 'NetdevVhostVDPAOptions',
'data': {
  '*vhostdev': 'str',
-'*queues':   'int' } }
+'*queues':   'int',
+'*x-svq':{'type': 'bool', 'features' : [ 'unstable'] } } }
  
  ##

  # @NetClientDriver:
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 1e9fe47c03..def738998b 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -127,7 +127,11 @@ err_init:
  static void vhost_vdpa_cleanup(NetClientState *nc)
  {
  VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
+struct vhost_dev *dev = s->vhost_vdpa.dev;
  
+if (dev && dev->vq_index + dev->nvqs == dev->vq_index_end) {

+g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
+}
  if (s->vhost_net) {
  vhost_net_cleanup(s->vhost_net);
  g_free(s->vhost_net);
@@ -187,13 +191,23 @@ static NetClientInfo net_vhost_vdpa_info = {
  .check_peer_type = vhost_vdpa_check_peer_type,
  };
  
+static int vhost_vdpa_get_iova_range(int fd,

+ struct vhost_vdpa_iova_range *iova_range)
+{
+int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range);
+
+return ret < 0 ? -errno : 0;
+}
+
  static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
-   const char *device,
-   const char *name,
-   int vdpa_device_fd,
-   int queue_pair_index,
-   int nvqs,
-   bool is_datapath)
+   const char *device,
+   const char *name,
+   int vdpa_device_fd,
+   int queue_pair_index,
+   int nvqs,
+   bool is_datapath,



It's better not to mix style changes with the logic changes.

Other looks fine.

Thanks



+   bool svq,
+   VhostIOVATree *iova_tree)
  {
  NetClientState *nc = NULL;
  VhostVDPAState *s;
@@ -211,6 +225,8 @@ static NetClientState *net_vhost_vdpa_init(NetClientState 
*peer,
  
  s->vhost_vdpa.device_fd = vdpa_device_fd;

  s->vhost_vdpa.index = queue_pair_index;
+s->vhost_vdpa.shadow_vqs_enabled = svq;
+s->vhost_vdpa.iova_tree = iova_tree;
  ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
  if (ret) {
  qemu_del_net_client(nc);
@@ -266,6 +282,7 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char 
*name,
  g_autofree NetClientState **ncs = NULL;
  NetClientState *nc;
  int queue_pairs, i, has_cvq = 0;
+g_autoptr(VhostIOVATree) iova_tree = NULL;
  
  assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);

  opts = &netdev->u.vhost_vdpa;
@@ -285,29 +302,44 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char 
*name,
  qemu_close(vdpa_device_fd);
  return queue_pairs;
  }
+if (opts->x_svq) {
+struct vhost_vdpa_iova_range iova_range;
+
+if (has_cvq) {
+error_setg(errp, "vdpa svq does not work with cvq");
+goto err_svq;
+}
+vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
+iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last);
+}
  
  ncs = g_malloc0(sizeof(*ncs) * queue_pairs);
  
  for (i = 0; i < queue_pairs; i++) {

  ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
- vdpa_device_fd, i, 2, true);
+ vdpa_device_fd, i, 2, true, opts->x_svq,
+ iova_tree);
  if (!ncs[i])
  goto err;
  }
  
  if (has_cvq) {

  nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
- vdpa_device_fd, i, 1, false);
+ vdpa_device_fd, i, 1, false, opts->x_svq,
+ iova_tree);
  if (!nc)
  goto err;
  }
  
+iova_tree = 

Re: [RFC PATCH v5 05/23] vhost: Fix bad return of descriptors to SVQ

2022-04-13 Thread Jason Wang



On 2022/4/8 21:33, Eugenio Pérez wrote:

Only the first one of them was properly enqueued back.

Fixes: 100890f7ca ("vhost: Shadow virtqueue buffers forwarding")
Signed-off-by: Eugenio Pérez 
---



Acked-by: Jason Wang 



  hw/virtio/vhost-shadow-virtqueue.c | 17 +++--
  1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c 
b/hw/virtio/vhost-shadow-virtqueue.c
index b232803d1b..c17506df20 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -333,13 +333,25 @@ static void 
vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
  svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
  }
  
+static uint16_t vhost_svq_last_desc_of_chain(VhostShadowVirtqueue *svq,

+ uint16_t i)
+{
+vring_desc_t *descs = svq->vring.desc;
+
+while (le16_to_cpu(descs[i].flags) & VRING_DESC_F_NEXT) {
+i = le16_to_cpu(descs[i].next);
+}
+
+return i;
+}
+
  static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
 uint32_t *len)
  {
  vring_desc_t *descs = svq->vring.desc;
  const vring_used_t *used = svq->vring.used;
  vring_used_elem_t used_elem;
-uint16_t last_used;
+uint16_t last_used, last_used_chain;
  
  if (!vhost_svq_more_used(svq)) {

  return NULL;
@@ -365,7 +377,8 @@ static VirtQueueElement 
*vhost_svq_get_buf(VhostShadowVirtqueue *svq,
  return NULL;
  }
  
-descs[used_elem.id].next = svq->free_head;

+last_used_chain = vhost_svq_last_desc_of_chain(svq, used_elem.id);
+descs[last_used_chain].next = svq->free_head;
  svq->free_head = used_elem.id;
  
  *len = used_elem.len;





Re: [RFC PATCH v5 04/23] hw/virtio: Replace g_memdup() by g_memdup2()

2022-04-13 Thread Jason Wang



On 2022/4/8 21:33, Eugenio Pérez wrote:

From: Philippe Mathieu-Daudé 

Per 
https://discourse.gnome.org/t/port-your-module-from-g-memdup-to-g-memdup2-now/5538

   The old API took the size of the memory to duplicate as a guint,
   whereas most memory functions take memory sizes as a gsize. This
   made it easy to accidentally pass a gsize to g_memdup(). For large
   values, that would lead to a silent truncation of the size from 64
   to 32 bits, and result in a heap area being returned which is
   significantly smaller than what the caller expects. This can likely
   be exploited in various modules to cause a heap buffer overflow.

Replace g_memdup() by the safer g_memdup2() wrapper.

Signed-off-by: Philippe Mathieu-Daudé 
---



Acked-by: Jason Wang 



  hw/net/virtio-net.c   | 3 ++-
  hw/virtio/virtio-crypto.c | 6 +++---
  2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 1067e72b39..e4748a7e6c 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1443,7 +1443,8 @@ static void virtio_net_handle_ctrl(VirtIODevice *vdev, 
VirtQueue *vq)
  }
  
  iov_cnt = elem->out_num;

-iov2 = iov = g_memdup(elem->out_sg, sizeof(struct iovec) * 
elem->out_num);
+iov2 = iov = g_memdup2(elem->out_sg,
+   sizeof(struct iovec) * elem->out_num);
  s = iov_to_buf(iov, iov_cnt, 0, &ctrl, sizeof(ctrl));
  iov_discard_front(&iov, &iov_cnt, sizeof(ctrl));
  if (s != sizeof(ctrl)) {
diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
index dcd80b904d..0e31e3cc04 100644
--- a/hw/virtio/virtio-crypto.c
+++ b/hw/virtio/virtio-crypto.c
@@ -242,7 +242,7 @@ static void virtio_crypto_handle_ctrl(VirtIODevice *vdev, 
VirtQueue *vq)
  }
  
  out_num = elem->out_num;

-out_iov_copy = g_memdup(elem->out_sg, sizeof(out_iov[0]) * out_num);
+out_iov_copy = g_memdup2(elem->out_sg, sizeof(out_iov[0]) * out_num);
  out_iov = out_iov_copy;
  
  in_num = elem->in_num;

@@ -605,11 +605,11 @@ virtio_crypto_handle_request(VirtIOCryptoReq *request)
  }
  
  out_num = elem->out_num;

-out_iov_copy = g_memdup(elem->out_sg, sizeof(out_iov[0]) * out_num);
+out_iov_copy = g_memdup2(elem->out_sg, sizeof(out_iov[0]) * out_num);
  out_iov = out_iov_copy;
  
  in_num = elem->in_num;

-in_iov_copy = g_memdup(elem->in_sg, sizeof(in_iov[0]) * in_num);
+in_iov_copy = g_memdup2(elem->in_sg, sizeof(in_iov[0]) * in_num);
  in_iov = in_iov_copy;
  
  if (unlikely(iov_to_buf(out_iov, out_num, 0, &req, sizeof(req))





Re: [RFC PATCH v5 03/23] util: Return void on iova_tree_remove

2022-04-13 Thread Jason Wang



On 2022/4/8 21:33, Eugenio Pérez wrote:

It always returns IOVA_OK, so nobody uses the return value.

Signed-off-by: Eugenio Pérez 



Acked-by: Jason Wang 



---
  include/qemu/iova-tree.h | 4 +---
  util/iova-tree.c | 4 +---
  2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
index c938fb0793..16bbfdf5f8 100644
--- a/include/qemu/iova-tree.h
+++ b/include/qemu/iova-tree.h
@@ -72,10 +72,8 @@ int iova_tree_insert(IOVATree *tree, const DMAMap *map);
   * provided.  The range does not need to be exactly what has inserted,
   * all the mappings that are included in the provided range will be
   * removed from the tree.  Here map->translated_addr is meaningless.
- *
- * Return: 0 if succeeded, or <0 if error.
   */
-int iova_tree_remove(IOVATree *tree, const DMAMap *map);
+void iova_tree_remove(IOVATree *tree, const DMAMap *map);
  
  /**

   * iova_tree_find:
diff --git a/util/iova-tree.c b/util/iova-tree.c
index 6dff29c1f6..fee530a579 100644
--- a/util/iova-tree.c
+++ b/util/iova-tree.c
@@ -164,15 +164,13 @@ void iova_tree_foreach(IOVATree *tree, iova_tree_iterator 
iterator)
  g_tree_foreach(tree->tree, iova_tree_traverse, iterator);
  }
  
-int iova_tree_remove(IOVATree *tree, const DMAMap *map)

+void iova_tree_remove(IOVATree *tree, const DMAMap *map)
  {
  const DMAMap *overlap;
  
  while ((overlap = iova_tree_find(tree, map))) {

  g_tree_remove(tree->tree, overlap);
  }
-
-return IOVA_OK;
  }
  
  /**





Re: [RFC PATCH v5 02/23] vdpa: Fix bad index calculus at vhost_vdpa_get_vring_base

2022-04-13 Thread Jason Wang



On 2022/4/8 21:33, Eugenio Pérez wrote:

Fixes: 6d0b222666 ("vdpa: Adapt vhost_vdpa_get_vring_base to SVQ")

Signed-off-by: Eugenio Pérez 
---



Acked-by: Jason Wang 

I think we need to use a separate patch for this.



  hw/virtio/vhost-vdpa.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 9e5fe15d03..1f229ff4cb 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -1172,11 +1172,11 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev 
*dev,
 struct vhost_vring_state *ring)
  {
  struct vhost_vdpa *v = dev->opaque;
+int vdpa_idx = ring->index - dev->vq_index;
  int ret;
  
  if (v->shadow_vqs_enabled) {

-VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
-  ring->index);
+VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
  
  /*

   * Setting base as last used idx, so destination will see as available





Re: [RFC PATCH v5 01/23] vdpa: Add missing tracing to batch mapping functions

2022-04-13 Thread Jason Wang



On 2022/4/8 21:33, Eugenio Pérez wrote:

These functions were not traced properly.

Signed-off-by: Eugenio Pérez 



Acked-by: Jason Wang 



---
  hw/virtio/vhost-vdpa.c | 2 ++
  hw/virtio/trace-events | 2 ++
  2 files changed, 4 insertions(+)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 8adf7c0b92..9e5fe15d03 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -129,6 +129,7 @@ static void vhost_vdpa_listener_begin_batch(struct 
vhost_vdpa *v)
  .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
  };
  
+trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);

  if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
  error_report("failed to write, fd=%d, errno=%d (%s)",
   fd, errno, strerror(errno));
@@ -163,6 +164,7 @@ static void vhost_vdpa_listener_commit(MemoryListener 
*listener)
  msg.type = v->msg_type;
  msg.iotlb.type = VHOST_IOTLB_BATCH_END;
  
+trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);

  if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
  error_report("failed to write, fd=%d, errno=%d (%s)",
   fd, errno, strerror(errno));
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index a5102eac9e..48d9d5 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -25,6 +25,8 @@ vhost_user_postcopy_waker_nomatch(const char *rb, uint64_t rb_offset) 
"%s + 0x%"
  # vhost-vdpa.c
  vhost_vdpa_dma_map(void *vdpa, int fd, uint32_t msg_type, uint64_t iova, uint64_t size, uint64_t uaddr, uint8_t perm, uint8_t type) 
"vdpa:%p fd: %d msg_type: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" uaddr: 0x%"PRIx64" 
perm: 0x%"PRIx8" type: %"PRIu8
  vhost_vdpa_dma_unmap(void *vdpa, int fd, uint32_t msg_type, uint64_t iova, uint64_t size, uint8_t type) "vdpa:%p 
fd: %d msg_type: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" type: %"PRIu8
+vhost_vdpa_listener_begin_batch(void *v, int fd, uint32_t msg_type, uint8_t type)  "vdpa:%p 
fd: %d msg_type: %"PRIu32" type: %"PRIu8
+vhost_vdpa_listener_commit(void *v, int fd, uint32_t msg_type, uint8_t type)  "vdpa:%p fd: %d 
msg_type: %"PRIu32" type: %"PRIu8
  vhost_vdpa_listener_region_add(void *vdpa, uint64_t iova, uint64_t llend, void *vaddr, bool readonly) 
"vdpa: %p iova 0x%"PRIx64" llend 0x%"PRIx64" vaddr: %p read-only: %d"
  vhost_vdpa_listener_region_del(void *vdpa, uint64_t iova, uint64_t llend) "vdpa: %p iova 
0x%"PRIx64" llend 0x%"PRIx64
  vhost_vdpa_add_status(void *dev, uint8_t status) "dev: %p status: 0x%"PRIx8





RE: [PATCH V2 0/4] COLO net and runstate bugfix/optimization

2022-04-13 Thread Zhang, Chen


> -Original Message-
> From: Jason Wang 
> Sent: Thursday, April 14, 2022 11:12 AM
> To: Zhang, Chen 
> Cc: Li Zhijian ; qemu-dev  de...@nongnu.org>
> Subject: Re: [PATCH V2 0/4] COLO net and runstate bugfix/optimization
> 
> On Thu, Apr 14, 2022 at 9:52 AM Zhang, Chen 
> wrote:
> >
> > No update for a while. Ping...
> >
> > Thanks
> > Chen
> 
> Hi:
> 
> It's near the release; I think we can queue this for 7.1?

Sure. I forgot to add the "for 7.1" tag.

Thanks
Chen

> 
> Thanks
> 
> >
> > > -Original Message-
> > > From: Zhang, Chen 
> > > Sent: Friday, April 1, 2022 11:47 AM
> > > To: Jason Wang ; Li Zhijian
> > > 
> > > Cc: Zhang, Chen ; qemu-dev  > > de...@nongnu.org>
> > > Subject: [PATCH V2 0/4] COLO net and runstate bugfix/optimization
> > >
> > > This series fixes some COLO-related issues found in internal stress testing.
> > >
> > >  - V2:
> > > - Add more comments in patch 2/4 commit log.
> > >
> > > Zhang Chen (4):
> > >   softmmu/runstate.c: add RunStateTransition support from COLO to
> > > PRELAUNCH
> > >   net/colo: Fix a "double free" crash to clear the conn_list
> > >   net/colo.c: No need to track conn_list for filter-rewriter
> > >   net/colo.c: fix segmentation fault when packet is not parsed
> > > correctly
> > >
> > >  net/colo-compare.c|  2 +-
> > >  net/colo.c| 11 +--
> > >  net/filter-rewriter.c |  2 +-
> > >  net/trace-events  |  1 +
> > >  softmmu/runstate.c|  1 +
> > >  5 files changed, 13 insertions(+), 4 deletions(-)
> > >
> > > --
> > > 2.25.1
> >



Re: [PATCH V2 0/4] COLO net and runstate bugfix/optimization

2022-04-13 Thread Jason Wang
On Thu, Apr 14, 2022 at 9:52 AM Zhang, Chen  wrote:
>
> No update for a while. Ping...
>
> Thanks
> Chen

Hi:

It's near the release; I think we can queue this for 7.1?

Thanks

>
> > -Original Message-
> > From: Zhang, Chen 
> > Sent: Friday, April 1, 2022 11:47 AM
> > To: Jason Wang ; Li Zhijian 
> > Cc: Zhang, Chen ; qemu-dev  > de...@nongnu.org>
> > Subject: [PATCH V2 0/4] COLO net and runstate bugfix/optimization
> >
> > This series fixes some COLO-related issues found in internal stress testing.
> >
> >  - V2:
> > - Add more comments in patch 2/4 commit log.
> >
> > Zhang Chen (4):
> >   softmmu/runstate.c: add RunStateTransition support from COLO to
> > PRELAUNCH
> >   net/colo: Fix a "double free" crash to clear the conn_list
> >   net/colo.c: No need to track conn_list for filter-rewriter
> >   net/colo.c: fix segmentation fault when packet is not parsed correctly
> >
> >  net/colo-compare.c|  2 +-
> >  net/colo.c| 11 +--
> >  net/filter-rewriter.c |  2 +-
> >  net/trace-events  |  1 +
> >  softmmu/runstate.c|  1 +
> >  5 files changed, 13 insertions(+), 4 deletions(-)
> >
> > --
> > 2.25.1
>




Re: [PATCH v5 4/4] hw/acpi/aml-build: Use existing CPU topology to build PPTT table

2022-04-13 Thread wangyanan (Y)

Hi Gavin,

On 2022/4/3 22:59, Gavin Shan wrote:

When the PPTT table is built, the CPU topology is re-calculated, but
that is unnecessary because the CPU topology has already been populated
in virt_possible_cpu_arch_ids() on the arm/virt machine.

This reworks build_pptt() to avoid that by reusing the existing topology
in ms->possible_cpus. Currently, the only user of build_pptt() is the
arm/virt machine.

Signed-off-by: Gavin Shan 
---
  hw/acpi/aml-build.c | 100 +---
  1 file changed, 38 insertions(+), 62 deletions(-)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 4086879ebf..4b0f9df3e3 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -2002,86 +2002,62 @@ void build_pptt(GArray *table_data, BIOSLinker *linker, 
MachineState *ms,
  const char *oem_id, const char *oem_table_id)
  {
  MachineClass *mc = MACHINE_GET_CLASS(ms);
-GQueue *list = g_queue_new();
-guint pptt_start = table_data->len;
-guint parent_offset;
-guint length, i;
-int uid = 0;
-int socket;
+CPUArchIdList *cpus = ms->possible_cpus;
+int64_t socket_id = -1, cluster_id = -1, core_id = -1;

nit: why not use "int" for the ID variables? (to be consistent with how
the IDs are stored in CpuInstanceProperties).

Thanks,
Yanan

+uint32_t socket_offset, cluster_offset, core_offset;
+uint32_t pptt_start = table_data->len;
+int n;
  AcpiTable table = { .sig = "PPTT", .rev = 2,
  .oem_id = oem_id, .oem_table_id = oem_table_id };
  
  acpi_table_begin(&table, table_data);
  
-for (socket = 0; socket < ms->smp.sockets; socket++) {

-g_queue_push_tail(list,
-GUINT_TO_POINTER(table_data->len - pptt_start));
-build_processor_hierarchy_node(
-table_data,
-/*
- * Physical package - represents the boundary
- * of a physical package
- */
-(1 << 0),
-0, socket, NULL, 0);
-}
+for (n = 0; n < cpus->len; n++) {
+if (cpus->cpus[n].props.socket_id != socket_id) {
+socket_id = cpus->cpus[n].props.socket_id;
+cluster_id = -1;
+core_id = -1;
+socket_offset = table_data->len - pptt_start;
+build_processor_hierarchy_node(table_data,
+(1 << 0), /* Physical package */
+0, socket_id, NULL, 0);
+}
  
-if (mc->smp_props.clusters_supported) {

-length = g_queue_get_length(list);
-for (i = 0; i < length; i++) {
-int cluster;
-
-parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list));
-for (cluster = 0; cluster < ms->smp.clusters; cluster++) {
-g_queue_push_tail(list,
-GUINT_TO_POINTER(table_data->len - pptt_start));
-build_processor_hierarchy_node(
-table_data,
-(0 << 0), /* not a physical package */
-parent_offset, cluster, NULL, 0);
+if (mc->smp_props.clusters_supported) {
+if (cpus->cpus[n].props.cluster_id != cluster_id) {
+cluster_id = cpus->cpus[n].props.cluster_id;
+core_id = -1;
+cluster_offset = table_data->len - pptt_start;
+build_processor_hierarchy_node(table_data,
+(0 << 0), /* Not a physical package */
+socket_offset, cluster_id, NULL, 0);
  }
+} else {
+cluster_offset = socket_offset;
  }
-}
  
-length = g_queue_get_length(list);

-for (i = 0; i < length; i++) {
-int core;
-
-parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list));
-for (core = 0; core < ms->smp.cores; core++) {
-if (ms->smp.threads > 1) {
-g_queue_push_tail(list,
-GUINT_TO_POINTER(table_data->len - pptt_start));
-build_processor_hierarchy_node(
-table_data,
+if (ms->smp.threads <= 1) {
+build_processor_hierarchy_node(table_data,
+(1 << 1) | /* ACPI Processor ID valid */
+(1 << 3),  /* Node is a Leaf */
+cluster_offset, n, NULL, 0);
+} else {
+if (cpus->cpus[n].props.core_id != core_id) {
+core_id = cpus->cpus[n].props.core_id;
+core_offset = table_data->len - pptt_start;
+build_processor_hierarchy_node(table_data,
  (0 << 0), /* not a physical package */
-parent_offset, core, NULL, 0);
-} else {
-build_processor_hierarchy_node(
-table_data,
-(1 << 1) | /* ACPI Processor ID valid */
-(1 << 3),  /* Node is a Leaf */
-parent_offset, uid++, NULL, 0);
+cluster_offset, core_id, NULL, 0);
  

Re: [PATCH v5 4/4] hw/acpi/aml-build: Use existing CPU topology to build PPTT table

2022-04-13 Thread wangyanan (Y)

On 2022/4/14 8:33, Gavin Shan wrote:

Hi Igor,

On 4/13/22 9:52 PM, Igor Mammedov wrote:

On Sun,  3 Apr 2022 22:59:53 +0800
Gavin Shan  wrote:


When the PPTT table is built, the CPU topology is re-calculated, but
that is unnecessary because the CPU topology has already been populated
in virt_possible_cpu_arch_ids() on the arm/virt machine.

This reworks build_pptt() to avoid that by reusing the existing topology
in ms->possible_cpus. Currently, the only user of build_pptt() is the
arm/virt machine.

Signed-off-by: Gavin Shan 
---
  hw/acpi/aml-build.c | 100 
+---

  1 file changed, 38 insertions(+), 62 deletions(-)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 4086879ebf..4b0f9df3e3 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -2002,86 +2002,62 @@ void build_pptt(GArray *table_data, 
BIOSLinker *linker, MachineState *ms,

  const char *oem_id, const char *oem_table_id)
  {
  MachineClass *mc = MACHINE_GET_CLASS(ms);
-    GQueue *list = g_queue_new();
-    guint pptt_start = table_data->len;
-    guint parent_offset;
-    guint length, i;
-    int uid = 0;
-    int socket;
+    CPUArchIdList *cpus = ms->possible_cpus;
+    int64_t socket_id = -1, cluster_id = -1, core_id = -1;
+    uint32_t socket_offset, cluster_offset, core_offset;
+    uint32_t pptt_start = table_data->len;
+    int n;
  AcpiTable table = { .sig = "PPTT", .rev = 2,
  .oem_id = oem_id, .oem_table_id = 
oem_table_id };

    acpi_table_begin(&table, table_data);
  -    for (socket = 0; socket < ms->smp.sockets; socket++) {
-    g_queue_push_tail(list,
-    GUINT_TO_POINTER(table_data->len - pptt_start));
-    build_processor_hierarchy_node(
-    table_data,
-    /*
- * Physical package - represents the boundary
- * of a physical package
- */
-    (1 << 0),
-    0, socket, NULL, 0);
-    }
+    for (n = 0; n < cpus->len; n++) {



+    if (cpus->cpus[n].props.socket_id != socket_id) {
+    socket_id = cpus->cpus[n].props.socket_id;


this relies on cpus->cpus[n].props.*_id being sorted from the top level
down

I'd add here, and for the other container IDs, an assert() that checks
that a specific ID only moves in one direction, to be able to detect
when the rule is broken.


Otherwise one may end up with duplicate containers silently.



Exactly. cpus->cpus[n].props.*_id is sorted as you said in
virt_possible_cpu_arch_ids().
The only user of build_pptt() is the arm/virt machine, so it's fine.
However, I think I may need to add comments for this in v6.

    /*
 * This works with the assumption that cpus[n].props.*_id has been
 * sorted from the top level down in mc->possible_cpu_arch_ids().
 * Otherwise, unexpected and duplicate containers will be created.

 */

The implementation in v3 looks complicated, but comprehensive. The one
in this revision (v6) looks simple, but we're losing flexibility :)
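One possible shape for the assert Igor suggests, shown for the top level
only (illustrative; cluster_id and core_id would get the same treatment
within their parent container):

    /* Sketch: possible_cpus must be sorted so that a socket container is
     * never revisited after a later socket_id has been seen. */
    for (n = 1; n < cpus->len; n++) {
        assert(cpus->cpus[n].props.socket_id >=
               cpus->cpus[n - 1].props.socket_id);
    }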



+    cluster_id = -1;
+    core_id = -1;
+    socket_offset = table_data->len - pptt_start;
+    build_processor_hierarchy_node(table_data,
+    (1 << 0), /* Physical package */
+    0, socket_id, NULL, 0);
+    }
  -    if (mc->smp_props.clusters_supported) {
-    length = g_queue_get_length(list);
-    for (i = 0; i < length; i++) {
-    int cluster;
-
-    parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list));
-    for (cluster = 0; cluster < ms->smp.clusters; cluster++) {
-    g_queue_push_tail(list,
-    GUINT_TO_POINTER(table_data->len - pptt_start));
-    build_processor_hierarchy_node(
-    table_data,
-    (0 << 0), /* not a physical package */
-    parent_offset, cluster, NULL, 0);
+    if (mc->smp_props.clusters_supported) {
+    if (cpus->cpus[n].props.cluster_id != cluster_id) {
+    cluster_id = cpus->cpus[n].props.cluster_id;
+    core_id = -1;
+    cluster_offset = table_data->len - pptt_start;
+    build_processor_hierarchy_node(table_data,
+    (0 << 0), /* Not a physical package */
+    socket_offset, cluster_id, NULL, 0);
  }
+    } else {
+    cluster_offset = socket_offset;
  }
-    }
  -    length = g_queue_get_length(list);
-    for (i = 0; i < length; i++) {
-    int core;
-
-    parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list));
-    for (core = 0; core < ms->smp.cores; core++) {
-    if (ms->smp.threads > 1) {
-    g_queue_push_tail(list,
-    GUINT_TO_POINTER(table_data->len - pptt_start));
-    build_processor_hierarchy_node(
-    table_data,
+    if (ms->smp.threads <= 1) {


why is <= used here instead of <?



It's the counterpart 

Re: [PATCH v5 2/4] hw/arm/virt: Consider SMP configuration in CPU topology

2022-04-13 Thread wangyanan (Y)

On 2022/4/14 10:37, Gavin Shan wrote:

Hi Yanan,

On 4/14/22 10:27 AM, wangyanan (Y) wrote:

On 2022/4/14 8:08, Gavin Shan wrote:

On 4/13/22 8:39 PM, wangyanan (Y) wrote:

On 2022/4/3 22:59, Gavin Shan wrote:

Currently, the SMP configuration isn't considered when the CPU
topology is populated. In this case, it's impossible to provide
the default CPU-to-NUMA mapping or association based on the socket
ID of the given CPU.

This takes the SMP configuration into account when the CPU topology
is populated. The die ID for the given CPU isn't assigned since
it's not supported on the arm/virt machine yet.

Signed-off-by: Gavin Shan 
---
  hw/arm/virt.c | 16 +++-
  1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index d2e5ecd234..3174526730 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -2505,6 +2505,7 @@ static const CPUArchIdList 
*virt_possible_cpu_arch_ids(MachineState *ms)

  int n;
  unsigned int max_cpus = ms->smp.max_cpus;
  VirtMachineState *vms = VIRT_MACHINE(ms);
+    MachineClass *mc = MACHINE_GET_CLASS(vms);
  if (ms->possible_cpus) {
  assert(ms->possible_cpus->len == max_cpus);
@@ -2518,8 +2519,21 @@ static const CPUArchIdList 
*virt_possible_cpu_arch_ids(MachineState *ms)

  ms->possible_cpus->cpus[n].type = ms->cpu_type;
  ms->possible_cpus->cpus[n].arch_id =
  virt_cpu_mp_affinity(vms, n);
+
+    assert(!mc->smp_props.dies_supported);
+ ms->possible_cpus->cpus[n].props.has_socket_id = true;
+    ms->possible_cpus->cpus[n].props.socket_id =
+    (n / (ms->smp.clusters * ms->smp.cores * 
ms->smp.threads)) %

+    ms->smp.sockets;

No need for "% ms->smp.sockets".


Yeah, lets remove it in v6.


+ ms->possible_cpus->cpus[n].props.has_cluster_id = true;
+    ms->possible_cpus->cpus[n].props.cluster_id =
+    (n / (ms->smp.cores * ms->smp.threads)) % 
ms->smp.clusters;

+ ms->possible_cpus->cpus[n].props.has_core_id = true;
+    ms->possible_cpus->cpus[n].props.core_id =
+    (n / ms->smp.threads) % ms->smp.cores;
ms->possible_cpus->cpus[n].props.has_thread_id = true;
-    ms->possible_cpus->cpus[n].props.thread_id = n;
+    ms->possible_cpus->cpus[n].props.thread_id =
+    n % ms->smp.threads;
  }
  return ms->possible_cpus;
  }

Otherwise, looks good to me:
Reviewed-by: Yanan Wang 



Thanks for your time to review :)



Oh, after further testing this patch breaks numa-test for aarch64,
which should be checked and fixed. I guess it's because we have
more IDs supported for ARM. We have to fully run the QEMU
tests before sending some patches to ensure that they are not
breaking anything. :)



Thanks for catching the failure and reporting back. I'm not
too familiar with QEMU's test framework. Could you please
share the detailed commands to reproduce the failure? I will
fix in v6, which will be done in a separate patch :)


There is a reference link: https://wiki.qemu.org/Testing
To catch the failure of this patch: "make check" will be enough.

Thanks,
Yanan

Thanks,
Gavin


.





Re: [PATCH v5 2/4] hw/arm/virt: Consider SMP configuration in CPU topology

2022-04-13 Thread Gavin Shan

Hi Yanan,

On 4/14/22 10:27 AM, wangyanan (Y) wrote:

On 2022/4/14 8:08, Gavin Shan wrote:

On 4/13/22 8:39 PM, wangyanan (Y) wrote:

On 2022/4/3 22:59, Gavin Shan wrote:

Currently, the SMP configuration isn't considered when the CPU
topology is populated. In this case, it's impossible to provide
the default CPU-to-NUMA mapping or association based on the socket
ID of the given CPU.

This takes account of SMP configuration when the CPU topology
is populated. The die ID for the given CPU isn't assigned since
it's not supported on arm/virt machine yet.

Signed-off-by: Gavin Shan 
---
  hw/arm/virt.c | 16 +++-
  1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index d2e5ecd234..3174526730 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -2505,6 +2505,7 @@ static const CPUArchIdList 
*virt_possible_cpu_arch_ids(MachineState *ms)
  int n;
  unsigned int max_cpus = ms->smp.max_cpus;
  VirtMachineState *vms = VIRT_MACHINE(ms);
+    MachineClass *mc = MACHINE_GET_CLASS(vms);
  if (ms->possible_cpus) {
  assert(ms->possible_cpus->len == max_cpus);
@@ -2518,8 +2519,21 @@ static const CPUArchIdList 
*virt_possible_cpu_arch_ids(MachineState *ms)
  ms->possible_cpus->cpus[n].type = ms->cpu_type;
  ms->possible_cpus->cpus[n].arch_id =
  virt_cpu_mp_affinity(vms, n);
+
+    assert(!mc->smp_props.dies_supported);
+    ms->possible_cpus->cpus[n].props.has_socket_id = true;
+    ms->possible_cpus->cpus[n].props.socket_id =
+    (n / (ms->smp.clusters * ms->smp.cores * ms->smp.threads)) %
+    ms->smp.sockets;

No need for "% ms->smp.sockets".


Yeah, lets remove it in v6.


+ ms->possible_cpus->cpus[n].props.has_cluster_id = true;
+    ms->possible_cpus->cpus[n].props.cluster_id =
+    (n / (ms->smp.cores * ms->smp.threads)) % ms->smp.clusters;
+    ms->possible_cpus->cpus[n].props.has_core_id = true;
+    ms->possible_cpus->cpus[n].props.core_id =
+    (n / ms->smp.threads) % ms->smp.cores;
  ms->possible_cpus->cpus[n].props.has_thread_id = true;
-    ms->possible_cpus->cpus[n].props.thread_id = n;
+    ms->possible_cpus->cpus[n].props.thread_id =
+    n % ms->smp.threads;
  }
  return ms->possible_cpus;
  }

Otherwise, looks good to me:
Reviewed-by: Yanan Wang 



Thanks for your time to review :)



Oh, after further testing this patch breaks numa-test for aarch64,
which should be checked and fixed. I guess it's because we have
more IDs supported for ARM. We have to fully run the QEMU
tests before sending some patches to ensure that they are not
breaking anything. :)



Thanks for catching the failure and reporting back. I'm not
too familiar with QEMU's test framework. Could you please
share the detailed commands to reproduce the failure? I will
fix in v6, which will be done in a separate patch :)

Thanks,
Gavin





Re: [PATCH v5 1/4] qapi/machine.json: Add cluster-id

2022-04-13 Thread wangyanan (Y)

Hi Gavin,

Cc: Daniel and Markus
On 2022/4/14 8:06, Gavin Shan wrote:

Hi Yanan,

On 4/13/22 7:49 PM, wangyanan (Y) wrote:

On 2022/4/3 22:59, Gavin Shan wrote:

This adds cluster-id in CPU instance properties, which will be used
by arm/virt machine. Besides, the cluster-id is also verified or
dumped in various spots:

   * hw/core/machine.c::machine_set_cpu_numa_node() to associate
 CPU with its NUMA node.

   * hw/core/machine.c::machine_numa_finish_cpu_init() to associate
 CPU with NUMA node when no default association is provided.

   * hw/core/machine-hmp-cmds.c::hmp_hotpluggable_cpus() to dump
 cluster-id.

Signed-off-by: Gavin Shan 
---
  hw/core/machine-hmp-cmds.c |  4 
  hw/core/machine.c  | 16 
  qapi/machine.json  |  6 --
  3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/hw/core/machine-hmp-cmds.c b/hw/core/machine-hmp-cmds.c
index 4e2f319aeb..5cb5eecbfc 100644
--- a/hw/core/machine-hmp-cmds.c
+++ b/hw/core/machine-hmp-cmds.c
@@ -77,6 +77,10 @@ void hmp_hotpluggable_cpus(Monitor *mon, const 
QDict *qdict)

  if (c->has_die_id) {
  monitor_printf(mon, "    die-id: \"%" PRIu64 "\"\n", 
c->die_id);

  }
+    if (c->has_cluster_id) {
+    monitor_printf(mon, "    cluster-id: \"%" PRIu64 "\"\n",
+   c->cluster_id);
+    }
  if (c->has_core_id) {
  monitor_printf(mon, "    core-id: \"%" PRIu64 "\"\n", 
c->core_id);

  }
diff --git a/hw/core/machine.c b/hw/core/machine.c
index d856485cb4..8748b64657 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -677,6 +677,11 @@ void machine_set_cpu_numa_node(MachineState 
*machine,

  return;
  }
+    if (props->has_cluster_id && !slot->props.has_cluster_id) {
+    error_setg(errp, "cluster-id is not supported");
+    return;
+    }
+
  if (props->has_socket_id && !slot->props.has_socket_id) {
  error_setg(errp, "socket-id is not supported");
  return;
@@ -696,6 +701,11 @@ void machine_set_cpu_numa_node(MachineState 
*machine,

  continue;
  }
+    if (props->has_cluster_id &&
+    props->cluster_id != slot->props.cluster_id) {
+    continue;
+    }
+
  if (props->has_die_id && props->die_id != 
slot->props.die_id) {

  continue;
  }
@@ -990,6 +1000,12 @@ static char *cpu_slot_to_string(const 
CPUArchId *cpu)

  }
  g_string_append_printf(s, "die-id: %"PRId64, 
cpu->props.die_id);

  }
+    if (cpu->props.has_cluster_id) {
+    if (s->len) {
+    g_string_append_printf(s, ", ");
+    }
+    g_string_append_printf(s, "cluster-id: %"PRId64, 
cpu->props.cluster_id);

+    }
  if (cpu->props.has_core_id) {
  if (s->len) {
  g_string_append_printf(s, ", ");
diff --git a/qapi/machine.json b/qapi/machine.json
index 9c460ec450..ea22b574b0 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -868,10 +868,11 @@
  # @node-id: NUMA node ID the CPU belongs to
  # @socket-id: socket number within node/board the CPU belongs to
  # @die-id: die number within socket the CPU belongs to (since 4.1)
-# @core-id: core number within die the CPU belongs to
+# @cluster-id: cluster number within die the CPU belongs to

We also need a "(since 7.1)" tag for cluster-id.

I remember this should be "cluster number within socket..."
according to Igor's comments in v3 ?


Igor suggested correcting the description for 'core-id' like
below, but he didn't suggest anything for 'cluster-id'. The question
is: are clusters sub-components of the die, instead of the socket, when
die is supported? You may want me to change it like below; please
confirm.

  @cluster-id: cluster number within die/socket the CPU belongs to

suggestion from Igor in v3:

   > +# @core-id: core number within cluster the CPU belongs to

   s:cluster:cluster/die:


We want "within cluster/die" description for core-id because we
support both "cores in cluster" for ARM and "cores in die" for X86.
Based on this rationale, we only need "within socket" for cluster-id
because we currently only support "clusters in socket". Does this
make sense?

Alternatively, the plainest documentation for the IDs is to simply
scope each *-id only to its next topology level, like below. This may avoid
increasing complexity when more topology IDs are inserted in the middle.
But whether this way is acceptable is up to the Maintainers. :)

# @socket-id: socket number within node/board the CPU belongs to
# @die-id: die number within socket the CPU belongs to (since 4.1)
# @cluster-id: cluster number within die the CPU belongs to (since 7.1)
# @core-id: core number within cluster the CPU belongs to
# @thread-id: thread number within core the CPU belongs to

Thanks,
Yanan



+# @core-id: core number within cluster/die the CPU belongs to
  # @thread-id: thread number within core the 

Re: [PATCH v5 2/4] hw/arm/virt: Consider SMP configuration in CPU topology

2022-04-13 Thread wangyanan (Y)

On 2022/4/14 8:08, Gavin Shan wrote:

Hi Yanan,

On 4/13/22 8:39 PM, wangyanan (Y) wrote:

On 2022/4/3 22:59, Gavin Shan wrote:

Currently, the SMP configuration isn't considered when the CPU
topology is populated. In this case, it's impossible to provide
the default CPU-to-NUMA mapping or association based on the socket
ID of the given CPU.

This takes account of SMP configuration when the CPU topology
is populated. The die ID for the given CPU isn't assigned since
it's not supported on arm/virt machine yet.

Signed-off-by: Gavin Shan 
---
  hw/arm/virt.c | 16 +++-
  1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index d2e5ecd234..3174526730 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -2505,6 +2505,7 @@ static const CPUArchIdList 
*virt_possible_cpu_arch_ids(MachineState *ms)

  int n;
  unsigned int max_cpus = ms->smp.max_cpus;
  VirtMachineState *vms = VIRT_MACHINE(ms);
+    MachineClass *mc = MACHINE_GET_CLASS(vms);
  if (ms->possible_cpus) {
  assert(ms->possible_cpus->len == max_cpus);
@@ -2518,8 +2519,21 @@ static const CPUArchIdList 
*virt_possible_cpu_arch_ids(MachineState *ms)

  ms->possible_cpus->cpus[n].type = ms->cpu_type;
  ms->possible_cpus->cpus[n].arch_id =
  virt_cpu_mp_affinity(vms, n);
+
+    assert(!mc->smp_props.dies_supported);
+    ms->possible_cpus->cpus[n].props.has_socket_id = true;
+    ms->possible_cpus->cpus[n].props.socket_id =
+    (n / (ms->smp.clusters * ms->smp.cores * 
ms->smp.threads)) %

+    ms->smp.sockets;

No need for "% ms->smp.sockets".


Yeah, lets remove it in v6.


+ ms->possible_cpus->cpus[n].props.has_cluster_id = true;
+    ms->possible_cpus->cpus[n].props.cluster_id =
+    (n / (ms->smp.cores * ms->smp.threads)) % 
ms->smp.clusters;

+    ms->possible_cpus->cpus[n].props.has_core_id = true;
+    ms->possible_cpus->cpus[n].props.core_id =
+    (n / ms->smp.threads) % ms->smp.cores;
  ms->possible_cpus->cpus[n].props.has_thread_id = true;
-    ms->possible_cpus->cpus[n].props.thread_id = n;
+    ms->possible_cpus->cpus[n].props.thread_id =
+    n % ms->smp.threads;
  }
  return ms->possible_cpus;
  }

Otherwise, looks good to me:
Reviewed-by: Yanan Wang 



Thanks for your time to review :)



Oh, after further testing this patch breaks numa-test for aarch64,
which should be checked and fixed. I guess it's because we have
more IDs supported for ARM. We have to fully run the QEMU
tests before sending some patches to ensure that they are not
breaking anything. :)

Thanks,
Yanan


.





Re: [PATCH v9 12/14] target/riscv: rvk: add CSR support for Zkr

2022-04-13 Thread Weiwei Li

Thanks for your comments.

在 2022/4/14 上午7:57, Alistair Francis 写道:

On Mon, Apr 11, 2022 at 2:46 PM Weiwei Li  wrote:

Hi, any comments on this patch or patchset?

Currently, a read-only instruction accessing the Seed CSR is checked as a
special case in helper_csrr, as suggested in

https://lists.nongnu.org/archive/html/qemu-riscv/2022-03/msg00146.html.

Ah sorry, I didn't realise you had updated this.


(The new version for that patch is in
https://lists.nongnu.org/archive/html/qemu-riscv/2022-03/msg00156.html)

Regards,

Weiwei Li

在 2022/3/18 下午12:19, Weiwei Li 写道:

   - add SEED CSR which must be accessed with a read-write instruction:
 A read-only instruction such as CSRRS/CSRRC with rs1=x0 or CSRRSI/CSRRCI
with uimm=0 will raise an illegal instruction exception.
   - add USEED, SSEED fields for MSECCFG CSR

Co-authored-by: Ruibo Lu 
Co-authored-by: Zewen Ye 
Signed-off-by: Weiwei Li 
Signed-off-by: Junqiang Wang 
---
   target/riscv/cpu_bits.h  |  9 ++
   target/riscv/csr.c   | 68 
   target/riscv/op_helper.c |  9 ++
   target/riscv/pmp.h   |  8 +++--
   4 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/target/riscv/cpu_bits.h b/target/riscv/cpu_bits.h
index bb47cf7e77..d401100f47 100644
--- a/target/riscv/cpu_bits.h
+++ b/target/riscv/cpu_bits.h
@@ -458,6 +458,9 @@
   #define CSR_VSPMMASK    0x2c1
   #define CSR_VSPMBASE    0x2c2

+/* Crypto Extension */
+#define CSR_SEED    0x015
+
   /* mstatus CSR bits */
   #define MSTATUS_UIE 0x0001
   #define MSTATUS_SIE 0x0002
@@ -800,4 +803,10 @@ typedef enum RISCVException {
   #define HVICTL_VALID_MASK  \
   (HVICTL_VTI | HVICTL_IID | HVICTL_IPRIOM | HVICTL_IPRIO)

+/* seed CSR bits */
+#define SEED_OPST        (0b11 << 30)
+#define SEED_OPST_BIST   (0b00 << 30)
+#define SEED_OPST_WAIT   (0b01 << 30)
+#define SEED_OPST_ES16   (0b10 << 30)
+#define SEED_OPST_DEAD   (0b11 << 30)
   #endif
diff --git a/target/riscv/csr.c b/target/riscv/csr.c
index 3c61dd69af..5717a51f56 100644
--- a/target/riscv/csr.c
+++ b/target/riscv/csr.c
@@ -24,6 +24,8 @@
   #include "qemu/main-loop.h"
   #include "exec/exec-all.h"
   #include "sysemu/cpu-timers.h"
+#include "qemu/guest-random.h"
+#include "qapi/error.h"

   /* CSR function table public API */
   void riscv_get_csr_ops(int csrno, riscv_csr_operations *ops)
@@ -292,6 +294,40 @@ static RISCVException epmp(CPURISCVState *env, int csrno)
   }
   #endif

+static RISCVException seed(CPURISCVState *env, int csrno)
+{
+RISCVCPU *cpu = env_archcpu(env);
+
+if (!cpu->cfg.ext_zkr) {
+return RISCV_EXCP_ILLEGAL_INST;
+}
+
+#if !defined(CONFIG_USER_ONLY)
+if (riscv_has_ext(env, RVS) && riscv_has_ext(env, RVH)) {
+/* Hypervisor extension is supported */
+if (riscv_cpu_virt_enabled(env) && (env->priv != PRV_M)) {

You can simplify this to just riscv_cpu_virt_enabled(). You don't need
to check if we have the extension as well.


Yeah, maybe it can be merged into the following logic, like:

if (env->priv == PRV_M) { // M-mode: seed is always accessible
    return RISCV_EXCP_NONE;
} else if (riscv_cpu_virt_enabled(env)) { // VS/VU: access always traps
    if (env->mseccfg & MSECCFG_SSEED) {
        return RISCV_EXCP_VIRT_INSTRUCTION_FAULT;
    } else {
        return RISCV_EXCP_ILLEGAL_INST;
    }
} else { // S/U: gated by mseccfg.SSEED / mseccfg.USEED
    if (env->priv == PRV_S && (env->mseccfg & MSECCFG_SSEED)) {
        return RISCV_EXCP_NONE;
    } else if (env->priv == PRV_U && (env->mseccfg & MSECCFG_USEED)) {
        return RISCV_EXCP_NONE;
    } else {
        return RISCV_EXCP_ILLEGAL_INST;
    }
}





+if (env->mseccfg & MSECCFG_SSEED) {
+return RISCV_EXCP_VIRT_INSTRUCTION_FAULT;
+} else {
+return RISCV_EXCP_ILLEGAL_INST;
+}
+}
+}
+
+if (env->priv == PRV_M) {
+return RISCV_EXCP_NONE;
+} else if (env->priv == PRV_S && (env->mseccfg & MSECCFG_SSEED)) {
+return RISCV_EXCP_NONE;
+} else if (env->priv == PRV_U && (env->mseccfg & MSECCFG_USEED)) {
+return RISCV_EXCP_NONE;
+} else {
+return RISCV_EXCP_ILLEGAL_INST;
+}
+#else
+return RISCV_EXCP_NONE;
+#endif
+}
+
   /* User Floating-Point CSRs */
   static RISCVException read_fflags(CPURISCVState *env, int csrno,
 target_ulong *val)
@@ -2961,6 +2997,35 @@ static RISCVException write_upmbase(CPURISCVState *env, 
int csrno,

   #endif

+/* Crypto Extension */
+static RISCVException rmw_seed(CPURISCVState *env, int csrno,
+  target_ulong *ret_value,
+  target_ulong new_value, target_ulong write_mask)
+{
+uint16_t random_v;
+Error *random_e = NULL;
+int random_r;
+
+random_r = qemu_guest_getrandom(&random_v, 2, &random_e);
+if (unlikely(random_r < 0)) {
+/*
+ * Failed, for unknown reasons in the crypto subsystem.
+ * The best we can do is log the reason and return a
+  

RE: [PATCH V2 0/4] COLO net and runstate bugfix/optimization

2022-04-13 Thread Zhang, Chen
No update for a while. Ping...

Thanks
Chen

> -Original Message-
> From: Zhang, Chen 
> Sent: Friday, April 1, 2022 11:47 AM
> To: Jason Wang ; Li Zhijian 
> Cc: Zhang, Chen ; qemu-dev  de...@nongnu.org>
> Subject: [PATCH V2 0/4] COLO net and runstate bugfix/optimization
> 
> This series fixes some COLO related issues in internal stress testing.
> 
>  - V2:
> - Add more comments in patch 2/4 commit log.
> 
> Zhang Chen (4):
>   softmmu/runstate.c: add RunStateTransition support form COLO to
> PRELAUNCH
>   net/colo: Fix a "double free" crash to clear the conn_list
>   net/colo.c: No need to track conn_list for filter-rewriter
>   net/colo.c: fix segmentation fault when packet is not parsed correctly
> 
>  net/colo-compare.c|  2 +-
>  net/colo.c| 11 +--
>  net/filter-rewriter.c |  2 +-
>  net/trace-events  |  1 +
>  softmmu/runstate.c|  1 +
>  5 files changed, 13 insertions(+), 4 deletions(-)
> 
> --
> 2.25.1




fwcfg: Wrong callback behaviour after fw_cfg_modify_bytes_read

2022-04-13 Thread Christian A. Ehrhardt


Hi,

there's a long story behind this (see below). However, I'll start with
the result:

fw_cfg_modify_bytes_read() sets the callback data of an existing
fw_cfg file to NULL but leaves the actual callbacks in place.
Additionally, this function sets ->allow_write to false for no
good reason AFAICS.

For most callbacks, the callback will just crash on the NULL pointer
in ->callback_opaque if this path is ever hit. 

I think the following patch is required (I can properly format it
if you agree). I'm not 100% sure about the "allow_write" part, though:

diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c
index e5f3c98184..b8b6d8fe10 100644
--- a/hw/nvram/fw_cfg.c
+++ b/hw/nvram/fw_cfg.c
@@ -742,8 +742,6 @@ static void *fw_cfg_modify_bytes_read(FWCfgState *s, 
uint16_t key,
 ptr = s->entries[arch][key].data;
 s->entries[arch][key].data = data;
 s->entries[arch][key].len = len;
-s->entries[arch][key].callback_opaque = NULL;
-s->entries[arch][key].allow_write = false;
 
 return ptr;
 }

Opinions?


For those interesed here's the somewhat longer story and the reason
why the diff actually matters:

We are running Windows in a Q35 based machine in UEFI mode with OVMF.
In some situations we saw that the Windows guest would hang in the
Windows boot loader after a guest initiated reboot of the virtual
machine. A hard "system_reset" would trigger the same bug.

The guest was hanging in a loop trying to read from unassigned
I/O port 0xb008. This is the default port used for the ACPI
PM timer on PIIX based machines (but remember that we use Q35 where
the PM timer lives at 0x608 instead).

It turned out that after the reboot OVMF would try to read the
ACPI tables from FWCFG but commands in the table-loader file
could not be executed correctly and OVMF falls back to some hard
coded PIIX based default.

ACPI tables and the table-loader data is initially generated
during setup but this data is re-generated via an FWCFG callback
(acpi_update_build) when the first of these files is accessed.
The tables generated at this later time differ slightly from those
generated during initial setup.

In our case these differences required a resize of the table-loader
romfile. This resize calls fw_cfg_modify_file() via the resize
hook of the memory region that contains the FWCFG file.
As described above this clears the ->callback_opaque data that
points to the build_state.

After a reboot rom_reset will restore the original contents of
the linker-loader file. In theory, this is only temporary. However,
due to the missing callback_opaque data the first call to
acpi_update_build() will do nothing. As a result the OVMF guest
reads an outdated version of the table-loader file. The actual
tables are properly re-generated on the next access to a different
FWCFG file that did not go through a resize. But at this point the
guest has already read the outdated table-loader data and trying to
apply this to the re-generated ACPI tables results in errors.

This results in broken ACPI tables as discussed above.

   regards, Christian




Re: [PATCH v5 4/4] hw/acpi/aml-build: Use existing CPU topology to build PPTT table

2022-04-13 Thread Gavin Shan

Hi Igor,

On 4/13/22 9:52 PM, Igor Mammedov wrote:

On Sun,  3 Apr 2022 22:59:53 +0800
Gavin Shan  wrote:


When the PPTT table is built, the CPU topology is re-calculated, but
it's unnecessary because the CPU topology has been populated in
virt_possible_cpu_arch_ids() on arm/virt machine.

This reworks build_pptt() to avoid that by reusing the existing one in
ms->possible_cpus. Currently, the only user of build_pptt() is
arm/virt machine.

Signed-off-by: Gavin Shan 
---
  hw/acpi/aml-build.c | 100 +---
  1 file changed, 38 insertions(+), 62 deletions(-)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 4086879ebf..4b0f9df3e3 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -2002,86 +2002,62 @@ void build_pptt(GArray *table_data, BIOSLinker *linker, 
MachineState *ms,
  const char *oem_id, const char *oem_table_id)
  {
  MachineClass *mc = MACHINE_GET_CLASS(ms);
-GQueue *list = g_queue_new();
-guint pptt_start = table_data->len;
-guint parent_offset;
-guint length, i;
-int uid = 0;
-int socket;
+CPUArchIdList *cpus = ms->possible_cpus;
+int64_t socket_id = -1, cluster_id = -1, core_id = -1;
+uint32_t socket_offset, cluster_offset, core_offset;
+uint32_t pptt_start = table_data->len;
+int n;
  AcpiTable table = { .sig = "PPTT", .rev = 2,
  .oem_id = oem_id, .oem_table_id = oem_table_id };
  
  acpi_table_begin(&table, table_data);
  
-for (socket = 0; socket < ms->smp.sockets; socket++) {

-g_queue_push_tail(list,
-GUINT_TO_POINTER(table_data->len - pptt_start));
-build_processor_hierarchy_node(
-table_data,
-/*
- * Physical package - represents the boundary
- * of a physical package
- */
-(1 << 0),
-0, socket, NULL, 0);
-}
+for (n = 0; n < cpus->len; n++) {



+if (cpus->cpus[n].props.socket_id != socket_id) {
+socket_id = cpus->cpus[n].props.socket_id;


this relies on cpus->cpus[n].props.*_id being sorted from the top level down.
I'd add here, and for the other container IDs, an assert() that checks that a
specific ID goes in only one direction, to be able to detect when the rule is
broken.

otherwise one may end up with duplicate containers silently.



Exactly. cpus->cpus[n].props.*_id is sorted as you said in
virt_possible_cpu_arch_ids(). The only user of build_pptt() is the arm/virt
machine, so it's fine. However, I think I may need to add a comment for this
in v6.

/*
 * This works with the assumption that cpus[n].props.*_id has been
 * sorted from the top level down in mc->possible_cpu_arch_ids().
 * Otherwise, unexpected and duplicate containers will be created.
 */

The implementation in v3 looks complicated, but comprehensive. The one
in this revision (v6) looks simple, but we're losing flexibility :)
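
To make the suggestion concrete, here is a minimal standalone sketch of such a
one-direction check (the helper name and the flat ID array are made up for
illustration; at the lower levels the real check would have to compare the
full socket/cluster/core tuple):

#include <assert.h>
#include <stdint.h>

/*
 * Hypothetical helper: IDs at one topology level must never go
 * backwards while walking possible_cpus; if they do, build_pptt()
 * would silently open a duplicate container for an ID it has
 * already emitted.
 */
static void check_level_ids_monotonic(const int64_t *ids, int len)
{
    int64_t prev = -1;

    for (int i = 0; i < len; i++) {
        assert(ids[i] >= prev); /* may only move in one direction */
        prev = ids[i];
    }
}

int main(void)
{
    const int64_t socket_ids[] = { 0, 0, 1, 1 }; /* sorted: passes */

    check_level_ids_monotonic(socket_ids, 4);
    return 0;
}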



+cluster_id = -1;
+core_id = -1;
+socket_offset = table_data->len - pptt_start;
+build_processor_hierarchy_node(table_data,
+(1 << 0), /* Physical package */
+0, socket_id, NULL, 0);
+}
  
-if (mc->smp_props.clusters_supported) {

-length = g_queue_get_length(list);
-for (i = 0; i < length; i++) {
-int cluster;
-
-parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list));
-for (cluster = 0; cluster < ms->smp.clusters; cluster++) {
-g_queue_push_tail(list,
-GUINT_TO_POINTER(table_data->len - pptt_start));
-build_processor_hierarchy_node(
-table_data,
-(0 << 0), /* not a physical package */
-parent_offset, cluster, NULL, 0);
+if (mc->smp_props.clusters_supported) {
+if (cpus->cpus[n].props.cluster_id != cluster_id) {
+cluster_id = cpus->cpus[n].props.cluster_id;
+core_id = -1;
+cluster_offset = table_data->len - pptt_start;
+build_processor_hierarchy_node(table_data,
+(0 << 0), /* Not a physical package */
+socket_offset, cluster_id, NULL, 0);
  }
+} else {
+cluster_offset = socket_offset;
  }
-}
  
-length = g_queue_get_length(list);

-for (i = 0; i < length; i++) {
-int core;
-
-parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list));
-for (core = 0; core < ms->smp.cores; core++) {
-if (ms->smp.threads > 1) {
-g_queue_push_tail(list,
-GUINT_TO_POINTER(table_data->len - pptt_start));
-build_processor_hierarchy_node(
-table_data,
+if (ms->smp.threads <= 1) {


why <= instead of < is used here?



It's the counterpart to the one in the original implementation,

Re: [PATCH v5 2/4] hw/arm/virt: Consider SMP configuration in CPU topology

2022-04-13 Thread Gavin Shan

Hi Yanan,

On 4/13/22 8:39 PM, wangyanan (Y) wrote:

On 2022/4/3 22:59, Gavin Shan wrote:

Currently, the SMP configuration isn't considered when the CPU
topology is populated. In this case, it's impossible to provide
the default CPU-to-NUMA mapping or association based on the socket
ID of the given CPU.

This takes account of SMP configuration when the CPU topology
is populated. The die ID for the given CPU isn't assigned since
it's not supported on arm/virt machine yet.

Signed-off-by: Gavin Shan 
---
  hw/arm/virt.c | 16 +++-
  1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index d2e5ecd234..3174526730 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -2505,6 +2505,7 @@ static const CPUArchIdList 
*virt_possible_cpu_arch_ids(MachineState *ms)
  int n;
  unsigned int max_cpus = ms->smp.max_cpus;
  VirtMachineState *vms = VIRT_MACHINE(ms);
+    MachineClass *mc = MACHINE_GET_CLASS(vms);
  if (ms->possible_cpus) {
  assert(ms->possible_cpus->len == max_cpus);
@@ -2518,8 +2519,21 @@ static const CPUArchIdList 
*virt_possible_cpu_arch_ids(MachineState *ms)
  ms->possible_cpus->cpus[n].type = ms->cpu_type;
  ms->possible_cpus->cpus[n].arch_id =
  virt_cpu_mp_affinity(vms, n);
+
+    assert(!mc->smp_props.dies_supported);
+    ms->possible_cpus->cpus[n].props.has_socket_id = true;
+    ms->possible_cpus->cpus[n].props.socket_id =
+    (n / (ms->smp.clusters * ms->smp.cores * ms->smp.threads)) %
+    ms->smp.sockets;

No need for "% ms->smp.sockets".


Yeah, lets remove it in v6.


+    ms->possible_cpus->cpus[n].props.has_cluster_id = true;
+    ms->possible_cpus->cpus[n].props.cluster_id =
+    (n / (ms->smp.cores * ms->smp.threads)) % ms->smp.clusters;
+    ms->possible_cpus->cpus[n].props.has_core_id = true;
+    ms->possible_cpus->cpus[n].props.core_id =
+    (n / ms->smp.threads) % ms->smp.cores;
  ms->possible_cpus->cpus[n].props.has_thread_id = true;
-    ms->possible_cpus->cpus[n].props.thread_id = n;
+    ms->possible_cpus->cpus[n].props.thread_id =
+    n % ms->smp.threads;
  }
  return ms->possible_cpus;
  }

Otherwise, looks good to me:
Reviewed-by: Yanan Wang 



Thanks for your time to review :)

Thanks,
Gavin
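
As a standalone illustration of the ID math in the hunk above, here is how a
linear CPU index n decomposes for a made-up topology of 2 sockets x 2 clusters
x 2 cores x 2 threads (the -smp values are illustrative, not from the patch):

#include <stdio.h>

/* Derives socket/cluster/core/thread IDs the same way the patch does. */
int main(void)
{
    unsigned sockets = 2, clusters = 2, cores = 2, threads = 2;

    for (unsigned n = 0; n < sockets * clusters * cores * threads; n++) {
        printf("cpu%-2u socket=%u cluster=%u core=%u thread=%u\n", n,
               n / (clusters * cores * threads) % sockets,
               n / (cores * threads) % clusters,
               n / threads % cores,
               n % threads);
    }
    return 0;
}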




Re: [PATCH v5 1/4] qapi/machine.json: Add cluster-id

2022-04-13 Thread Gavin Shan

Hi Yanan,

On 4/13/22 7:49 PM, wangyanan (Y) wrote:

On 2022/4/3 22:59, Gavin Shan wrote:

This adds cluster-id in CPU instance properties, which will be used
by arm/virt machine. Besides, the cluster-id is also verified or
dumped in various spots:

   * hw/core/machine.c::machine_set_cpu_numa_node() to associate
 CPU with its NUMA node.

   * hw/core/machine.c::machine_numa_finish_cpu_init() to associate
 CPU with NUMA node when no default association is provided.

   * hw/core/machine-hmp-cmds.c::hmp_hotpluggable_cpus() to dump
 cluster-id.

Signed-off-by: Gavin Shan 
---
  hw/core/machine-hmp-cmds.c |  4 
  hw/core/machine.c  | 16 
  qapi/machine.json  |  6 --
  3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/hw/core/machine-hmp-cmds.c b/hw/core/machine-hmp-cmds.c
index 4e2f319aeb..5cb5eecbfc 100644
--- a/hw/core/machine-hmp-cmds.c
+++ b/hw/core/machine-hmp-cmds.c
@@ -77,6 +77,10 @@ void hmp_hotpluggable_cpus(Monitor *mon, const QDict *qdict)
  if (c->has_die_id) {
  monitor_printf(mon, "    die-id: \"%" PRIu64 "\"\n", c->die_id);
  }
+    if (c->has_cluster_id) {
+    monitor_printf(mon, "    cluster-id: \"%" PRIu64 "\"\n",
+   c->cluster_id);
+    }
  if (c->has_core_id) {
  monitor_printf(mon, "    core-id: \"%" PRIu64 "\"\n", c->core_id);
  }
diff --git a/hw/core/machine.c b/hw/core/machine.c
index d856485cb4..8748b64657 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -677,6 +677,11 @@ void machine_set_cpu_numa_node(MachineState *machine,
  return;
  }
+    if (props->has_cluster_id && !slot->props.has_cluster_id) {
+    error_setg(errp, "cluster-id is not supported");
+    return;
+    }
+
  if (props->has_socket_id && !slot->props.has_socket_id) {
  error_setg(errp, "socket-id is not supported");
  return;
@@ -696,6 +701,11 @@ void machine_set_cpu_numa_node(MachineState *machine,
  continue;
  }
+    if (props->has_cluster_id &&
+    props->cluster_id != slot->props.cluster_id) {
+    continue;
+    }
+
  if (props->has_die_id && props->die_id != slot->props.die_id) {
  continue;
  }
@@ -990,6 +1000,12 @@ static char *cpu_slot_to_string(const CPUArchId *cpu)
  }
  g_string_append_printf(s, "die-id: %"PRId64, cpu->props.die_id);
  }
+    if (cpu->props.has_cluster_id) {
+    if (s->len) {
+    g_string_append_printf(s, ", ");
+    }
+    g_string_append_printf(s, "cluster-id: %"PRId64, 
cpu->props.cluster_id);
+    }
  if (cpu->props.has_core_id) {
  if (s->len) {
  g_string_append_printf(s, ", ");
diff --git a/qapi/machine.json b/qapi/machine.json
index 9c460ec450..ea22b574b0 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -868,10 +868,11 @@
  # @node-id: NUMA node ID the CPU belongs to
  # @socket-id: socket number within node/board the CPU belongs to
  # @die-id: die number within socket the CPU belongs to (since 4.1)
-# @core-id: core number within die the CPU belongs to
+# @cluster-id: cluster number within die the CPU belongs to

I remember this should be "cluster number within socket..."
according to Igor's comments in v3 ?


Igor suggested correcting the description for 'core-id' like
below, but he didn't suggest anything for 'cluster-id'. The question
is: are clusters sub-components of the die, instead of the socket, when
die is supported? You may want me to change it like below; please
confirm.

  @cluster-id: cluster number within die/socket the CPU belongs to

suggestion from Igor in v3:

   > +# @core-id: core number within cluster the CPU belongs to

   s:cluster:cluster/die:



+# @core-id: core number within cluster/die the CPU belongs to
  # @thread-id: thread number within core the CPU belongs to
  #
-# Note: currently there are 5 properties that could be present
+# Note: currently there are 6 properties that could be present
  #   but management should be prepared to pass through other
  #   properties with device_add command to allow for future
  #   interface extension. This also requires the filed names to be kept in
@@ -883,6 +884,7 @@
    'data': { '*node-id': 'int',
  '*socket-id': 'int',
  '*die-id': 'int',
+    '*cluster-id': 'int',
  '*core-id': 'int',
  '*thread-id': 'int'
    }

Otherwise, looks good to me:
Reviewed-by: Yanan Wang 

Please also keep the involved Maintainers on Cc list in next version,
an Ack from them is best. :)



Thanks again for your time to review. Sure, I will do in next posting.

Thanks,
Gavin




Re: [PULL v2 29/35] hw/intc: Add RISC-V AIA APLIC device emulation

2022-04-13 Thread Alistair Francis
On Wed, Apr 13, 2022 at 12:53 AM Peter Maydell  wrote:
>
> On Wed, 16 Feb 2022 at 08:43, Alistair Francis
>  wrote:
> >
> > From: Anup Patel 
> >
> > The RISC-V AIA (Advanced Interrupt Architecture) defines a new
> > interrupt controller for wired interrupts called APLIC (Advanced
> > Platform Level Interrupt Controller). The APLIC is capable of
> > forwarding wired interrupts to RISC-V HARTs directly or as MSIs
> > (Message Signaled Interrupts).
> >
> > This patch adds device emulation for RISC-V AIA APLIC.
>
> Hi; Coverity has some issues with this change; they're kind of
> false positives but they do flag up a minor issue with the code.
> This is CID 1487105, 1487113, 1487185, 1487208.
>
> > +} else if ((APLIC_TARGET_BASE <= addr) &&
> > +(addr < (APLIC_TARGET_BASE + (aplic->num_irqs - 1) * 4))) {
> > +irq = ((addr - APLIC_TARGET_BASE) >> 2) + 1;
> > +return aplic->target[irq];
> > +} else if (!aplic->msimode && (APLIC_IDC_BASE <= addr) &&
> > +(addr < (APLIC_IDC_BASE + aplic->num_harts * APLIC_IDC_SIZE))) 
> > {
> > +idc = (addr - APLIC_IDC_BASE) / APLIC_IDC_SIZE;
>
> In expressions like these where we're checking "is addr between
> some base address and an upper bound calculated from num_irqs
> or num_harts", Coverity warns that we calculate expressions like
> (APLIC_TARGET_BASE + (aplic->num_irqs - 1) * 4) using 32-bits and
> then compare against the 64-bit 'addr', so there might be an
> unintentional overflow. Now clearly in this case num_irqs and num_harts
> should never be so large that there is an overflow, so in that
> sense Coverity is wrong and these are false positives. However...
>
> > +static void riscv_aplic_realize(DeviceState *dev, Error **errp)
> > +{
> > +uint32_t i;
> > +RISCVAPLICState *aplic = RISCV_APLIC(dev);
> > +
> > +aplic->bitfield_words = (aplic->num_irqs + 31) >> 5;
> > +aplic->sourcecfg = g_new0(uint32_t, aplic->num_irqs);
> > +aplic->state = g_new(uint32_t, aplic->num_irqs);
> > +aplic->target = g_new0(uint32_t, aplic->num_irqs);
> > +if (!aplic->msimode) {
> > +for (i = 0; i < aplic->num_irqs; i++) {
> > +aplic->target[i] = 1;
> > +}
> > +}
> > +aplic->idelivery = g_new0(uint32_t, aplic->num_harts);
> > +aplic->iforce = g_new0(uint32_t, aplic->num_harts);
> > +aplic->ithreshold = g_new0(uint32_t, aplic->num_harts);
> > +
> > +memory_region_init_io(&aplic->mmio, OBJECT(dev), &riscv_aplic_ops,
> > aplic,
> > +  TYPE_RISCV_APLIC, aplic->aperture_size);
> > +sysbus_init_mmio(SYS_BUS_DEVICE(dev), &aplic->mmio);
> > +
> > +/*
> > + * Only root APLICs have hardware IRQ lines. All non-root APLICs
> > + * have IRQ lines delegated by their parent APLIC.
> > + */
> > +if (!aplic->parent) {
> > +qdev_init_gpio_in(dev, riscv_aplic_request, aplic->num_irqs);
> > +}
> > +
> > +/* Create output IRQ lines for non-MSI mode */
> > +if (!aplic->msimode) {
> > +aplic->external_irqs = g_malloc(sizeof(qemu_irq) * 
> > aplic->num_harts);
> > +qdev_init_gpio_out(dev, aplic->external_irqs, aplic->num_harts);
> > +
> > +/* Claim the CPU interrupt to be triggered by this APLIC */
> > +for (i = 0; i < aplic->num_harts; i++) {
> > +RISCVCPU *cpu = RISCV_CPU(qemu_get_cpu(aplic->hartid_base + 
> > i));
> > +if (riscv_cpu_claim_interrupts(cpu,
> > +(aplic->mmode) ? MIP_MEIP : MIP_SEIP) < 0) {
> > +error_report("%s already claimed",
> > + (aplic->mmode) ? "MEIP" : "SEIP");
> > +exit(1);
> > +}
> > +}
> > +}
> > +
> > +msi_nonbroken = true;
> > +}
>
> ...in the realize method we don't do any sanity checking of our
> QOM properties that set aplic->num_irqs and aplic->num_harts, so the
> creator of the device could in theory pass us some bogus values that
> cause overflow and other bad things.
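
As a small standalone sketch of the wrap in question (all constants here are
made up; base merely stands in for something like APLIC_TARGET_BASE):

#include <stdio.h>
#include <stdint.h>

/* The bound is computed in 32 bits and then compared against a 64-bit
 * address, so a bogus num_irqs property can wrap the 32-bit product and
 * make the range check lie. */
int main(void)
{
    uint64_t addr = 0x4000;
    uint32_t base = 0x3004;                         /* illustrative */
    uint32_t num_irqs = 0x40000000;                 /* bogus property value */
    uint32_t bound32 = base + (num_irqs - 1) * 4;   /* wraps in 32 bits */
    uint64_t bound64 = base + ((uint64_t)num_irqs - 1) * 4;

    printf("32-bit bound: %#x  in-range: %d\n", bound32, addr < bound32);
    printf("64-bit bound: %#llx in-range: %d\n",
           (unsigned long long)bound64, addr < bound64);
    return 0;
}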
>
> > +/*
> > + * Create APLIC device.
> > + */
> > +DeviceState *riscv_aplic_create(hwaddr addr, hwaddr size,
> > +uint32_t hartid_base, uint32_t num_harts, uint32_t num_sources,
> > +uint32_t iprio_bits, bool msimode, bool mmode, DeviceState *parent)
> > +{
> > +DeviceState *dev = qdev_new(TYPE_RISCV_APLIC);
> > +uint32_t i;
> > +
> > +assert(num_harts < APLIC_MAX_IDC);
> > +assert((APLIC_IDC_BASE + (num_harts * APLIC_IDC_SIZE)) <= size);
> > +assert(num_sources < APLIC_MAX_SOURCE);
> > +assert(APLIC_MIN_IPRIO_BITS <= iprio_bits);
> > +assert(iprio_bits <= APLIC_MAX_IPRIO_BITS);
> > +
> > +qdev_prop_set_uint32(dev, "aperture-size", size);
> > +qdev_prop_set_uint32(dev, "hartid-base", hartid_base);
> > +qdev_prop_set_uint32(dev, "num-harts", num_harts);
> > +qdev_prop_set_uint32(dev, "iprio-mask", ((1U << iprio_bits) - 1));
> > +qdev_prop_set_uint32(dev, "num-irqs", num_sources + 1);
>
> You do make some assert()s on num_harts 

Re: [PATCH v9 12/14] target/riscv: rvk: add CSR support for Zkr

2022-04-13 Thread Alistair Francis
On Mon, Apr 11, 2022 at 2:46 PM Weiwei Li  wrote:
>
> Hi, any comments on this patch or patchset?
>
> Currently, a read-only instruction accessing the Seed CSR is checked as a
> special case in helper_csrr, as suggested in
>
> https://lists.nongnu.org/archive/html/qemu-riscv/2022-03/msg00146.html.

Ah sorry, I didn't realise you had updated this.

>
> (The new version for that patch is in
> https://lists.nongnu.org/archive/html/qemu-riscv/2022-03/msg00156.html)
>
> Regards,
>
> Weiwei Li
>
> 在 2022/3/18 下午12:19, Weiwei Li 写道:
> >   - add SEED CSR which must be accessed with a read-write instruction:
> > A read-only instruction such as CSRRS/CSRRC with rs1=x0 or CSRRSI/CSRRCI
> > with uimm=0 will raise an illegal instruction exception.
> >   - add USEED, SSEED fields for MSECCFG CSR
> >
> > Co-authored-by: Ruibo Lu 
> > Co-authored-by: Zewen Ye 
> > Signed-off-by: Weiwei Li 
> > Signed-off-by: Junqiang Wang 
> > ---
> >   target/riscv/cpu_bits.h  |  9 ++
> >   target/riscv/csr.c   | 68 
> >   target/riscv/op_helper.c |  9 ++
> >   target/riscv/pmp.h   |  8 +++--
> >   4 files changed, 91 insertions(+), 3 deletions(-)
> >
> > diff --git a/target/riscv/cpu_bits.h b/target/riscv/cpu_bits.h
> > index bb47cf7e77..d401100f47 100644
> > --- a/target/riscv/cpu_bits.h
> > +++ b/target/riscv/cpu_bits.h
> > @@ -458,6 +458,9 @@
> >   #define CSR_VSPMMASK    0x2c1
> >   #define CSR_VSPMBASE    0x2c2
> >
> > +/* Crypto Extension */
> > +#define CSR_SEED    0x015
> > +
> >   /* mstatus CSR bits */
> >   #define MSTATUS_UIE 0x0001
> >   #define MSTATUS_SIE 0x0002
> > @@ -800,4 +803,10 @@ typedef enum RISCVException {
> >   #define HVICTL_VALID_MASK  \
> >   (HVICTL_VTI | HVICTL_IID | HVICTL_IPRIOM | HVICTL_IPRIO)
> >
> > +/* seed CSR bits */
> > +#define SEED_OPST        (0b11 << 30)
> > +#define SEED_OPST_BIST   (0b00 << 30)
> > +#define SEED_OPST_WAIT   (0b01 << 30)
> > +#define SEED_OPST_ES16   (0b10 << 30)
> > +#define SEED_OPST_DEAD   (0b11 << 30)
> >   #endif
> > diff --git a/target/riscv/csr.c b/target/riscv/csr.c
> > index 3c61dd69af..5717a51f56 100644
> > --- a/target/riscv/csr.c
> > +++ b/target/riscv/csr.c
> > @@ -24,6 +24,8 @@
> >   #include "qemu/main-loop.h"
> >   #include "exec/exec-all.h"
> >   #include "sysemu/cpu-timers.h"
> > +#include "qemu/guest-random.h"
> > +#include "qapi/error.h"
> >
> >   /* CSR function table public API */
> >   void riscv_get_csr_ops(int csrno, riscv_csr_operations *ops)
> > @@ -292,6 +294,40 @@ static RISCVException epmp(CPURISCVState *env, int 
> > csrno)
> >   }
> >   #endif
> >
> > +static RISCVException seed(CPURISCVState *env, int csrno)
> > +{
> > +RISCVCPU *cpu = env_archcpu(env);
> > +
> > +if (!cpu->cfg.ext_zkr) {
> > +return RISCV_EXCP_ILLEGAL_INST;
> > +}
> > +
> > +#if !defined(CONFIG_USER_ONLY)
> > +if (riscv_has_ext(env, RVS) && riscv_has_ext(env, RVH)) {
> > +/* Hypervisor extension is supported */
> > +if (riscv_cpu_virt_enabled(env) && (env->priv != PRV_M)) {

You can simplify this to just riscv_cpu_virt_enabled(). You don't need
to check if we have the extension as well.

> > +if (env->mseccfg & MSECCFG_SSEED) {
> > +return RISCV_EXCP_VIRT_INSTRUCTION_FAULT;
> > +} else {
> > +return RISCV_EXCP_ILLEGAL_INST;
> > +}
> > +}
> > +}
> > +
> > +if (env->priv == PRV_M) {
> > +return RISCV_EXCP_NONE;
> > +} else if (env->priv == PRV_S && (env->mseccfg & MSECCFG_SSEED)) {
> > +return RISCV_EXCP_NONE;
> > +} else if (env->priv == PRV_U && (env->mseccfg & MSECCFG_USEED)) {
> > +return RISCV_EXCP_NONE;
> > +} else {
> > +return RISCV_EXCP_ILLEGAL_INST;
> > +}
> > +#else
> > +return RISCV_EXCP_NONE;
> > +#endif
> > +}
> > +
> >   /* User Floating-Point CSRs */
> >   static RISCVException read_fflags(CPURISCVState *env, int csrno,
> > target_ulong *val)
> > @@ -2961,6 +2997,35 @@ static RISCVException write_upmbase(CPURISCVState 
> > *env, int csrno,
> >
> >   #endif
> >
> > +/* Crypto Extension */
> > +static RISCVException rmw_seed(CPURISCVState *env, int csrno,
> > +  target_ulong *ret_value,
> > +  target_ulong new_value, target_ulong 
> > write_mask)
> > +{
> > +uint16_t random_v;
> > +Error *random_e = NULL;
> > +int random_r;
> > +
> > +random_r = qemu_guest_getrandom(&random_v, 2, &random_e);
> > +if (unlikely(random_r < 0)) {
> > +/*
> > + * Failed, for unknown reasons in the crypto subsystem.
> > + * The best we can do is log the reason and return a
> > + * failure indication to the guest.  There is no reason
> > + * we know to expect the failure to be transitory, 

Re: [PATCH] target/riscv/pmp: fix NAPOT range computation overflow

2022-04-13 Thread Alistair Francis
On Sat, Apr 9, 2022 at 2:25 AM Nicolas Pitre  wrote:
>
> There is an overflow with the current code where a pmpaddr value of
> 0x1fff is decoded as sa=0 and ea=0 whereas it should be sa=0 and
> ea=0x.
>
> Fix that by simplifying the computation. There is in fact no need for
> ctz64() nor special case for -1 to achieve proper results.
>
> Signed-off-by: Nicolas Pitre 

Reviewed-by: Alistair Francis 

Alistair

> ---
>
> This is in fact the same patch I posted yesterday but turns out its
> scope is far more important than I initially thought.
>
> diff --git a/target/riscv/pmp.c b/target/riscv/pmp.c
> index 81b61bb65c..151da3fa08 100644
> --- a/target/riscv/pmp.c
> +++ b/target/riscv/pmp.c
> @@ -141,17 +141,9 @@ static void pmp_decode_napot(target_ulong a, 
> target_ulong *sa, target_ulong *ea)
> 0111...   2^(XLEN+2)-byte NAPOT range
> ...   Reserved
>  */
> -if (a == -1) {
> -*sa = 0u;
> -*ea = -1;
> -return;
> -} else {
> -target_ulong t1 = ctz64(~a);
> -target_ulong base = (a & ~(((target_ulong)1 << t1) - 1)) << 2;
> -target_ulong range = ((target_ulong)1 << (t1 + 3)) - 1;
> -*sa = base;
> -*ea = base + range;
> -}
> +a = (a << 2) | 0x3;
> +*sa = a & (a + 1);
> +*ea = a | (a + 1);
>  }
>
>  void pmp_update_rule_addr(CPURISCVState *env, uint32_t pmp_index)
>
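
For readers wanting to sanity-check the simplification, a standalone demo of
the decode (the pmpaddr value below is illustrative, and a 64-bit build is
assumed):

#include <stdio.h>
#include <stdint.h>

/* Standalone copy of the simplified NAPOT decode from the patch,
 * fixed at 64 bits for the demo. */
static void decode_napot(uint64_t a, uint64_t *sa, uint64_t *ea)
{
    a = (a << 2) | 0x3;
    *sa = a & (a + 1);
    *ea = a | (a + 1);
}

int main(void)
{
    uint64_t sa, ea;

    decode_napot(0x1fffffffffffffffULL, &sa, &ea);
    /* prints sa=0 ea=0xffffffffffffffff */
    printf("sa=%#llx ea=%#llx\n",
           (unsigned long long)sa, (unsigned long long)ea);
    return 0;
}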



[PATCH] hw/arm/virt: impact of gic-version on max CPUs

2022-04-13 Thread Heinrich Schuchardt
Describe that the gic-version influences the maximum number of CPUs.

Signed-off-by: Heinrich Schuchardt 
---
 docs/system/arm/virt.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/system/arm/virt.rst b/docs/system/arm/virt.rst
index 1544632b67..1af3f6a0a8 100644
--- a/docs/system/arm/virt.rst
+++ b/docs/system/arm/virt.rst
@@ -96,9 +96,9 @@ gic-version
   Valid values are:
 
   ``2``
-GICv2
+GICv2 - This limits the number of CPUs to 8.
   ``3``
-GICv3
+GICv3 - This allows up to 512 CPUs.
   ``host``
 Use the same GIC version the host provides, when using KVM
   ``max``
-- 
2.34.1




[PATCH v2 2/5] qga/commands-posix: Fix iface hw address detection

2022-04-13 Thread Andrew Deason
Since its introduction in commit 3424fc9f16a1 ("qemu-ga: add
guest-network-get-interfaces command"), guest-network-get-interfaces
seems to check if a given interface has a hardware address by checking
'ifa->ifa_flags & SIOCGIFHWADDR'. But ifa_flags is a field for IFF_*
flags (IFF_UP, IFF_LOOPBACK, etc), and comparing it to an ioctl like
SIOCGIFHWADDR doesn't make sense.

On Linux, this isn't a big deal, since SIOCGIFHWADDR has so many bits
set (0x8927), 'ifa->ifa_flags & SIOCGIFHWADDR' will usually have a
nonzero result for any 'normal'-looking interfaces: anything with
IFF_UP (0x1) or IFF_BROADCAST (0x2) set, as well as several
less-common flags. This means we'll try to get the hardware address
for most/all interfaces, even those that don't really have one (like
the loopback device). For those interfaces, Linux just returns a
hardware address of all zeroes.
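
To see why the bogus mask check usually passes on Linux, a tiny standalone
illustration (the ioctl constant is Linux's; the flag combination is made up):

#include <stdio.h>

int main(void)
{
    unsigned ioctl_nr = 0x8927;        /* SIOCGIFHWADDR on Linux */
    unsigned flags = 0x1 | 0x2;        /* IFF_UP | IFF_BROADCAST */

    printf("%#x\n", ioctl_nr & flags); /* prints 0x3, i.e. "true" */
    return 0;
}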

On Solaris, however, trying to get the hardware address for a loopback
device returns an EADDRNOTAVAIL error. This causes us to return an
error and the entire guest-network-get-interfaces call fails.

Change this logic to always try to get the hardware address for each
interface, and don't return an error if we fail to get it. Instead,
just don't include the 'hardware-address' field in the result if we
can't get the hardware address.

Signed-off-by: Andrew Deason 
Reviewed-by: Michal Privoznik 
---
 qga/commands-posix.c | 39 ---
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index e0feb5ffb5..bd0d67f674 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -2875,48 +2875,57 @@ GuestNetworkInterfaceList 
*qmp_guest_network_get_interfaces(Error **errp)
 
 info = guest_find_interface(head, ifa->ifa_name);
 
 if (!info) {
 info = g_malloc0(sizeof(*info));
 info->name = g_strdup(ifa->ifa_name);
 
 QAPI_LIST_APPEND(tail, info);
 }
 
-if (!info->has_hardware_address && ifa->ifa_flags & SIOCGIFHWADDR) {
+if (!info->has_hardware_address) {
 /* we haven't obtained HW address yet */
 sock = socket(PF_INET, SOCK_STREAM, 0);
 if (sock == -1) {
 error_setg_errno(errp, errno, "failed to create socket");
 goto error;
 }
 
 memset(&ifr, 0, sizeof(ifr));
 pstrcpy(ifr.ifr_name, IF_NAMESIZE, info->name);
 if (ioctl(sock, SIOCGIFHWADDR, &ifr) == -1) {
-error_setg_errno(errp, errno,
- "failed to get MAC address of %s",
- ifa->ifa_name);
-close(sock);
-goto error;
-}
+/*
+ * We can't get the hw addr of this interface, but that's not a
+ * fatal error. Don't set info->hardware_address, but keep
+ * going.
+ */
+if (errno == EADDRNOTAVAIL) {
+/* The interface doesn't have a hw addr (e.g. loopback). */
+g_debug("failed to get MAC address of %s: %s",
+ifa->ifa_name, strerror(errno));
+} else {
+g_warning("failed to get MAC address of %s: %s",
+  ifa->ifa_name, strerror(errno));
+}
 
-close(sock);
-mac_addr = (unsigned char *) &ifr.ifr_hwaddr.sa_data;
+} else {
+mac_addr = (unsigned char *) &ifr.ifr_hwaddr.sa_data;
 
-info->hardware_address =
-g_strdup_printf("%02x:%02x:%02x:%02x:%02x:%02x",
-(int) mac_addr[0], (int) mac_addr[1],
-(int) mac_addr[2], (int) mac_addr[3],
-(int) mac_addr[4], (int) mac_addr[5]);
+info->hardware_address =
+g_strdup_printf("%02x:%02x:%02x:%02x:%02x:%02x",
+(int) mac_addr[0], (int) mac_addr[1],
+(int) mac_addr[2], (int) mac_addr[3],
+(int) mac_addr[4], (int) mac_addr[5]);
 
-info->has_hardware_address = true;
+info->has_hardware_address = true;
+}
+close(sock);
 }
 
 if (ifa->ifa_addr &&
 ifa->ifa_addr->sa_family == AF_INET) {
 /* interface with IPv4 address */
 p = &((struct sockaddr_in *)ifa->ifa_addr)->sin_addr;
 if (!inet_ntop(AF_INET, p, addr4, sizeof(addr4))) {
 error_setg_errno(errp, errno, "inet_ntop failed");
 goto error;
 }
-- 
2.11.0




[PATCH v2 3/5] qga/commands-posix: Fix listing ifaces for Solaris

2022-04-13 Thread Andrew Deason
The code for guest-network-get-interfaces needs a couple of small
adjustments for Solaris:

- The results from SIOCGIFHWADDR are documented as being in ifr_addr,
  not ifr_hwaddr (ifr_hwaddr doesn't exist on Solaris).

- The implementation of guest_get_network_stats is Linux-specific, so
  hide it under #ifdef CONFIG_LINUX. On non-Linux, we just won't
  provide network interface stats.

Signed-off-by: Andrew Deason 
Reviewed-by: Michal Privoznik 
---
 qga/commands-posix.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index bd0d67f674..c0b00fc488 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -2781,20 +2781,21 @@ guest_find_interface(GuestNetworkInterfaceList *head,
 return head->value;
 }
 }
 
 return NULL;
 }
 
 static int guest_get_network_stats(const char *name,
GuestNetworkInterfaceStat *stats)
 {
+#ifdef CONFIG_LINUX
 int name_len;
 char const *devinfo = "/proc/net/dev";
 FILE *fp;
 char *line = NULL, *colon;
 size_t n = 0;
 fp = fopen(devinfo, "r");
 if (!fp) {
 return -1;
 }
 name_len = strlen(name);
@@ -2836,20 +2837,21 @@ static int guest_get_network_stats(const char *name,
 stats->tx_errs = tx_errs;
 stats->tx_dropped = tx_dropped;
 fclose(fp);
 g_free(line);
 return 0;
 }
 }
 fclose(fp);
 g_free(line);
 g_debug("/proc/net/dev: Interface '%s' not found", name);
+#endif /* CONFIG_LINUX */
 return -1;
 }
 
 /*
  * Build information about guest interfaces
  */
 GuestNetworkInterfaceList *qmp_guest_network_get_interfaces(Error **errp)
 {
 GuestNetworkInterfaceList *head = NULL, **tail = &head;
 struct ifaddrs *ifap, *ifa;
@@ -2901,22 +2903,25 @@ GuestNetworkInterfaceList 
*qmp_guest_network_get_interfaces(Error **errp)
 if (errno == EADDRNOTAVAIL) {
 /* The interface doesn't have a hw addr (e.g. loopback). */
 g_debug("failed to get MAC address of %s: %s",
 ifa->ifa_name, strerror(errno));
 } else {
 g_warning("failed to get MAC address of %s: %s",
   ifa->ifa_name, strerror(errno));
 }
 
 } else {
+#ifdef CONFIG_SOLARIS
+mac_addr = (unsigned char *) &ifr.ifr_addr.sa_data;
+#else
 mac_addr = (unsigned char *) &ifr.ifr_hwaddr.sa_data;
-
+#endif
 info->hardware_address =
 g_strdup_printf("%02x:%02x:%02x:%02x:%02x:%02x",
 (int) mac_addr[0], (int) mac_addr[1],
 (int) mac_addr[2], (int) mac_addr[3],
 (int) mac_addr[4], (int) mac_addr[5]);
 
 info->has_hardware_address = true;
 }
 close(sock);
 }
-- 
2.11.0




[PATCH v2 5/5] qga/commands-posix: 'guest-shutdown' for Solaris

2022-04-13 Thread Andrew Deason
On Solaris, instead of the -P, -H, and -r flags, we need to provide
the target init state to the 'shutdown' command: state 5 is poweroff,
0 is halt, and 6 is reboot. We also need to pass -g0 to avoid the
default 60-second delay, and -y to avoid a confirmation prompt.
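
For reference, with these flags the child process therefore ends up running
roughly /sbin/shutdown -i5 -g0 -y "hypervisor initiated shutdown" for a
powerdown, mirroring the execle() call in the hunk that follows.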

Implement this logic under an #ifdef CONFIG_SOLARIS, so the
'guest-shutdown' command works properly on Solaris.

Signed-off-by: Andrew Deason 
---
Changes since v1:
- new in v2

 qga/commands-posix.c | 21 ++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 97e001e998..8c30a9e575 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -88,43 +88,58 @@ static void ga_wait_child(pid_t pid, int *status, Error 
**errp)
 g_assert(rpid == pid);
 }
 
 void qmp_guest_shutdown(bool has_mode, const char *mode, Error **errp)
 {
 const char *shutdown_flag;
 Error *local_err = NULL;
 pid_t pid;
 int status;
 
+#ifdef CONFIG_SOLARIS
+const char *powerdown_flag = "-i5";
+const char *halt_flag = "-i0";
+const char *reboot_flag = "-i6";
+#else
+const char *powerdown_flag = "-P";
+const char *halt_flag = "-H";
+const char *reboot_flag = "-r";
+#endif
+
 slog("guest-shutdown called, mode: %s", mode);
 if (!has_mode || strcmp(mode, "powerdown") == 0) {
-shutdown_flag = "-P";
+shutdown_flag = powerdown_flag;
 } else if (strcmp(mode, "halt") == 0) {
-shutdown_flag = "-H";
+shutdown_flag = halt_flag;
 } else if (strcmp(mode, "reboot") == 0) {
-shutdown_flag = "-r";
+shutdown_flag = reboot_flag;
 } else {
 error_setg(errp,
"mode is invalid (valid values are: halt|powerdown|reboot");
 return;
 }
 
 pid = fork();
 if (pid == 0) {
 /* child, start the shutdown */
 setsid();
 reopen_fd_to_null(0);
 reopen_fd_to_null(1);
 reopen_fd_to_null(2);
 
+#ifdef CONFIG_SOLARIS
+execle("/sbin/shutdown", "shutdown", shutdown_flag, "-g0", "-y",
+   "hypervisor initiated shutdown", (char *)NULL, environ);
+#else
 execle("/sbin/shutdown", "shutdown", "-h", shutdown_flag, "+0",
"hypervisor initiated shutdown", (char *)NULL, environ);
+#endif
 _exit(EXIT_FAILURE);
 } else if (pid < 0) {
 error_setg_errno(errp, errno, "failed to create child process");
 return;
 }
 
 ga_wait_child(pid, &status, &local_err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
-- 
2.11.0




[PATCH v2 0/5] qga: Implement shutdown/network-get-interfaces on Solaris

2022-04-13 Thread Andrew Deason
This implements the guest agent commands guest-network-get-interfaces and
guest-shutdown on Solaris. The implementation for these on Solaris is very
similar to that on Linux, since both platforms have a similar getifaddrs() and a
'shutdown' command.

Changes since v1:
- Add debug messages for failing to get network iface stats
- Add implementation for 'guest-shutdown'

Andrew Deason (5):
  qga/commands-posix: Use getifaddrs when available
  qga/commands-posix: Fix iface hw address detection
  qga/commands-posix: Fix listing ifaces for Solaris
  qga/commands-posix: Log all net stats failures
  qga/commands-posix: 'guest-shutdown' for Solaris

 meson.build  |   1 +
 qga/commands-posix.c | 513 ---
 2 files changed, 282 insertions(+), 232 deletions(-)

-- 
2.11.0




[PATCH v2 4/5] qga/commands-posix: Log all net stats failures

2022-04-13 Thread Andrew Deason
guest_get_network_stats can silently fail in a couple of ways. Add
debug messages to these cases, so we're never completely silent on
failure.

Signed-off-by: Andrew Deason 
---
Changes since v1:
- new in v2

 qga/commands-posix.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index c0b00fc488..97e001e998 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -2789,20 +2789,22 @@ static int guest_get_network_stats(const char *name,
GuestNetworkInterfaceStat *stats)
 {
 #ifdef CONFIG_LINUX
 int name_len;
 char const *devinfo = "/proc/net/dev";
 FILE *fp;
 char *line = NULL, *colon;
 size_t n = 0;
 fp = fopen(devinfo, "r");
 if (!fp) {
+g_debug("failed to open network stats %s: %s", devinfo,
+g_strerror(errno));
 return -1;
 }
 name_len = strlen(name);
 while (getline(&line, &n, fp) != -1) {
 long long dummy;
 long long rx_bytes;
 long long rx_packets;
 long long rx_errs;
 long long rx_dropped;
 long long tx_bytes;
@@ -2837,21 +2839,23 @@ static int guest_get_network_stats(const char *name,
 stats->tx_errs = tx_errs;
 stats->tx_dropped = tx_dropped;
 fclose(fp);
 g_free(line);
 return 0;
 }
 }
 fclose(fp);
 g_free(line);
 g_debug("/proc/net/dev: Interface '%s' not found", name);
-#endif /* CONFIG_LINUX */
+#else /* !CONFIG_LINUX */
+g_debug("Network stats reporting available only for Linux");
+#endif /* !CONFIG_LINUX */
 return -1;
 }
 
 /*
  * Build information about guest interfaces
  */
 GuestNetworkInterfaceList *qmp_guest_network_get_interfaces(Error **errp)
 {
 GuestNetworkInterfaceList *head = NULL, **tail = &head;
 struct ifaddrs *ifap, *ifa;
-- 
2.11.0




[PATCH v2 1/5] qga/commands-posix: Use getifaddrs when available

2022-04-13 Thread Andrew Deason
Currently, commands-posix.c assumes that getifaddrs() is only
available on Linux, and so the related guest agent command
guest-network-get-interfaces is only implemented for #ifdef __linux__.
This function does exist on other platforms, though, such as Solaris.
So, add a meson check for getifaddrs(), and move the code for
guest-network-get-interfaces to be built whenever getifaddrs() is
available.

The implementation for guest-network-get-interfaces still has some
Linux-specific code, which is not fixed in this commit. This commit
moves the relevant big chunks of code around without changing them, so
a future commit can change the code in place.
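
For context, the getifaddrs() walk at the heart of the moved code looks
roughly like this (minimal sketch, illustration only, error handling trimmed):

#include <ifaddrs.h>
#include <sys/socket.h>
#include <stdio.h>

/* Enumerate interfaces via getifaddrs(), as available on both Linux
 * and Solaris. */
int main(void)
{
    struct ifaddrs *ifap, *ifa;

    if (getifaddrs(&ifap) == -1) {
        perror("getifaddrs");
        return 1;
    }
    for (ifa = ifap; ifa; ifa = ifa->ifa_next) {
        printf("%s family=%d\n", ifa->ifa_name,
               ifa->ifa_addr ? ifa->ifa_addr->sa_family : -1);
    }
    freeifaddrs(ifap);
    return 0;
}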

Signed-off-by: Andrew Deason 
Reviewed-by: Michal Privoznik 
---
 meson.build  |   1 +
 qga/commands-posix.c | 474 ++-
 2 files changed, 246 insertions(+), 229 deletions(-)

diff --git a/meson.build b/meson.build
index 861de93c4f..1c033bcc58 100644
--- a/meson.build
+++ b/meson.build
@@ -1633,20 +1633,21 @@ config_host_data.set('CONFIG_MEMALIGN', 
cc.has_function('memalign'))
 config_host_data.set('CONFIG_PPOLL', cc.has_function('ppoll'))
 config_host_data.set('CONFIG_PREADV', cc.has_function('preadv', prefix: 
'#include '))
 config_host_data.set('CONFIG_PTHREAD_FCHDIR_NP', 
cc.has_function('pthread_fchdir_np'))
 config_host_data.set('CONFIG_SEM_TIMEDWAIT', cc.has_function('sem_timedwait', 
dependencies: threads))
 config_host_data.set('CONFIG_SENDFILE', cc.has_function('sendfile'))
 config_host_data.set('CONFIG_SETNS', cc.has_function('setns') and 
cc.has_function('unshare'))
 config_host_data.set('CONFIG_SYNCFS', cc.has_function('syncfs'))
 config_host_data.set('CONFIG_SYNC_FILE_RANGE', 
cc.has_function('sync_file_range'))
 config_host_data.set('CONFIG_TIMERFD', cc.has_function('timerfd_create'))
 config_host_data.set('HAVE_COPY_FILE_RANGE', 
cc.has_function('copy_file_range'))
+config_host_data.set('HAVE_GETIFADDRS', cc.has_function('getifaddrs'))
 config_host_data.set('HAVE_OPENPTY', cc.has_function('openpty', dependencies: 
util))
 config_host_data.set('HAVE_STRCHRNUL', cc.has_function('strchrnul'))
 config_host_data.set('HAVE_SYSTEM_FUNCTION', cc.has_function('system', prefix: 
'#include <stdlib.h>'))
 if rdma.found()
   config_host_data.set('HAVE_IBV_ADVISE_MR',
cc.has_function('ibv_advise_mr',
args: config_host['RDMA_LIBS'].split(),
prefix: '#include <infiniband/verbs.h>'))
 endif
 
diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 75dbaab68e..e0feb5ffb5 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -37,38 +37,45 @@
 #include <crt_externs.h>
 #define environ (*_NSGetEnviron())
 #else
 extern char **environ;
 #endif
 #endif
 
 #if defined(__linux__)
 #include 
 #include 
-#include 
-#include 
-#include 
-#include 
 #include 
 
 #ifdef CONFIG_LIBUDEV
 #include 
 #endif
 
 #ifdef FIFREEZE
 #define CONFIG_FSFREEZE
 #endif
 #ifdef FITRIM
 #define CONFIG_FSTRIM
 #endif
 #endif
 
+#ifdef HAVE_GETIFADDRS
+#include 
+#include 
+#include 
+#include 
+#include 
+#ifdef CONFIG_SOLARIS
+#include 
+#endif
+#endif
+
 static void ga_wait_child(pid_t pid, int *status, Error **errp)
 {
 pid_t rpid;
 
 *status = 0;
 
 do {
 rpid = waitpid(pid, status, 0);
 } while (rpid == -1 && errno == EINTR);
 
@@ -2147,237 +2154,20 @@ void qmp_guest_suspend_disk(Error **errp)
 void qmp_guest_suspend_ram(Error **errp)
 {
 guest_suspend(SUSPEND_MODE_RAM, errp);
 }
 
 void qmp_guest_suspend_hybrid(Error **errp)
 {
 guest_suspend(SUSPEND_MODE_HYBRID, errp);
 }
 
-static GuestNetworkInterface *
-guest_find_interface(GuestNetworkInterfaceList *head,
- const char *name)
-{
-for (; head; head = head->next) {
-if (strcmp(head->value->name, name) == 0) {
-return head->value;
-}
-}
-
-return NULL;
-}
-
-static int guest_get_network_stats(const char *name,
-   GuestNetworkInterfaceStat *stats)
-{
-int name_len;
-char const *devinfo = "/proc/net/dev";
-FILE *fp;
-char *line = NULL, *colon;
-size_t n = 0;
-fp = fopen(devinfo, "r");
-if (!fp) {
-return -1;
-}
-name_len = strlen(name);
-while (getline(&line, &n, fp) != -1) {
-long long dummy;
-long long rx_bytes;
-long long rx_packets;
-long long rx_errs;
-long long rx_dropped;
-long long tx_bytes;
-long long tx_packets;
-long long tx_errs;
-long long tx_dropped;
-char *trim_line;
-trim_line = g_strchug(line);
-if (trim_line[0] == '\0') {
-continue;
-}
-colon = strchr(trim_line, ':');
-if (!colon) {
-continue;
-}
-if (colon - name_len  == trim_line &&
-   strncmp(trim_line, name, name_len) == 0) {
-if (sscanf(colon + 1,
-"%lld %lld %lld %lld %lld %lld %lld %lld %lld 

Re: [PATCH v7 12/17] vfio-user: IOMMU support for remote device

2022-04-13 Thread Jag Raman


> On Apr 13, 2022, at 2:24 PM, Jag Raman  wrote:
> 
> 
> 
>> On Apr 13, 2022, at 10:25 AM, Igor Mammedov  wrote:
>> 
>> On Fri, 25 Mar 2022 15:19:41 -0400
>> Jagannathan Raman  wrote:
>> 
>>> Assign separate address space for each device in the remote processes.
>>> 
>>> Signed-off-by: Elena Ufimtseva 
>>> Signed-off-by: John G Johnson 
>>> Signed-off-by: Jagannathan Raman 
>>> ---
>>> include/hw/remote/iommu.h | 18 
>>> hw/remote/iommu.c | 95 +++
>>> MAINTAINERS   |  2 +
>>> hw/remote/meson.build |  1 +
>>> 4 files changed, 116 insertions(+)
>>> create mode 100644 include/hw/remote/iommu.h
>>> create mode 100644 hw/remote/iommu.c
>>> 
>>> diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
>>> new file mode 100644
>>> index 00..8f850400f1
>>> --- /dev/null
>>> +++ b/include/hw/remote/iommu.h
>>> @@ -0,0 +1,18 @@
>>> +/**
>>> + * Copyright © 2022 Oracle and/or its affiliates.
>>> + *
>>> + * This work is licensed under the terms of the GNU GPL, version 2 or 
>>> later.
>>> + * See the COPYING file in the top-level directory.
>>> + *
>>> + */
>>> +
>>> +#ifndef REMOTE_IOMMU_H
>>> +#define REMOTE_IOMMU_H
>>> +
>>> +#include "hw/pci/pci_bus.h"
>>> +
>>> +void remote_configure_iommu(PCIBus *pci_bus);
>>> +
>>> +void remote_iommu_del_device(PCIDevice *pci_dev);
>>> +
>>> +#endif
>>> diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
>>> new file mode 100644
>>> index 00..13f329b45d
>>> --- /dev/null
>>> +++ b/hw/remote/iommu.c
>>> @@ -0,0 +1,95 @@
>>> +/**
>>> + * IOMMU for remote device
>>> + *
>>> + * Copyright © 2022 Oracle and/or its affiliates.
>>> + *
>>> + * This work is licensed under the terms of the GNU GPL, version 2 or 
>>> later.
>>> + * See the COPYING file in the top-level directory.
>>> + *
>>> + */
>>> +
>>> +#include "qemu/osdep.h"
>>> +#include "qemu-common.h"
>>> +
>>> +#include "hw/remote/iommu.h"
>>> +#include "hw/pci/pci_bus.h"
>>> +#include "hw/pci/pci.h"
>>> +#include "exec/memory.h"
>>> +#include "exec/address-spaces.h"
>>> +#include "trace.h"
>>> +
>>> +struct RemoteIommuElem {
>>> +AddressSpace  as;
>>> +MemoryRegion  mr;
>>> +};
>>> +
>>> +struct RemoteIommuTable {
>>> +QemuMutex lock;
>>> +GHashTable *elem_by_bdf;
>>> +} remote_iommu_table;
>>> +
>>> +#define INT2VOIDP(i) (void *)(uintptr_t)(i)
>>> +
>>> +static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
>>> +  void *opaque, int devfn)
>>> +{
>>> +struct RemoteIommuTable *iommu_table = opaque;
>>> +struct RemoteIommuElem *elem = NULL;
>>> +int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_bus), devfn);
>>> +
>>> +elem = g_hash_table_lookup(iommu_table->elem_by_bdf, 
>>> INT2VOIDP(pci_bdf));
>>> +
>>> +if (!elem) {
>>> +g_autofree char *mr_name = g_strdup_printf("vfu-ram-%d", pci_bdf);
>>> +g_autofree char *as_name = g_strdup_printf("vfu-as-%d", pci_bdf);
>>> +
>>> +elem = g_malloc0(sizeof(struct RemoteIommuElem));
>>> +
>>> +memory_region_init(&elem->mr, NULL, mr_name, UINT64_MAX);
>> goes here:
>>  memory_region_do_init()
>>   if (!owner) {
>>       owner = container_get(qdev_get_machine(), "/unattached");
>>   }
>> 
>> then
>> 
>>> +address_space_init(&elem->as, &elem->mr, as_name);
>>> +
>>> +qemu_mutex_lock(&iommu_table->lock);
>>> +g_hash_table_insert(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf), 
>>> elem);
>>> +qemu_mutex_unlock(&iommu_table->lock);
>>> +}
>>> +
>>> +return &elem->as;
>>> +}
>>> +
>>> +static void remote_iommu_del_elem(gpointer data)
>>> +{
>>> +struct RemoteIommuElem *elem = data;
>>> +
>>> +g_assert(elem);
>>> +
>>> +memory_region_unref(&elem->mr);
>> 
>> here we call
>> object_unref(mr->owner); 
>> leaving dangling pointer in owner '(qdev_get_machine(), "/unattached")'
>> it doesn't look correct
>> 
>> I thought that memory_region_unref() should be always paired with 
>> memory_region_ref()
>> 
>> and looking at memory_region_init(...owner...) history it looks like
>> owner-less (NULL) regions are not meant to be deleted ever.
> 
> Hi Igor,
> 
> Thanks for the pointers about ref counters for MemoryRegions.
> 
> It makes sense - MemoryRegions are not QEMU Objects. So their
> owner’s ref counters are used instead. So the expectation is that
> when the owner is destroyed, the MemoryRegions initialized by them
> also get destroyed simultaneously.

Well, MemoryRegions are indeed QEMU objects -
"memory_region_init() -> object_initialize()" initializes the object.
So we should be able to unref the MemoryRegion object directly.

We could make the PCIDevice the owner of its IOMMU region -
when the device is finalized, its region would be finalized as well.
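
For instance, a minimal sketch of that ownership in the allocation path
quoted above (the pci_dev lookup is an assumption; memory_region_init() and
OBJECT() are the real QEMU APIs):

    PCIDevice *pci_dev = pci_bus->devices[devfn];

    elem = g_malloc0(sizeof(struct RemoteIommuElem));
    memory_region_init(&elem->mr, OBJECT(pci_dev), mr_name, UINT64_MAX);
    address_space_init(&elem->as, &elem->mr, as_name);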

Given the above, I don’t think we would need a separate delete
function (such as remote_iommu_del_device()). When the device is

Re: [PATCH for-7.1 8/8] nbd: document what is protected by the CoMutexes

2022-04-13 Thread Eric Blake
On Tue, Apr 12, 2022 at 09:42:04PM +0200, Paolo Bonzini wrote:
> Signed-off-by: Paolo Bonzini 
> ---
>  block/nbd.c | 8 +++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/block/nbd.c b/block/nbd.c
> index 8954243f50..8297da7e89 100644
> --- a/block/nbd.c
> +++ b/block/nbd.c
> @@ -82,12 +82,18 @@ typedef struct BDRVNBDState {
>  NBDClientRequest requests[MAX_NBD_REQUESTS];
>  QEMUTimer *reconnect_delay_timer;
>  
> +/* Protects sending data on the socket.  */
>  CoMutex send_mutex;
> +
> +/*
> + * Protects receiving reply headers from the socket, as well as the
> + * fields reply, requests[].receiving and requests[].reply_possible
> + */
>  CoMutex receive_mutex;
> +NBDReply reply;
>  
>  QEMUTimer *open_timer;
>  
> -NBDReply reply;
>  BlockDriverState *bs;

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH for-7.1 7/8] nbd: take receive_mutex when reading requests[].receiving

2022-04-13 Thread Eric Blake
On Tue, Apr 12, 2022 at 09:42:03PM +0200, Paolo Bonzini wrote:
> requests[].receiving is set by nbd_receive_replies() under the receive_mutex;
> read it under the same mutex as well.  Waking up receivers on errors happens
> after each reply finishes processing, in nbd_co_receive_one_chunk().
> If there is no currently-active reply, there are two cases:
> 
> * either there is no active request at all, in which case no
> element of request[] can have .receiving = true
> 
> * or nbd_receive_replies() must be running and waiting for receive_mutex;
> in that case it will get back to nbd_co_receive_one_chunk() because
> the socket has been shutdown, and all waiting coroutines will wake up
> in turn.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  block/nbd.c | 15 +++
>  1 file changed, 7 insertions(+), 8 deletions(-)
>

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [RFC PATCH 0/5] Removal of AioContext lock, bs->parents and ->children: proof of concept

2022-04-13 Thread Paolo Bonzini

On 4/13/22 16:51, Kevin Wolf wrote:

So the idea is that we can do bdrv_graph_co_rdlock() in one thread and
the corresponding bdrv_graph_co_rdunlock() in a different thread?

Would the unlock somehow remember the original thread, or do you use the
"sum is correct" argument and allow negative counter values, so you can
end up having count +1 in A and -1 in B to represent "no active
readers"? If this happens, it's likely to happen many times, so do we
have to take integer overflows into account then?


The counter cannot be negative, so you can use uint32_t and sum modulo 
2^32.  You might have a thread with counter 2^31+1 which is negative in 
twos complement; and a thread with counter -2^31-1 which is positive in 
twos complement; but their sum cancels out correctly.
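
A minimal sketch of that arithmetic in plain C (illustrative only, not the
actual block-layer lock; per-thread counters are modeled as a plain array):

    #include <stdint.h>
    #include <stdio.h>

    #define NTHREADS 2

    /* One counter per thread; unlock may run in a different thread
     * than the matching lock, so a single counter can wrap. */
    static uint32_t reader_count[NTHREADS];

    static void rdlock(int t)   { reader_count[t]++; }
    static void rdunlock(int t) { reader_count[t]--; }

    static uint32_t total_readers(void)
    {
        uint32_t sum = 0;    /* unsigned: sums modulo 2^32 by design */
        for (int i = 0; i < NTHREADS; i++) {
            sum += reader_count[i];
        }
        return sum;
    }

    int main(void)
    {
        rdlock(0);     /* taken in thread 0:    counts 1, 0 */
        rdunlock(1);   /* released in thread 1: counts 1, 0xffffffff */
        printf("%u\n", total_readers());   /* prints 0: no readers */
        return 0;
    }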


Paolo



Re: [RFC PATCH 0/5] Removal of AioContext lock, bs->parents and ->children: proof of concept

2022-04-13 Thread Paolo Bonzini

On 4/13/22 18:29, Kevin Wolf wrote:

A reader does not have to be a coroutine. AIO_WAIT_WHILE is not
mandatory to allow it to finish, it helps to ensure progress in case
some reader is waiting for something, but other than that is not
necessary IMO.

When it's outside of a coroutine, how would you implement waiting for a
writer to finish if not with AIO_WAIT_WHILE()?



In the main thread a non-coroutine can always read the graph though, 
because the only writer can be the main thread.


If the critical sections are large enough, I don't think rdlock needs to 
be taken outside a coroutine in the iothread, e.g. in a bottom half.



No I think if we focus on small pieces of code we end up having a
million lock/unlock pairs.


Yes, I agree. On the other hand, if we're taking the locks in high-level
outer operations, avoiding to take the lock recursively might become
harder. I guess we'll see how it works out when we actually introduce
callers.


My "hope" is that taking the locks in blk_* functions covers most of the 
calls, and then only a few (dozens) direct uses of bdrv_* remain.


Unfortunately, adding assertions is not easy because "is there a reader" 
cannot be easily answered.  But I think Emanuele has a debug mode that 
can enable the assertions at a performance cost.


Paolo



Re: [PATCH for-7.1 6/8] nbd: move s->state under requests_lock

2022-04-13 Thread Paolo Bonzini

On 4/13/22 18:23, Eric Blake wrote:


The function nbd_client_connecting_wait() was used mostly to check if
a request had to be reissued (outside requests_lock), but also
under requests_lock in nbd_client_connecting_wait().  The two uses have to

"Function A was used mostly..., but also under requests_lock in
function A."  Reading the rest of the patch, I think...[1]


be separated; for the former we rename it to nbd_client_will_reconnect()
and make it take s->requests_lock; for the latter the access can simply
be inlined.  The new name is clearer, and ensures that a missing
conversion is caught by the compiler.


I take it your experiments with C++ coroutines helped find this;)


No, they never went that far. :)  Rather, these atomics have always 
bugged me, and after Emanuele pointed me to the enter_all without lock, 
I noticed that they can be fixed with the same hammer.



+QEMU_LOCK_GUARD(&s->requests_lock);
+return s->state == NBD_CLIENT_CONNECTING_WAIT;
 }


[2]...while here, you only needed two lines, using QEMU_LOCK_GUARD.
Both styles work, but it seems like we should be consistent, and I
would favor the shorter style when all that is being guarded is a
single line.



QEMU_LOCK_GUARD() is a declaration in some sense (well, it is also a 
declaration when you expand the macro) and QEMU in general doesn't do 
declaration-after-statement.


Also, QEMU_LOCK_GUARD() emphasizes that the whole function is guarded, 
while WITH_QEMU_LOCK_GUARD() has the opposite effect on the reader.
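
For illustration, the two styles side by side (QEMU_LOCK_GUARD and
WITH_QEMU_LOCK_GUARD are the real macros from include/qemu/lockable.h; the
struct and functions here are made up for the sketch):

    typedef struct Demo {
        QemuMutex lock;
        int state;
    } Demo;

    /* Reads like a declaration and guards the whole function body. */
    static int demo_get_state(Demo *d)
    {
        QEMU_LOCK_GUARD(&d->lock);
        return d->state;
    }

    /* Guards only the braced block, signalling a deliberately small
     * critical section. */
    static void demo_set_state(Demo *d, int state)
    {
        WITH_QEMU_LOCK_GUARD(&d->lock) {
            d->state = state;
        }
        /* work outside the critical section continues here */
    }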



although the suggestion in [3] to split out the function motion to a
separate patch may result in the v2 series looking different enough
that you may want to leave off my R-b to ensure I still review things
carefully.


Will do.

Paolo



Re: [PATCH for-7.1 5/8] nbd: use a QemuMutex to synchronize reconnection with coroutines

2022-04-13 Thread Paolo Bonzini

On 4/13/22 17:55, Eric Blake wrote:

-g_assert(qemu_in_coroutine());

Why is this assert dropped?  Is it because we've marked the function
with coroutine_fn?  If so, should we drop it earlier in the series,
when you added the label?


The label doesn't guarantee much, but in this case it's pretty clear-cut 
that we must be in a coroutine, the code just doesn't make sense otherwise.


We're also about to take a CoMutex (and before the patch we had already 
taken it) so if anywhere the place for the assertion would be 
qemu_co_mutex_lock().



Otherwise, the patch makes sense to me.





Re: [PATCH for-7.1 2/8] nbd: mark more coroutine_fns

2022-04-13 Thread Paolo Bonzini

On 4/13/22 14:25, Eric Blake wrote:

-static bool nbd_recv_coroutine_wake_one(NBDClientRequest *req)
+static bool coroutine_fn nbd_recv_coroutine_wake_one(NBDClientRequest *req)

This already has _coroutine_ in the name, would it be better as _co_?


  {
  if (req->receiving) {
  req->receiving = false;
@@ -144,7 +144,7 @@ static bool nbd_recv_coroutine_wake_one(NBDClientRequest 
*req)
  return false;
  }

-static void nbd_recv_coroutines_wake(BDRVNBDState *s, bool all)
+static void coroutine_fn nbd_recv_coroutines_wake(BDRVNBDState *s, bool all)

This already has _coroutines_ in the name, would it be better as _co_?


These mean "wake a coroutine", not "I'm in a coroutine", so I'd say they 
are fine as is.


Paolo



Re: [PATCH] target/i386: do not access beyond the low 128 bits of SSE registers

2022-04-13 Thread Paolo Bonzini

On 4/13/22 20:44, Alex Bennée wrote:

And:

Fixes: b7711471f5 ("target-i386: make xmm_regs 512-bit wide")
Resolves:https://gitlab.com/qemu-project/qemu/-/issues/420


It's not really a bug, beyond the possibility of uninitialized data in 
the migration stream.  The reporter was probably using a fork of QEMU.


Paolo



Re: [PATCH v7 12/17] vfio-user: IOMMU support for remote device

2022-04-13 Thread Peter Xu
On Wed, Apr 13, 2022 at 04:37:35PM +0200, Igor Mammedov wrote:
> On Thu, 31 Mar 2022 08:41:01 -0400
> Peter Xu  wrote:
> 
> > On Thu, Mar 31, 2022 at 10:47:33AM +0100, Stefan Hajnoczi wrote:
> > > On Wed, Mar 30, 2022 at 01:13:03PM -0400, Peter Xu wrote:  
> > > > On Wed, Mar 30, 2022 at 05:08:24PM +0100, Stefan Hajnoczi wrote:  
> > > > > On Wed, Mar 30, 2022 at 08:53:16AM -0400, Peter Xu wrote:  
> > > > > > On Wed, Mar 30, 2022 at 11:04:24AM +0100, Stefan Hajnoczi wrote:  
> > > > > > > This makes me wonder whether there is a deeper issue with the
> > > > > > > pci_setup_iommu() API: the lack of per-device cleanup callbacks.
> > > > > > > Per-device IOMMU resources should be freed when a device is hot
> > > > > > > unplugged.
> > > > > > > 
> > > > > > > From what I can tell this is not the case today:
> > > > > > > 
> > > > > > > - hw/i386/intel_iommu.c:vtd_find_add_as() allocates and adds 
> > > > > > > device
> > > > > > >   address spaces but I can't find where they are removed and 
> > > > > > > freed.
> > > > > > >   VTDAddressSpace instances pointed to from vtd_bus->dev_as[] are 
> > > > > > > leaked.
> > > > > > > 
> > > > > > > - hw/i386/amd_iommu.c has similar leaks.  
> > > > > > 
> > > > > > AFAICT it's because there's no device-specific data cached in the
> > > > > > per-device IOMMU address space, at least so far.  IOW, all the data
> > > > > > structures allocated here can be re-used when a new device is 
> > > > > > plugged in
> > > > > > after the old device unplugged.
> > > > > > 
> > > > > > It's definitely not ideal since after unplug (and before a new 
> > > > > > device
> > > > > > plugged in) the resource is not needed at all so it's kind of 
> > > > > > wasted, but
> > > > > > it should work functionally.  If to achieve that, some 
> > > > > > iommu_unplug() or
> > > > > > iommu_cleanup() hook sounds reasonable.  
> > > > > 
> > > > > I guess the question is whether PCI busses can be hotplugged with
> > > > > IOMMUs. If yes, then there is a memory leak that matters for
> > > > > intel_iommu.c and amd_iommu.c.  
> > > > 
> > > > It can't, and we only support one vIOMMU so far for both (commit
> > > > 1b3bf13890fd849b26).  Thanks,  
> > > 
> > > I see, thanks!
> > > 
> > > Okay, summarizing options for the vfio-user IOMMU:
> > > 
> > > 1. Use the same singleton approach as existing IOMMUs where the
> > >MemoryRegion/AddressSpace are never freed. Don't bother deleting.
> > > 
> > > 2. Keep the approach in this patch where vfio-user code manually calls a
> > >custom delete function (not part of the pci_setup_iommu() API). This
> > >is slightly awkward to do without global state and that's what
> > >started this discussion.
> > > 
> > > 3. Introduce an optional pci_setup_iommu() callback:
> > > 
> > >typedef void (*PCIIOMMUDeviceUnplug)(PCIBus *bus, void *opaque, int
> > > devfn);
> > > 
> > >Solves the awkwardness of option #2. Not needed by existing IOMMU
> > >devices.  
> > 
> > Looks all workable to me.  One tiny thing is if we'd like 3) we may want to
> > pass over the PCIDevice* too because in this case IIUC we'd need to double
> > check the device class before doing anything - we may not want to call the
> > vfio-user callbacks for general emulated devices under the same pci bus.
> > 
> > I think we could also fetch that from PCIBus.devices[devfn] but that's just
> > not as obvious.
> > 
> > Option 4) is as mentioned previously, that we add another device unplug
> > hook that can be registered per-device.  I just didn't think thoroughly on
> can you expand on why per device hook is needed?

E.g. when the pci bus that contains the vfio-user device also contains
another emulated device?  Then IIUC we only want to call the vfio-user hook
for the vfio-user device, not the rest ones on the same bus?

Per-bus will work too, but again then the per-bus hook will need to first
identify the PCIDevice* object so it'll work similarly as a per-device hook.

> 
> > how it would interact with the current HotplugHandler design yet.. it looks
> > quite similar but so far it's either for the machine type or pci bus, not
> > capable of registering on one single device (and it's always a mistery to
> > me why we'd rather ignore the per-bus hook if the machine hook
> > existed.. that's in qdev_get_hotplug_handler).
> 
> machine hook is there for bus-less devices mainly, if it's not defined
> code will fallback to bus handler if any exists.
> 
> However machine hook can also be used to override default hotplug chain
> to do to implement non trivial plug/unplug flow.
> for example see pc_get_hotplug_handler(), corresponding
> pc_machine_device_[pre_plug|plug|unplug...]_cb() where
> plug/unplug chain is altered for some PCI devices types.
> Perhaps the same can be done for vfio.

It just seems non-obvious, no?  For example, if someone implements a pci
bus with hotplug_handler() being provided, it will surprise me a bit if
it's triggered conditionally, depending on which 

Re: [PATCH] target/i386: do not access beyond the low 128 bits of SSE registers

2022-04-13 Thread Alex Bennée


Paolo Bonzini  writes:

> The i386 target consolidates all vector registers so that instead of
> XMMReg, YMMReg and ZMMReg structs there is a single ZMMReg that can
> fit all of SSE, AVX and AVX512.
>
> When TCG copies data from and to the SSE registers, it uses the
> full 64-byte width.  This is not a correctness issue because TCG
> never lets guest code see beyond the first 128 bits of the ZMM
> registers, however it causes uninitialized stack memory to
> make it to the CPU's migration stream.
>
> Fix it by only copying the low 16 bytes of the ZMMReg union into
> the destination register.
>
> Signed-off-by: Paolo Bonzini 

And:

Fixes: b7711471f5 ("target-i386: make xmm_regs 512-bit wide")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/420

?
-- 
Alex Bennée



Re: [PATCH v2 13/39] exec/log: Remove log_disas and log_target_disas

2022-04-13 Thread Alex Bennée


Richard Henderson  writes:

> These functions are no longer used.
>
> Signed-off-by: Richard Henderson 

Reviewed-by: Alex Bennée 

-- 
Alex Bennée



Re: [PATCH v7 12/17] vfio-user: IOMMU support for remote device

2022-04-13 Thread Jag Raman


> On Apr 13, 2022, at 10:25 AM, Igor Mammedov  wrote:
> 
> On Fri, 25 Mar 2022 15:19:41 -0400
> Jagannathan Raman  wrote:
> 
>> Assign separate address space for each device in the remote processes.
>> 
>> Signed-off-by: Elena Ufimtseva 
>> Signed-off-by: John G Johnson 
>> Signed-off-by: Jagannathan Raman 
>> ---
>> include/hw/remote/iommu.h | 18 
>> hw/remote/iommu.c | 95 +++
>> MAINTAINERS   |  2 +
>> hw/remote/meson.build |  1 +
>> 4 files changed, 116 insertions(+)
>> create mode 100644 include/hw/remote/iommu.h
>> create mode 100644 hw/remote/iommu.c
>> 
>> diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
>> new file mode 100644
>> index 00..8f850400f1
>> --- /dev/null
>> +++ b/include/hw/remote/iommu.h
>> @@ -0,0 +1,18 @@
>> +/**
>> + * Copyright © 2022 Oracle and/or its affiliates.
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#ifndef REMOTE_IOMMU_H
>> +#define REMOTE_IOMMU_H
>> +
>> +#include "hw/pci/pci_bus.h"
>> +
>> +void remote_configure_iommu(PCIBus *pci_bus);
>> +
>> +void remote_iommu_del_device(PCIDevice *pci_dev);
>> +
>> +#endif
>> diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
>> new file mode 100644
>> index 00..13f329b45d
>> --- /dev/null
>> +++ b/hw/remote/iommu.c
>> @@ -0,0 +1,95 @@
>> +/**
>> + * IOMMU for remote device
>> + *
>> + * Copyright © 2022 Oracle and/or its affiliates.
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu-common.h"
>> +
>> +#include "hw/remote/iommu.h"
>> +#include "hw/pci/pci_bus.h"
>> +#include "hw/pci/pci.h"
>> +#include "exec/memory.h"
>> +#include "exec/address-spaces.h"
>> +#include "trace.h"
>> +
>> +struct RemoteIommuElem {
>> +AddressSpace  as;
>> +MemoryRegion  mr;
>> +};
>> +
>> +struct RemoteIommuTable {
>> +QemuMutex lock;
>> +GHashTable *elem_by_bdf;
>> +} remote_iommu_table;
>> +
>> +#define INT2VOIDP(i) (void *)(uintptr_t)(i)
>> +
>> +static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
>> +  void *opaque, int devfn)
>> +{
>> +struct RemoteIommuTable *iommu_table = opaque;
>> +struct RemoteIommuElem *elem = NULL;
>> +int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_bus), devfn);
>> +
>> +elem = g_hash_table_lookup(iommu_table->elem_by_bdf, 
>> INT2VOIDP(pci_bdf));
>> +
>> +if (!elem) {
>> +g_autofree char *mr_name = g_strdup_printf("vfu-ram-%d", pci_bdf);
>> +g_autofree char *as_name = g_strdup_printf("vfu-as-%d", pci_bdf);
>> +
>> +elem = g_malloc0(sizeof(struct RemoteIommuElem));
>> +
>> +memory_region_init(&elem->mr, NULL, mr_name, UINT64_MAX);
> goes here:
>   memory_region_do_init()
>    if (!owner) {
>        owner = container_get(qdev_get_machine(), "/unattached");
>    }
> 
> then
> 
>> +address_space_init(&elem->as, &elem->mr, as_name);
>> +
>> +qemu_mutex_lock(&iommu_table->lock);
>> +g_hash_table_insert(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf), 
>> elem);
>> +qemu_mutex_unlock(&iommu_table->lock);
>> +}
>> +
>> +return &elem->as;
>> +}
>> +
>> +static void remote_iommu_del_elem(gpointer data)
>> +{
>> +struct RemoteIommuElem *elem = data;
>> +
>> +g_assert(elem);
>> +
>> +memory_region_unref(&elem->mr);
> 
> here we call
>  object_unref(mr->owner); 
> leaving dangling pointer in owner '(qdev_get_machine(), "/unattached")'
> it doesn't look correct
> 
> I thought that memory_region_unref() should be always paired with 
> memory_region_ref()
> 
> and looking at memory_region_init(...owner...) history it looks like
> owner-less (NULL) regions are not meant to be deleted ever.

Hi Igor,

Thanks for the pointers about ref counters for MemoryRegions.

It makes sense - MemoryRegions are not QEMU Objects. So their
owner’s ref counters are used instead. So the expectation is that
when the owner is destroyed, the MemoryRegions initialized by them
also get destroyed simultaneously.

In this case, RemoteIommuElem->mr does not have an owner. Therefore,
we don’t have to manage its ref counter. When RemoteIommuElem is
destroyed, the MemoryRegion should be cleaned up automatically.

--
Jag

> 
>> +address_space_destroy(&elem->as);
>> +
>> +g_free(elem);
>> +}
>> +
>> +void remote_iommu_del_device(PCIDevice *pci_dev)
>> +{
>> +int pci_bdf;
>> +
>> +if (!remote_iommu_table.elem_by_bdf || !pci_dev) {
>> +return;
>> +}
>> +
>> +pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), 
>> pci_dev->devfn);
>> +
>> +qemu_mutex_lock(&remote_iommu_table.lock);
>> +g_hash_table_remove(remote_iommu_table.elem_by_bdf, 

Re: [PATCH v5 04/13] mm/shmem: Restrict MFD_INACCESSIBLE memory against RLIMIT_MEMLOCK

2022-04-13 Thread Jason Gunthorpe
On Wed, Apr 13, 2022 at 06:24:56PM +0200, David Hildenbrand wrote:
> On 12.04.22 16:36, Jason Gunthorpe wrote:
> > On Fri, Apr 08, 2022 at 08:54:02PM +0200, David Hildenbrand wrote:
> > 
> >> RLIMIT_MEMLOCK was the obvious candidate, but as we discovered in the
> >> past already with secretmem, it's not 100% that good of a fit (unmovable
> >> is worse than mlocked). But it gets the job done for now at least.
> > 
> > No, it doesn't. There are too many different interpretations how
> > MEMLOCK is supposed to work
> > 
> > eg VFIO accounts per-process so hostile users can just fork to go past
> > it.
> > 
> > RDMA is per-process but uses a different counter, so you can double up
> > 
> > iouring is per-user and users a 3rd counter, so it can triple up on
> > the above two
> 
> Thanks for that summary, very helpful.

I kicked off a big discussion when I suggested to change vfio to use
the same as io_uring

We may still end up trying it, but the major concern is that libvirt
sets the RLIMIT_MEMLOCK and if we touch anything here - including
fixing RDMA, or anything really, it becomes a uAPI break for libvirt..

> >> So I'm open for alternative to limit the amount of unmovable memory we
> >> might allocate for user space, and then we could convert seretmem as well.
> > 
> > I think it has to be cgroup based considering where we are now :\
> 
> Most probably. I think the important lessons we learned are that
> 
> * mlocked != unmovable.
> * RLIMIT_MEMLOCK should most probably never have been abused for
>   unmovable memory (especially, long-term pinning)

The trouble is I'm not sure how anything can correctly/meaningfully
set a limit.

Consider qemu where we might have 3 different things all pinning the
same page (rdma, iouring, vfio) - should the cgroup give 3x the limit?
What use is that really?

IMHO there are only two meaningful scenarios - either you are unpriv
and limited to a very small number for your user/cgroup - or you are
priv and you can do whatever you want.

The idea we can fine tune this to exactly the right amount for a
workload does not seem realistic and ends up exporting internal kernel
decisions into a uAPI..

Jason



[PATCH v2] Warn user if the vga flag is passed but no vga device is created

2022-04-13 Thread Gautam Agrawal
A global boolean variable "vga_interface_created" (declared in
softmmu/globals.c) is used to track the creation of the VGA interface.
If a vga flag is passed on the command line, the "default_vga" variable
(declared in softmmu/vl.c) is set to 0. To warn the user, the condition
checks whether vga_interface_created is false and default_vga is equal
to 0. If "-vga none" is passed, this patch will not warn the user about
the creation of a VGA device.

The initialisation of the global variable "vga_interface_created" in
softmmu/globals.c has also been corrected.

The warning "A -vga option was passed but this machine type does not use
that option; no VGA device has been created" is logged if a vga flag is
passed but no VGA device is created.

"vga_interface_created" is also set in "xen_machine_pv.c" and
"fuloong2e.c". This patch has been tested for x86_64, i386, sparc,
sparc64 and arm boards.

Signed-off-by: Gautam Agrawal 
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/581
---
 hw/isa/isa-bus.c  | 1 +
 hw/mips/fuloong2e.c   | 1 +
 hw/pci/pci.c  | 1 +
 hw/sparc/sun4m.c  | 2 ++
 hw/sparc64/sun4u.c| 1 +
 hw/xenpv/xen_machine_pv.c | 1 +
 include/sysemu/sysemu.h   | 1 +
 softmmu/globals.c | 1 +
 softmmu/vl.c  | 6 ++
 9 files changed, 15 insertions(+)

diff --git a/hw/isa/isa-bus.c b/hw/isa/isa-bus.c
index 0ad1c5fd65..cd5ad3687d 100644
--- a/hw/isa/isa-bus.c
+++ b/hw/isa/isa-bus.c
@@ -166,6 +166,7 @@ bool isa_realize_and_unref(ISADevice *dev, ISABus *bus, 
Error **errp)
 
 ISADevice *isa_vga_init(ISABus *bus)
 {
+vga_interface_created = true;
 switch (vga_interface_type) {
 case VGA_CIRRUS:
 return isa_create_simple(bus, "isa-cirrus-vga");
diff --git a/hw/mips/fuloong2e.c b/hw/mips/fuloong2e.c
index c9f14e70a0..538453b426 100644
--- a/hw/mips/fuloong2e.c
+++ b/hw/mips/fuloong2e.c
@@ -321,6 +321,7 @@ static void mips_fuloong2e_init(MachineState *machine)
 
 /* GPU */
 if (vga_interface_type != VGA_NONE) {
+vga_interface_created = true;
 pci_dev = pci_new(-1, "ati-vga");
 dev = DEVICE(pci_dev);
 qdev_prop_set_uint32(dev, "vgamem_mb", 16);
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index dae9119bfe..fab9c80f8d 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2038,6 +2038,7 @@ PCIDevice *pci_nic_init_nofail(NICInfo *nd, PCIBus 
*rootbus,
 
 PCIDevice *pci_vga_init(PCIBus *bus)
 {
+vga_interface_created = true;
 switch (vga_interface_type) {
 case VGA_CIRRUS:
 return pci_create_simple(bus, -1, "cirrus-vga");
diff --git a/hw/sparc/sun4m.c b/hw/sparc/sun4m.c
index 7f3a7c0027..f45e29acc8 100644
--- a/hw/sparc/sun4m.c
+++ b/hw/sparc/sun4m.c
@@ -921,6 +921,7 @@ static void sun4m_hw_init(MachineState *machine)
 /* sbus irq 5 */
 cg3_init(hwdef->tcx_base, slavio_irq[11], 0x00100000,
  graphic_width, graphic_height, graphic_depth);
+vga_interface_created = true;
 } else {
 /* If no display specified, default to TCX */
 if (graphic_depth != 8 && graphic_depth != 24) {
@@ -936,6 +937,7 @@ static void sun4m_hw_init(MachineState *machine)
 
 tcx_init(hwdef->tcx_base, slavio_irq[11], 0x00100000,
  graphic_width, graphic_height, graphic_depth);
+vga_interface_created = true;
 }
 }
 
diff --git a/hw/sparc64/sun4u.c b/hw/sparc64/sun4u.c
index cda7df36e3..75334dba71 100644
--- a/hw/sparc64/sun4u.c
+++ b/hw/sparc64/sun4u.c
@@ -633,6 +633,7 @@ static void sun4uv_init(MemoryRegion *address_space_mem,
 switch (vga_interface_type) {
 case VGA_STD:
 pci_create_simple(pci_busA, PCI_DEVFN(2, 0), "VGA");
+vga_interface_created = true;
 break;
 case VGA_NONE:
 break;
diff --git a/hw/xenpv/xen_machine_pv.c b/hw/xenpv/xen_machine_pv.c
index 8df575a457..20c9611d71 100644
--- a/hw/xenpv/xen_machine_pv.c
+++ b/hw/xenpv/xen_machine_pv.c
@@ -63,6 +63,7 @@ static void xen_init_pv(MachineState *machine)
 if (vga_interface_type == VGA_XENFB) {
 xen_config_dev_vfb(0, "vnc");
 xen_config_dev_vkbd(0);
+vga_interface_created = true;
 }
 
 /* configure disks */
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index b9421e03ff..a558b895e4 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -32,6 +32,7 @@ typedef enum {
 } VGAInterfaceType;
 
 extern int vga_interface_type;
+extern bool vga_interface_created;
 
 extern int graphic_width;
 extern int graphic_height;
diff --git a/softmmu/globals.c b/softmmu/globals.c
index 3ebd718e35..98b64e0492 100644
--- a/softmmu/globals.c
+++ b/softmmu/globals.c
@@ -40,6 +40,7 @@ int nb_nics;
 NICInfo nd_table[MAX_NICS];
 int autostart = 1;
 int vga_interface_type = VGA_NONE;
+bool vga_interface_created;
 Chardev *parallel_hds[MAX_PARALLEL_PORTS];
 int win2k_install_hack;
 int singlestep;
diff --git a/softmmu/vl.c b/softmmu/vl.c
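
(The softmmu/vl.c hunk is cut off above. From the commit message, the added
check presumably looks something like the following sketch, not the exact
hunk; warn_report() is QEMU's standard warning helper:)

    if (!vga_interface_created && !default_vga &&
        vga_interface_type != VGA_NONE) {
        warn_report("A -vga option was passed but this machine "
                    "type does not use that option; "
                    "no VGA device has been created");
    }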

[RFC PATCH v7 23/25] vhost: Make possible to check for device exclusive vq group

2022-04-13 Thread Eugenio Pérez
CVQ needs to be in its own group, not shared with any data vq. Enable
checking for this here, before introducing address space id concepts.

Signed-off-by: Eugenio Pérez 
---
 include/hw/virtio/vhost.h |  2 +
 hw/net/vhost_net.c|  4 +-
 hw/virtio/vhost-vdpa.c| 79 ++-
 hw/virtio/trace-events|  1 +
 4 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 58a73e7b7a..034868fa9e 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -78,6 +78,8 @@ struct vhost_dev {
 int vq_index_end;
 /* if non-zero, minimum required value for max_queues */
 int num_queues;
+/* Must be a vq group different than any other vhost dev */
+bool independent_vq_group;
 uint64_t features;
 uint64_t acked_features;
 uint64_t backend_features;
diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 44a105ec29..10480e19e5 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -343,14 +343,16 @@ int vhost_net_start(VirtIODevice *dev, NetClientState 
*ncs,
 }
 
 for (i = 0; i < nvhosts; i++) {
+bool cvq_idx = i >= data_queue_pairs;
 
-if (i < data_queue_pairs) {
+if (!cvq_idx) {
 peer = qemu_get_peer(ncs, i);
 } else { /* Control Virtqueue */
 peer = qemu_get_peer(ncs, n->max_queue_pairs);
 }
 
 net = get_vhost_net(peer);
+net->dev.independent_vq_group = !!cvq_idx;
 vhost_net_set_vq_index(net, i * 2, index_end);
 
 /* Suppress the masking guest notifiers on vhost user
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 1948c5ca7d..4096555242 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -678,7 +678,8 @@ static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
 {
 uint64_t features;
 uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
-0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
+0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH |
+0x1ULL << VHOST_BACKEND_F_IOTLB_ASID;
 int r;
 
 if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
@@ -1098,6 +1099,78 @@ static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
 return true;
 }
 
+static int vhost_vdpa_get_vring_group(struct vhost_dev *dev,
+  struct vhost_vring_state *state)
+{
+int ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_VRING_GROUP, state);
+trace_vhost_vdpa_get_vring_group(dev, state->index, state->num);
+return ret;
+}
+
+static bool vhost_dev_is_independent_group(struct vhost_dev *dev)
+{
+struct vhost_vdpa *v = dev->opaque;
+struct vhost_vring_state this_vq_group = {
+.index = dev->vq_index,
+};
+int ret;
+
+if (!(dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID))) {
+return true;
+}
+
+if (!v->shadow_vqs_enabled) {
+return true;
+}
+
+ret = vhost_vdpa_get_vring_group(dev, &this_vq_group);
+if (unlikely(ret)) {
+goto call_err;
+}
+
+for (int i = 1; i < dev->nvqs; ++i) {
+struct vhost_vring_state vq_group = {
+.index = dev->vq_index + i,
+};
+
+ret = vhost_vdpa_get_vring_group(dev, &vq_group);
+if (unlikely(ret)) {
+goto call_err;
+}
+if (unlikely(vq_group.num != this_vq_group.num)) {
+error_report("VQ %d group is different than VQ %d one",
+ this_vq_group.index, vq_group.index);
+return false;
+}
+}
+
+for (int i = 0; i < dev->vq_index_end; ++i) {
+struct vhost_vring_state vq_group = {
+.index = i,
+};
+
+if (dev->vq_index <= i && i < dev->vq_index + dev->nvqs) {
+continue;
+}
+
+ret = vhost_vdpa_get_vring_group(dev, &vq_group);
+if (unlikely(ret)) {
+goto call_err;
+}
+if (unlikely(vq_group.num == this_vq_group.num)) {
+error_report("VQ %d group is the same as VQ %d one",
+ this_vq_group.index, vq_group.index);
+return false;
+}
+}
+
+return true;
+
+call_err:
+error_report("Can't read vq group, errno=%d (%s)", ret, g_strerror(-ret));
+return false;
+}
+
 static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
 {
 struct vhost_vdpa *v = dev->opaque;
@@ -1106,6 +1179,10 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, 
bool started)
 
 if (started) {
 vhost_vdpa_host_notifiers_init(dev);
+if (dev->independent_vq_group &&
+!vhost_dev_is_independent_group(dev)) {
+return -1;
+}
 ok = vhost_vdpa_svqs_start(dev);
 if (unlikely(!ok)) {
 return -1;
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 48d9d5..e6fdc03514 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -43,6 +43,7 @@ 

[RFC PATCH v7 24/25] vdpa: Add asid attribute to vdpa device

2022-04-13 Thread Eugenio Pérez
We can configure an ASID per group, but we still use asid 0 for every vdpa
device. Multiple ASID support for CVQ will be introduced in the next
patches.

Signed-off-by: Eugenio Pérez 
---
 include/hw/virtio/vhost.h |  4 ++
 hw/net/vhost_net.c|  5 +++
 hw/virtio/vhost-vdpa.c| 95 ---
 net/vhost-vdpa.c  |  4 +-
 hw/virtio/trace-events|  9 ++--
 5 files changed, 94 insertions(+), 23 deletions(-)

diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 034868fa9e..640cf82168 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -76,8 +76,12 @@ struct vhost_dev {
 int vq_index;
 /* one past the last vq index for the virtio device (not vhost) */
 int vq_index_end;
+/* one past the last vq index of this virtqueue group */
+int vq_group_index_end;
 /* if non-zero, minimum required value for max_queues */
 int num_queues;
+/* address space id */
+uint32_t address_space_id;
 /* Must be a vq group different than any other vhost dev */
 bool independent_vq_group;
 uint64_t features;
diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 10480e19e5..a34df739a7 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -344,15 +344,20 @@ int vhost_net_start(VirtIODevice *dev, NetClientState 
*ncs,
 
 for (i = 0; i < nvhosts; i++) {
 bool cvq_idx = i >= data_queue_pairs;
+uint32_t vq_group_end;
 
 if (!cvq_idx) {
 peer = qemu_get_peer(ncs, i);
+vq_group_end = 2 * data_queue_pairs;
 } else { /* Control Virtqueue */
 peer = qemu_get_peer(ncs, n->max_queue_pairs);
+vq_group_end = 2 * data_queue_pairs + 1;
 }
 
 net = get_vhost_net(peer);
+net->dev.address_space_id = !!cvq_idx;
 net->dev.independent_vq_group = !!cvq_idx;
+net->dev.vq_group_index_end = vq_group_end;
 vhost_net_set_vq_index(net, i * 2, index_end);
 
 /* Suppress the masking guest notifiers on vhost user
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 4096555242..5ed211287c 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -79,6 +79,9 @@ static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr 
iova, hwaddr size,
 int ret = 0;
 
 msg.type = v->msg_type;
+if (v->dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID)) {
+msg.asid = v->dev->address_space_id;
+}
 msg.iotlb.iova = iova;
 msg.iotlb.size = size;
 msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
@@ -90,8 +93,9 @@ static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr 
iova, hwaddr size,
 return 0;
 }
 
-   trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
-msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);
+trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.asid, msg.iotlb.iova,
+ msg.iotlb.size, msg.iotlb.uaddr, msg.iotlb.perm,
+ msg.iotlb.type);
 
 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
 error_report("failed to write, fd=%d, errno=%d (%s)",
@@ -109,6 +113,9 @@ static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, 
hwaddr iova,
 int fd = v->device_fd;
 int ret = 0;
 
+if (v->dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID)) {
+msg.asid = v->dev->address_space_id;
+}
 msg.type = v->msg_type;
 msg.iotlb.iova = iova;
 msg.iotlb.size = size;
@@ -119,7 +126,7 @@ static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, 
hwaddr iova,
 return 0;
 }
 
-trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
+trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.asid, msg.iotlb.iova,
msg.iotlb.size, msg.iotlb.type);
 
 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
@@ -134,6 +141,7 @@ static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, 
hwaddr iova,
 static void vhost_vdpa_listener_commit(MemoryListener *listener)
 {
 struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
+struct vhost_dev *dev = v->dev;
 struct vhost_msg_v2 msg = {};
 int fd = v->device_fd;
 size_t num = v->iotlb_updates->len;
@@ -142,9 +150,14 @@ static void vhost_vdpa_listener_commit(MemoryListener 
*listener)
 return;
 }
 
+if (dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_ASID)) {
+msg.asid = v->dev->address_space_id;
+}
+
 msg.type = v->msg_type;
 msg.iotlb.type = VHOST_IOTLB_BATCH_BEGIN;
-trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
+trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.asid,
+  msg.iotlb.type);
 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
 error_report("failed to write BEGIN_BATCH, fd=%d, errno=%d (%s)",
  fd, errno, strerror(errno));
@@ -162,7 

[RFC PATCH v7 25/25] vdpa: Add x-cvq-svq

2022-04-13 Thread Eugenio Pérez
This isolates the shadow CVQ in its own group.
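
For example, the new flag would be enabled together with x-svq on the
command line (illustrative invocation; the vhostdev path is a placeholder):

    -netdev type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vdpa0,x-svq=on,x-cvq-svq=on \
    -device virtio-net-pci,netdev=vdpa0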

Signed-off-by: Eugenio Pérez 
---
 qapi/net.json|  8 +++-
 net/vhost-vdpa.c | 98 ++--
 2 files changed, 100 insertions(+), 6 deletions(-)

diff --git a/qapi/net.json b/qapi/net.json
index 92848e4362..39c245e6cd 100644
--- a/qapi/net.json
+++ b/qapi/net.json
@@ -447,9 +447,12 @@
 #
 # @x-svq: Start device with (experimental) shadow virtqueue. (Since 7.1)
 # (default: false)
+# @x-cvq-svq: Start device with (experimental) shadow virtqueue in its own
+# virtqueue group. (Since 7.1)
+# (default: false)
 #
 # Features:
-# @unstable: Member @x-svq is experimental.
+# @unstable: Members @x-svq and @x-cvq-svq are experimental.
 #
 # Since: 5.1
 ##
@@ -457,7 +460,8 @@
   'data': {
 '*vhostdev': 'str',
 '*queues':   'int',
-'*x-svq':{'type': 'bool', 'features' : [ 'unstable'] } } }
+'*x-svq':{'type': 'bool', 'features' : [ 'unstable'] },
+'*x-cvq-svq':{'type': 'bool', 'features' : [ 'unstable'] } } }
 
 ##
 # @NetClientDriver:
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index a6f803ea4e..851dacb902 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -377,6 +377,17 @@ static int vhost_vdpa_get_features(int fd, uint64_t 
*features, Error **errp)
 return ret;
 }
 
+static int vhost_vdpa_get_backend_features(int fd, uint64_t *features,
+   Error **errp)
+{
+int ret = ioctl(fd, VHOST_GET_BACKEND_FEATURES, features);
+if (ret) {
+error_setg_errno(errp, errno,
+"Fail to query backend features from vhost-vDPA device");
+}
+return ret;
+}
+
 static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features,
   int *has_cvq, Error **errp)
 {
@@ -410,16 +421,56 @@ static int vhost_vdpa_get_max_queue_pairs(int fd, 
uint64_t features,
 return 1;
 }
 
+/**
+ * Check that the vdpa device can isolate the CVQ group in asid 1
+ *
+ * @vdpa_device_fd: Vdpa device fd
+ * @queue_pairs: Queue pairs
+ * @errp: Error
+ */
+static int vhost_vdpa_check_cvq_svq(int vdpa_device_fd, int queue_pairs,
+Error **errp)
+{
+uint64_t backend_features;
+unsigned num_as;
+int r;
+
+r = vhost_vdpa_get_backend_features(vdpa_device_fd, &backend_features,
+errp);
+if (unlikely(r)) {
+return -1;
+}
+
+if (unlikely(!(backend_features & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID)))) {
+error_setg(errp, "Device without IOTLB_ASID feature");
+return -1;
+}
+
+r = ioctl(vdpa_device_fd, VHOST_VDPA_GET_AS_NUM, &num_as);
+if (unlikely(r)) {
+error_setg_errno(errp, errno,
+ "Cannot retrieve number of supported ASs");
+return -1;
+}
+if (unlikely(num_as < 2)) {
+error_setg(errp, "Insufficient number of ASs (%u, min: 2)", num_as);
+return -1;
+}
+
+return 0;
+}
+
 int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
 NetClientState *peer, Error **errp)
 {
 const NetdevVhostVDPAOptions *opts;
+struct vhost_vdpa_iova_range iova_range;
 uint64_t features;
 int vdpa_device_fd;
 g_autofree NetClientState **ncs = NULL;
 NetClientState *nc;
 int queue_pairs, r, i, has_cvq = 0;
 g_autoptr(VhostIOVATree) iova_tree = NULL;
+ERRP_GUARD();
 
 assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
 opts = &netdev->u.vhost_vdpa;
@@ -444,8 +495,9 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char 
*name,
 qemu_close(vdpa_device_fd);
 return queue_pairs;
 }
-if (opts->x_svq) {
-struct vhost_vdpa_iova_range iova_range;
+if (opts->x_cvq_svq || opts->x_svq) {
+vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
+
 uint64_t invalid_dev_features =
 features & ~vdpa_svq_device_features &
 /* Transport are all accepted at this point */
@@ -457,7 +509,21 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char 
*name,
invalid_dev_features);
 goto err_svq;
 }
-vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
+}
+
+if (opts->x_cvq_svq) {
+if (!has_cvq) {
+error_setg(errp, "Cannot use x-cvq-svq with a device without cvq");
+goto err_svq;
+}
+
+r = vhost_vdpa_check_cvq_svq(vdpa_device_fd, queue_pairs, errp);
+if (unlikely(r)) {
+error_prepend(errp, "Cannot configure CVQ SVQ: ");
+goto err_svq;
+}
+}
+if (opts->x_svq) {
 iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last);
 }
 
@@ -472,11 +538,35 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char 
*name,
 }
 
 if (has_cvq) {
+g_autoptr(VhostIOVATree) cvq_iova_tree = NULL;
+
+if (opts->x_cvq_svq) {
+cvq_iova_tree = 

[RFC PATCH v7 19/25] vhost: Add vhost_svq_inject

2022-04-13 Thread Eugenio Pérez
This allows qemu to inject packets to the device without the guest's
notice.

This will be used to inject net CVQ messages to restore the status in the
destination.

Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-shadow-virtqueue.h |   5 +
 hw/virtio/vhost-shadow-virtqueue.c | 179 +
 2 files changed, 160 insertions(+), 24 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.h 
b/hw/virtio/vhost-shadow-virtqueue.h
index e06ac52158..2a5229e77f 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -17,6 +17,9 @@
 
 typedef struct SVQElement {
 VirtQueueElement elem;
+hwaddr in_iova;
+hwaddr out_iova;
+bool not_from_guest;
 } SVQElement;
 
 typedef void (*VirtQueueElementCallback)(VirtIODevice *vdev,
@@ -106,6 +109,8 @@ typedef struct VhostShadowVirtqueue {
 
 bool vhost_svq_valid_features(uint64_t features, Error **errp);
 
+bool vhost_svq_inject(VhostShadowVirtqueue *svq, const struct iovec *iov,
+  size_t out_num, size_t in_num);
 void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
 void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
 void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
diff --git a/hw/virtio/vhost-shadow-virtqueue.c 
b/hw/virtio/vhost-shadow-virtqueue.c
index 87980e2a9c..f3600df133 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -16,6 +16,7 @@
 #include "qemu/log.h"
 #include "qemu/memalign.h"
 #include "linux-headers/linux/vhost.h"
+#include "qemu/iov.h"
 
 /**
  * Validate the transport device features that both guests can use with the SVQ
@@ -122,7 +123,8 @@ static bool vhost_svq_translate_addr(const 
VhostShadowVirtqueue *svq,
 return true;
 }
 
-static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
+static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq,
+SVQElement *svq_elem, hwaddr *sg,
 const struct iovec *iovec, size_t num,
 bool more_descs, bool write)
 {
@@ -130,15 +132,39 @@ static bool 
vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
 unsigned n;
 uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
 vring_desc_t *descs = svq->vring.desc;
-bool ok;
 
 if (num == 0) {
 return true;
 }
 
-ok = vhost_svq_translate_addr(svq, sg, iovec, num);
-if (unlikely(!ok)) {
-return false;
+if (svq_elem->not_from_guest) {
+DMAMap map = {
+.translated_addr = (hwaddr)iovec->iov_base,
+.size = ROUND_UP(iovec->iov_len, 4096) - 1,
+.perm = write ? IOMMU_RW : IOMMU_RO,
+};
+int r;
+
+if (unlikely(num != 1)) {
+error_report("Unexpected chain of element injected");
+return false;
+}
+r = vhost_iova_tree_map_alloc(svq->iova_tree, &map);
+if (unlikely(r != IOVA_OK)) {
+error_report("Cannot map injected element");
+return false;
+}
+
+r = svq->map_ops->map(map.iova, map.size + 1,
+  (void *)map.translated_addr, !write,
+  svq->map_ops_opaque);
+assert(r == 0);
+sg[0] = map.iova;
+} else {
+bool ok = vhost_svq_translate_addr(svq, sg, iovec, num);
+if (unlikely(!ok)) {
+return false;
+}
 }
 
 for (n = 0; n < num; n++) {
@@ -166,7 +192,8 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, 
SVQElement *svq_elem,
 unsigned avail_idx;
 vring_avail_t *avail = svq->vring.avail;
 bool ok;
-g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num));
+g_autofree hwaddr *sgs = NULL;
+hwaddr *in_sgs, *out_sgs;
 
 *head = svq->free_head;
 
@@ -177,15 +204,23 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue 
*svq, SVQElement *svq_elem,
 return false;
 }
 
-ok = vhost_svq_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
- elem->in_num > 0, false);
+if (!svq_elem->not_from_guest) {
+sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num));
+in_sgs = out_sgs = sgs;
+} else {
+in_sgs = &svq_elem->in_iova;
+out_sgs = &svq_elem->out_iova;
+}
+ok = vhost_svq_vring_write_descs(svq, svq_elem, out_sgs, elem->out_sg,
+ elem->out_num, elem->in_num > 0, false);
 if (unlikely(!ok)) {
 return false;
 }
 
-ok = vhost_svq_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, 
false,
- true);
+ok = vhost_svq_vring_write_descs(svq, svq_elem, in_sgs, elem->in_sg,
+ elem->in_num, false, true);
 if (unlikely(!ok)) {
+/* TODO unwind out_sg */
   

[RFC PATCH v7 20/25] vdpa: add NetClientState->start() callback

2022-04-13 Thread Eugenio Pérez
It allows injecting custom code on successful device start, right before
releasing the lock.

Signed-off-by: Eugenio Pérez 
---
 include/net/net.h  | 2 ++
 hw/net/vhost_net.c | 4 
 2 files changed, 6 insertions(+)

diff --git a/include/net/net.h b/include/net/net.h
index 523136c7ac..2fc3002ab4 100644
--- a/include/net/net.h
+++ b/include/net/net.h
@@ -44,6 +44,7 @@ typedef struct NICConf {
 
 typedef void (NetPoll)(NetClientState *, bool enable);
 typedef bool (NetCanReceive)(NetClientState *);
+typedef void (NetStart)(NetClientState *);
 typedef ssize_t (NetReceive)(NetClientState *, const uint8_t *, size_t);
 typedef ssize_t (NetReceiveIOV)(NetClientState *, const struct iovec *, int);
 typedef void (NetCleanup) (NetClientState *);
@@ -71,6 +72,7 @@ typedef struct NetClientInfo {
 NetReceive *receive_raw;
 NetReceiveIOV *receive_iov;
 NetCanReceive *can_receive;
+NetStart *start;
 NetCleanup *cleanup;
 LinkStatusChanged *link_status_changed;
 QueryRxFilter *query_rx_filter;
diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 30379d2ca4..44a105ec29 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -274,6 +274,10 @@ static int vhost_net_start_one(struct vhost_net *net,
 }
 }
 }
+
+if (net->nc->info->start) {
+net->nc->info->start(net->nc);
+}
 return 0;
 fail:
 file.fd = -1;
-- 
2.27.0




[RFC PATCH v7 17/25] vhost: Add vhost_iova_tree_find

2022-04-13 Thread Eugenio Pérez
Just a simple wrapper so we can find DMAMap entries based on the iova.

Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-iova-tree.h |  2 ++
 hw/virtio/vhost-iova-tree.c | 14 ++
 2 files changed, 16 insertions(+)

diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h
index 2fc825d7b1..bacd17d99c 100644
--- a/hw/virtio/vhost-iova-tree.h
+++ b/hw/virtio/vhost-iova-tree.h
@@ -20,6 +20,8 @@ VhostIOVATree *vhost_iova_tree_acquire(VhostIOVATree 
*iova_tree);
 void vhost_iova_tree_release(VhostIOVATree *iova_tree);
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_release);
 
+const DMAMap *vhost_iova_tree_find(const VhostIOVATree *iova_tree,
+   const DMAMap *map);
 const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree,
 const DMAMap *map);
 int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map);
diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c
index 31445cbdfc..c3d89a85ad 100644
--- a/hw/virtio/vhost-iova-tree.c
+++ b/hw/virtio/vhost-iova-tree.c
@@ -73,6 +73,20 @@ void vhost_iova_tree_release(VhostIOVATree *iova_tree)
 g_free(iova_tree);
 }
 
+/**
+ * Find a mapping in the tree that matches map
+ *
+ * @iova_tree  The iova tree
+ * @mapThe map
+ *
+ * Return a matching map that contains argument map or NULL
+ */
+const DMAMap *vhost_iova_tree_find(const VhostIOVATree *iova_tree,
+   const DMAMap *map)
+{
+return iova_tree_find(iova_tree->iova_taddr_map, map);
+}
+
 /**
  * Find the IOVA address stored from a memory address
  *
-- 
2.27.0




[RFC PATCH v7 18/25] vdpa: Add map/unmap operation callback to SVQ

2022-04-13 Thread Eugenio Pérez
Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-shadow-virtqueue.h | 21 +++--
 hw/virtio/vhost-shadow-virtqueue.c |  8 +++-
 hw/virtio/vhost-vdpa.c | 20 +++-
 3 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.h 
b/hw/virtio/vhost-shadow-virtqueue.h
index 2809dee27b..e06ac52158 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -26,6 +26,15 @@ typedef struct VhostShadowVirtqueueOps {
 VirtQueueElementCallback used_elem_handler;
 } VhostShadowVirtqueueOps;
 
+typedef int (*vhost_svq_map_op)(hwaddr iova, hwaddr size, void *vaddr,
+bool readonly, void *opaque);
+typedef int (*vhost_svq_unmap_op)(hwaddr iova, hwaddr size, void *opaque);
+
+typedef struct VhostShadowVirtqueueMapOps {
+vhost_svq_map_op map;
+vhost_svq_unmap_op unmap;
+} VhostShadowVirtqueueMapOps;
+
 /* Shadow virtqueue to relay notifications */
 typedef struct VhostShadowVirtqueue {
 /* Shadow vring */
@@ -73,6 +82,12 @@ typedef struct VhostShadowVirtqueue {
 /* Optional callbacks */
 const VhostShadowVirtqueueOps *ops;
 
+/* Device memory mapping callbacks */
+const VhostShadowVirtqueueMapOps *map_ops;
+
+/* Device memory mapping callbacks opaque */
+void *map_ops_opaque;
+
 /* Optional custom used virtqueue element handler */
 VirtQueueElementCallback used_elem_cb;
 
@@ -102,8 +117,10 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, 
VirtIODevice *vdev,
  VirtQueue *vq);
 void vhost_svq_stop(VhostShadowVirtqueue *svq);
 
-VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
-const VhostShadowVirtqueueOps *ops);
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_map,
+const VhostShadowVirtqueueOps *ops,
+const VhostShadowVirtqueueMapOps *map_ops,
+void *map_ops_opaque);
 
 void vhost_svq_free(gpointer vq);
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
diff --git a/hw/virtio/vhost-shadow-virtqueue.c 
b/hw/virtio/vhost-shadow-virtqueue.c
index 72a403d90b..87980e2a9c 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -612,13 +612,17 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
  *
  * @iova_tree: Tree to perform descriptors translations
  * @ops: SVQ operations hooks
+ * @map_ops: SVQ mapping operation hooks
+ * @map_ops_opaque: Opaque data to pass to mapping operations
  *
  * Returns the new virtqueue or NULL.
  *
  * In case of error, reason is reported through error_report.
  */
 VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
-const VhostShadowVirtqueueOps *ops)
+const VhostShadowVirtqueueOps *ops,
+const VhostShadowVirtqueueMapOps *map_ops,
+void *map_ops_opaque)
 {
 g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
 int r;
@@ -641,6 +645,8 @@ VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree 
*iova_tree,
 event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
 svq->iova_tree = iova_tree;
 svq->ops = ops;
+svq->map_ops = map_ops;
+svq->map_ops_opaque = map_ops_opaque;
 return g_steal_pointer(&svq);
 
 err_init_hdev_call:
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 9e62f3280d..1948c5ca7d 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -384,6 +384,22 @@ static int vhost_vdpa_get_dev_features(struct vhost_dev 
*dev,
 return ret;
 }
 
+static int vhost_vdpa_svq_map(hwaddr iova, hwaddr size, void *vaddr,
+  bool readonly, void *opaque)
+{
+return vhost_vdpa_dma_map(opaque, iova, size, vaddr, readonly);
+}
+
+static int vhost_vdpa_svq_unmap(hwaddr iova, hwaddr size, void *opaque)
+{
+return vhost_vdpa_dma_unmap(opaque, iova, size);
+}
+
+static const VhostShadowVirtqueueMapOps vhost_vdpa_svq_map_ops = {
+.map = vhost_vdpa_svq_map,
+.unmap = vhost_vdpa_svq_unmap,
+};
+
 static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
Error **errp)
 {
@@ -411,7 +427,9 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, 
struct vhost_vdpa *v,
 shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
 for (unsigned n = 0; n < hdev->nvqs; ++n) {
 g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree,
-v->shadow_vq_ops);
+   v->shadow_vq_ops,
+   &vhost_vdpa_svq_map_ops,
+   v);
 
 if 

[RFC PATCH v7 21/25] vdpa: Add vhost_vdpa_start_control_svq

2022-04-13 Thread Eugenio Pérez
This will send CVQ commands in the destination machine, setting up
everything so there is no guest-visible change.
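
For reference, the injected buffer follows the standard virtio-net CVQ
framing: two out iovecs (header plus command payload) and one in iovec
for the device-written ack, which is what the (svq, data, 2, 1) call
below expresses. A standalone sketch (plain C; constants and the MAC
value are illustrative only):

  #include <stdint.h>
  #include <sys/uio.h>

  struct virtio_net_ctrl_hdr { uint8_t class; uint8_t cmd; };
  typedef uint8_t virtio_net_ctrl_ack;

  int main(void)
  {
      struct virtio_net_ctrl_hdr ctrl = {
          .class = 1,   /* VIRTIO_NET_CTRL_MAC */
          .cmd = 1,     /* VIRTIO_NET_CTRL_MAC_ADDR_SET */
      };
      uint8_t mac[6] = { 0x52, 0x54, 0x00, 0x12, 0x34, 0x56 };
      virtio_net_ctrl_ack ack = 0xff;    /* written back by the device */
      struct iovec data[] = {
          { .iov_base = &ctrl, .iov_len = sizeof(ctrl) },  /* out #1 */
          { .iov_base = mac,   .iov_len = sizeof(mac)  },  /* out #2 */
          { .iov_base = &ack,  .iov_len = sizeof(ack)  },  /* in  #1 */
      };
      (void)data;  /* corresponds to vhost_svq_inject(svq, data, 2, 1) */
      return 0;
  }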

Signed-off-by: Eugenio Pérez 
---
 net/vhost-vdpa.c | 63 
 1 file changed, 63 insertions(+)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 38e6912255..15c3e4f703 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -203,10 +203,73 @@ static ssize_t vhost_vdpa_receive(NetClientState *nc, 
const uint8_t *buf,
 return 0;
 }
 
+static bool vhost_vdpa_start_control_svq(VhostShadowVirtqueue *svq,
+ VirtIODevice *vdev)
+{
+VirtIONet *n = VIRTIO_NET(vdev);
+uint64_t features = vdev->host_features;
+
+if (features & BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR)) {
+const struct virtio_net_ctrl_hdr ctrl = {
+.class = VIRTIO_NET_CTRL_MAC,
+.cmd = VIRTIO_NET_CTRL_MAC_ADDR_SET,
+};
+uint8_t mac[6];
+const struct iovec data[] = {
+{
+.iov_base = (void *)&ctrl,
+.iov_len = sizeof(ctrl),
+},{
+.iov_base = mac,
+.iov_len = sizeof(mac),
+},{
+.iov_base = NULL,
+.iov_len = sizeof(virtio_net_ctrl_ack),
+}
+};
+bool ret;
+
+/* TODO: Only best effort? */
+memcpy(mac, n->mac, sizeof(mac));
+ret = vhost_svq_inject(svq, data, 2, 1);
+if (!ret) {
+return false;
+}
+}
+
+return true;
+}
+
+static void vhost_vdpa_start(NetClientState *nc)
+{
+assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
+VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
+struct vhost_vdpa *v = &s->vhost_vdpa;
+struct vhost_dev *dev = &s->vhost_net->dev;
+VhostShadowVirtqueue *svq;
+
+if (nc->is_datapath) {
+/* This is not the cvq dev */
+return;
+}
+
+if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
+return;
+}
+
+if (!v->shadow_vqs_enabled) {
+return;
+}
+
+svq = g_ptr_array_index(v->shadow_vqs, 0);
+vhost_vdpa_start_control_svq(svq, dev->vdev);
+}
+
 static NetClientInfo net_vhost_vdpa_info = {
 .type = NET_CLIENT_DRIVER_VHOST_VDPA,
 .size = sizeof(VhostVDPAState),
 .receive = vhost_vdpa_receive,
+.start = vhost_vdpa_start,
 .cleanup = vhost_vdpa_cleanup,
 .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
 .has_ufo = vhost_vdpa_has_ufo,
-- 
2.27.0




[RFC PATCH v7 14/25] vhost: Add SVQElement

2022-04-13 Thread Eugenio Pérez
This allows SVQ to add metadata to the different queue elements.
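
For context (not part of this patch): wrapping VirtQueueElement like
this lets later work hang per-element bookkeeping off SVQ's elements
without touching the virtio core. A purely hypothetical illustration of
where this could go:

  /* Illustration only: these fields are NOT part of this patch */
  typedef struct SVQElement {
      VirtQueueElement elem;
      void *out_bounce;   /* e.g. shadow copies of guest out buffers */
      void *in_bounce;    /* e.g. shadow copies of guest in buffers */
  } SVQElement;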

Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-shadow-virtqueue.h |  8 --
 hw/virtio/vhost-shadow-virtqueue.c | 46 --
 2 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.h 
b/hw/virtio/vhost-shadow-virtqueue.h
index c132c994e9..f35d4b8f90 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -15,6 +15,10 @@
 #include "standard-headers/linux/vhost_types.h"
 #include "hw/virtio/vhost-iova-tree.h"
 
+typedef struct SVQElement {
+VirtQueueElement elem;
+} SVQElement;
+
 /* Shadow virtqueue to relay notifications */
 typedef struct VhostShadowVirtqueue {
 /* Shadow vring */
@@ -48,10 +52,10 @@ typedef struct VhostShadowVirtqueue {
 VhostIOVATree *iova_tree;
 
 /* Map for use the guest's descriptors */
-VirtQueueElement **ring_id_maps;
+SVQElement **ring_id_maps;
 
 /* Next VirtQueue element that guest made available */
-VirtQueueElement *next_guest_avail_elem;
+SVQElement *next_guest_avail_elem;
 
 /*
  * Backup next field for each descriptor so we can recover securely, not
diff --git a/hw/virtio/vhost-shadow-virtqueue.c 
b/hw/virtio/vhost-shadow-virtqueue.c
index f874374651..1702365475 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -159,9 +159,10 @@ static bool 
vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
 return true;
 }
 
-static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
-VirtQueueElement *elem, unsigned *head)
+static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, SVQElement 
*svq_elem,
+unsigned *head)
 {
+const VirtQueueElement *elem = &svq_elem->elem;
 unsigned avail_idx;
 vring_avail_t *avail = svq->vring.avail;
 bool ok;
@@ -203,7 +204,7 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
 return true;
 }
 
-static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
+static bool vhost_svq_add(VhostShadowVirtqueue *svq, SVQElement *elem)
 {
 unsigned qemu_head;
 bool ok = vhost_svq_add_split(svq, elem, &qemu_head);
@@ -252,19 +253,21 @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue 
*svq)
 virtio_queue_set_notification(svq->vq, false);
 
 while (true) {
+SVQElement *svq_elem;
 VirtQueueElement *elem;
 bool ok;
 
 if (svq->next_guest_avail_elem) {
-elem = g_steal_pointer(&svq->next_guest_avail_elem);
+svq_elem = g_steal_pointer(&svq->next_guest_avail_elem);
 } else {
-elem = virtqueue_pop(svq->vq, sizeof(*elem));
+svq_elem = virtqueue_pop(svq->vq, sizeof(*svq_elem));
 }
 
-if (!elem) {
+if (!svq_elem) {
 break;
 }
 
+elem = &svq_elem->elem;
 if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) 
{
 /*
  * This condition is possible since a contiguous buffer in GPA
@@ -277,11 +280,11 @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue 
*svq)
  * queue the current guest descriptor and ignore further kicks
  * until some elements are used.
  */
-svq->next_guest_avail_elem = elem;
+svq->next_guest_avail_elem = svq_elem;
 return;
 }
 
-ok = vhost_svq_add(svq, elem);
+ok = vhost_svq_add(svq, svq_elem);
 if (unlikely(!ok)) {
 /* VQ is broken, just return and ignore any other kicks */
 return;
@@ -348,8 +351,7 @@ static uint16_t vhost_svq_last_desc_of_chain(const 
VhostShadowVirtqueue *svq,
 return i;
 }
 
-static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
-   uint32_t *len)
+static SVQElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, uint32_t *len)
 {
 const vring_used_t *used = svq->vring.used;
 vring_used_elem_t used_elem;
@@ -379,8 +381,8 @@ static VirtQueueElement 
*vhost_svq_get_buf(VhostShadowVirtqueue *svq,
 return NULL;
 }
 
-num = svq->ring_id_maps[used_elem.id]->in_num +
-  svq->ring_id_maps[used_elem.id]->out_num;
+num = svq->ring_id_maps[used_elem.id]->elem.in_num +
+  svq->ring_id_maps[used_elem.id]->elem.out_num;
 last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id);
 svq->desc_next[last_used_chain] = svq->free_head;
 svq->free_head = used_elem.id;
@@ -401,11 +403,13 @@ static void vhost_svq_flush(VhostShadowVirtqueue *svq,
 vhost_svq_disable_notification(svq);
 while (true) {
 uint32_t len;
-g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);

[RFC PATCH v7 15/25] vhost: Add custom used buffer callback

2022-04-13 Thread Eugenio Pérez
The callback allows SVQ users to know the VirtQueue requests and
responses. QEMU can use this to synchronize the virtio device model
state, allowing it to be migrated with minimum changes to the
migration code.

In the case of networking, this will be used to inspect control
virtqueue messages.
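
For context, a consumer registers the hook through the new ops struct
passed to vhost_svq_new(); a minimal sketch against this series' API
(the callback body is illustrative):

  static void my_used_cb(VirtIODevice *vdev, const VirtQueueElement *elem)
  {
      /* inspect elem->out_sg / elem->in_sg after the device used them */
  }

  static const VhostShadowVirtqueueOps my_svq_ops = {
      .used_elem_handler = my_used_cb,
  };

  /* later: svq = vhost_svq_new(iova_tree, &my_svq_ops); */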

Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-shadow-virtqueue.h | 16 +++-
 include/hw/virtio/vhost-vdpa.h |  2 ++
 hw/virtio/vhost-shadow-virtqueue.c |  9 -
 hw/virtio/vhost-vdpa.c |  3 ++-
 4 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.h 
b/hw/virtio/vhost-shadow-virtqueue.h
index f35d4b8f90..2809dee27b 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -19,6 +19,13 @@ typedef struct SVQElement {
 VirtQueueElement elem;
 } SVQElement;
 
+typedef void (*VirtQueueElementCallback)(VirtIODevice *vdev,
+ const VirtQueueElement *elem);
+
+typedef struct VhostShadowVirtqueueOps {
+VirtQueueElementCallback used_elem_handler;
+} VhostShadowVirtqueueOps;
+
 /* Shadow virtqueue to relay notifications */
 typedef struct VhostShadowVirtqueue {
 /* Shadow vring */
@@ -63,6 +70,12 @@ typedef struct VhostShadowVirtqueue {
  */
 uint16_t *desc_next;
 
+/* Optional callbacks */
+const VhostShadowVirtqueueOps *ops;
+
+/* Optional custom used virtqueue element handler */
+VirtQueueElementCallback used_elem_cb;
+
 /* Next head to expose to the device */
 uint16_t shadow_avail_idx;
 
@@ -89,7 +102,8 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice 
*vdev,
  VirtQueue *vq);
 void vhost_svq_stop(VhostShadowVirtqueue *svq);
 
-VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree);
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
+const VhostShadowVirtqueueOps *ops);
 
 void vhost_svq_free(gpointer vq);
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index 4961acea8b..8b8834dd24 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -17,6 +17,7 @@
 #include "hw/virtio/vhost-iova-tree.h"
 #include "hw/virtio/virtio.h"
 #include "standard-headers/linux/vhost_types.h"
+#include "hw/virtio/vhost-shadow-virtqueue.h"
 
 typedef struct VhostVDPAHostNotifier {
 MemoryRegion mr;
@@ -35,6 +36,7 @@ typedef struct vhost_vdpa {
 /* IOVA mapping used by the Shadow Virtqueue */
 VhostIOVATree *iova_tree;
 GPtrArray *shadow_vqs;
+const VhostShadowVirtqueueOps *shadow_vq_ops;
 struct vhost_dev *dev;
 VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
 } VhostVDPA;
diff --git a/hw/virtio/vhost-shadow-virtqueue.c 
b/hw/virtio/vhost-shadow-virtqueue.c
index 1702365475..72a403d90b 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -419,6 +419,10 @@ static void vhost_svq_flush(VhostShadowVirtqueue *svq,
 return;
 }
 virtqueue_fill(vq, elem, len, i++);
+
+if (svq->ops && svq->ops->used_elem_handler) {
+svq->ops->used_elem_handler(svq->vdev, elem);
+}
 }
 
 virtqueue_flush(vq, i);
@@ -607,12 +611,14 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
  * shadow methods and file descriptors.
  *
  * @iova_tree: Tree to perform descriptors translations
+ * @ops: SVQ operations hooks
  *
  * Returns the new virtqueue or NULL.
  *
  * In case of error, reason is reported through error_report.
  */
-VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
+const VhostShadowVirtqueueOps *ops)
 {
 g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
 int r;
@@ -634,6 +640,7 @@ VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree 
*iova_tree)
 event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
 event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
 svq->iova_tree = iova_tree;
+svq->ops = ops;
 return g_steal_pointer(&svq);
 
 err_init_hdev_call:
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 6b370c918c..9e62f3280d 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -410,7 +410,8 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, 
struct vhost_vdpa *v,
 
 shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
 for (unsigned n = 0; n < hdev->nvqs; ++n) {
-g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
+g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree,
+v->shadow_vq_ops);
 
 if (unlikely(!svq)) {
 error_setg(errp, "Cannot create svq %u", n);
-- 

[RFC PATCH v7 22/25] vhost: Update kernel headers

2022-04-13 Thread Eugenio Pérez
Signed-off-by: Eugenio Pérez 
---
 include/standard-headers/linux/vhost_types.h | 11 -
 linux-headers/linux/vhost.h  | 25 
 2 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/include/standard-headers/linux/vhost_types.h 
b/include/standard-headers/linux/vhost_types.h
index 0bd2684a2a..ce78551b0f 100644
--- a/include/standard-headers/linux/vhost_types.h
+++ b/include/standard-headers/linux/vhost_types.h
@@ -87,7 +87,7 @@ struct vhost_msg {
 
 struct vhost_msg_v2 {
uint32_t type;
-   uint32_t reserved;
+   uint32_t asid;
union {
struct vhost_iotlb_msg iotlb;
uint8_t padding[64];
@@ -153,4 +153,13 @@ struct vhost_vdpa_iova_range {
 /* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */
 #define VHOST_NET_F_VIRTIO_NET_HDR 27
 
+/* Use message type V2 */
+#define VHOST_BACKEND_F_IOTLB_MSG_V2 0x1
+/* IOTLB can accept batching hints */
+#define VHOST_BACKEND_F_IOTLB_BATCH  0x2
+/* IOTLB can accept address space identifier through V2 type of IOTLB
+ * message
+ */
+#define VHOST_BACKEND_F_IOTLB_ASID  0x3
+
 #endif
diff --git a/linux-headers/linux/vhost.h b/linux-headers/linux/vhost.h
index c998860d7b..5e083490f1 100644
--- a/linux-headers/linux/vhost.h
+++ b/linux-headers/linux/vhost.h
@@ -89,11 +89,6 @@
 
 /* Set or get vhost backend capability */
 
-/* Use message type V2 */
-#define VHOST_BACKEND_F_IOTLB_MSG_V2 0x1
-/* IOTLB can accept batching hints */
-#define VHOST_BACKEND_F_IOTLB_BATCH  0x2
-
 #define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64)
 #define VHOST_GET_BACKEND_FEATURES _IOR(VHOST_VIRTIO, 0x26, __u64)
 
@@ -150,4 +145,24 @@
 /* Get the valid iova range */
 #define VHOST_VDPA_GET_IOVA_RANGE  _IOR(VHOST_VIRTIO, 0x78, \
 struct vhost_vdpa_iova_range)
+/* Get the number of virtqueue groups. */
+#define VHOST_VDPA_GET_GROUP_NUM   _IOR(VHOST_VIRTIO, 0x79, unsigned int)
+
+/* Get the number of address spaces. */
+#define VHOST_VDPA_GET_AS_NUM  _IOR(VHOST_VIRTIO, 0x7A, unsigned int)
+
+/* Get the group for a virtqueue: read index, write group in num,
+ * The virtqueue index is stored in the index field of
+ * vhost_vring_state. The group for this specific virtqueue is
+ * returned via num field of vhost_vring_state.
+ */
+#define VHOST_VDPA_GET_VRING_GROUP _IOWR(VHOST_VIRTIO, 0x7B,   \
+ struct vhost_vring_state)
+/* Set the ASID for a virtqueue group. The group index is stored in
+ * the index field of vhost_vring_state, the ASID associated with this
+ * group is stored at num field of vhost_vring_state.
+ */
+#define VHOST_VDPA_SET_GROUP_ASID  _IOW(VHOST_VIRTIO, 0x7C, \
+struct vhost_vring_state)
+
 #endif
-- 
2.27.0




[RFC PATCH v7 11/25] virtio-net: Expose ctrl virtqueue logic

2022-04-13 Thread Eugenio Pérez
This allows external vhost-net devices to modify the state of the
VirtIO device model once the vhost-vdpa device has acknowledged the
control commands.

Signed-off-by: Eugenio Pérez 
---
 include/hw/virtio/virtio-net.h |  3 ++
 hw/net/virtio-net.c| 83 --
 2 files changed, 51 insertions(+), 35 deletions(-)

diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
index eb87032627..e62f9e227f 100644
--- a/include/hw/virtio/virtio-net.h
+++ b/include/hw/virtio/virtio-net.h
@@ -218,6 +218,9 @@ struct VirtIONet {
 struct EBPFRSSContext ebpf_rss;
 };
 
+unsigned virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
+const struct iovec *in_sg, size_t in_num,
+struct iovec *out_sg, unsigned out_num);
 void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
const char *type);
 
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index e4748a7e6c..5905a9285c 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1419,57 +1419,70 @@ static int virtio_net_handle_mq(VirtIONet *n, uint8_t 
cmd,
 return VIRTIO_NET_OK;
 }
 
-static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
+unsigned virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
+const struct iovec *in_sg, size_t in_num,
+struct iovec *out_sg, unsigned out_num)
 {
 VirtIONet *n = VIRTIO_NET(vdev);
 struct virtio_net_ctrl_hdr ctrl;
 virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
-VirtQueueElement *elem;
 size_t s;
 struct iovec *iov, *iov2;
-unsigned int iov_cnt;
+
+if (iov_size(in_sg, in_num) < sizeof(status) ||
+iov_size(out_sg, out_num) < sizeof(ctrl)) {
+virtio_error(vdev, "virtio-net ctrl missing headers");
+return 0;
+}
+
+iov2 = iov = g_memdup2(out_sg, sizeof(struct iovec) * out_num);
+s = iov_to_buf(iov, out_num, 0, &ctrl, sizeof(ctrl));
+iov_discard_front(&iov, &out_num, sizeof(ctrl));
+if (s != sizeof(ctrl)) {
+status = VIRTIO_NET_ERR;
+} else if (ctrl.class == VIRTIO_NET_CTRL_RX) {
+status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, out_num);
+} else if (ctrl.class == VIRTIO_NET_CTRL_MAC) {
+status = virtio_net_handle_mac(n, ctrl.cmd, iov, out_num);
+} else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) {
+status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, out_num);
+} else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) {
+status = virtio_net_handle_announce(n, ctrl.cmd, iov, out_num);
+} else if (ctrl.class == VIRTIO_NET_CTRL_MQ) {
+status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num);
+} else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
+status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num);
+}
+
+s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status));
+assert(s == sizeof(status));
+
+g_free(iov2);
+return sizeof(status);
+}
+
+static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
+{
+VirtQueueElement *elem;
 
 for (;;) {
+unsigned written;
 elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
 if (!elem) {
 break;
 }
-if (iov_size(elem->in_sg, elem->in_num) < sizeof(status) ||
-iov_size(elem->out_sg, elem->out_num) < sizeof(ctrl)) {
-virtio_error(vdev, "virtio-net ctrl missing headers");
+
+written = virtio_net_handle_ctrl_iov(vdev, elem->in_sg, elem->in_num,
+ elem->out_sg, elem->out_num);
+if (written > 0) {
+virtqueue_push(vq, elem, written);
+virtio_notify(vdev, vq);
+g_free(elem);
+} else {
 virtqueue_detach_element(vq, elem, 0);
 g_free(elem);
 break;
 }
-
-iov_cnt = elem->out_num;
-iov2 = iov = g_memdup2(elem->out_sg,
-   sizeof(struct iovec) * elem->out_num);
-s = iov_to_buf(iov, iov_cnt, 0, &ctrl, sizeof(ctrl));
-iov_discard_front(&iov, &iov_cnt, sizeof(ctrl));
-if (s != sizeof(ctrl)) {
-status = VIRTIO_NET_ERR;
-} else if (ctrl.class == VIRTIO_NET_CTRL_RX) {
-status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, iov_cnt);
-} else if (ctrl.class == VIRTIO_NET_CTRL_MAC) {
-status = virtio_net_handle_mac(n, ctrl.cmd, iov, iov_cnt);
-} else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) {
-status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, iov_cnt);
-} else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) {
-status = virtio_net_handle_announce(n, ctrl.cmd, iov, iov_cnt);
-} else if (ctrl.class == VIRTIO_NET_CTRL_MQ) {
-status = virtio_net_handle_mq(n, ctrl.cmd, iov, iov_cnt);
-} else if 

[RFC PATCH v7 16/25] vdpa: control virtqueue support on shadow virtqueue

2022-04-13 Thread Eugenio Pérez
Introduce the control virtqueue support for vDPA shadow virtqueue. This
is needed for advanced networking features like multiqueue.

To demonstrate command handling, VIRTIO_NET_F_CTRL_MAC_ADDR and
VIRTIO_NET_CTRL_MQ are implemented. If the vDPA device is started with
SVQ support and the virtio-net driver changes the MAC or the number of
queues, the virtio-net device model will be updated with the new
values.

Other cvq commands could be added here straightforwardly but they have
not been tested.
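
The feature filtering below can be read as: anything the device offers
that is neither in the SVQ allowlist nor a transport feature is
invalid. A standalone sketch of that mask arithmetic (plain C; the
transport range values are assumed from the virtio headers, and the
feature words are illustrative):

  #include <stdint.h>
  #include <stdio.h>

  #define MAKE_64BIT_MASK(shift, length) \
      (((~0ULL) >> (64 - (length))) << (shift))
  #define VIRTIO_TRANSPORT_F_START 28
  #define VIRTIO_TRANSPORT_F_END   38

  int main(void)
  {
      uint64_t features = 0x1000005000ULL; /* device-offered features */
      uint64_t allowed  = 0x5000ULL;       /* stand-in for the allowlist */
      uint64_t transport = MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
                                           VIRTIO_TRANSPORT_F_END -
                                           VIRTIO_TRANSPORT_F_START);
      uint64_t invalid = features & ~allowed & ~transport;
      printf("invalid: 0x%llx\n", (unsigned long long)invalid);
      return 0;
  }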

Signed-off-by: Eugenio Pérez 
---
 net/vhost-vdpa.c | 80 ++--
 1 file changed, 77 insertions(+), 3 deletions(-)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index a8dde49198..38e6912255 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -11,6 +11,7 @@
 
 #include "qemu/osdep.h"
 #include "clients.h"
+#include "hw/virtio/virtio-net.h"
 #include "net/vhost_net.h"
 #include "net/vhost-vdpa.h"
 #include "hw/virtio/vhost-vdpa.h"
@@ -69,6 +70,30 @@ const int vdpa_feature_bits[] = {
 VHOST_INVALID_FEATURE_BIT
 };
 
+/** Supported device specific feature bits with SVQ */
+static const uint64_t vdpa_svq_device_features =
+BIT_ULL(VIRTIO_NET_F_CSUM) |
+BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
+BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) |
+BIT_ULL(VIRTIO_NET_F_MTU) |
+BIT_ULL(VIRTIO_NET_F_MAC) |
+BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) |
+BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |
+BIT_ULL(VIRTIO_NET_F_GUEST_ECN) |
+BIT_ULL(VIRTIO_NET_F_GUEST_UFO) |
+BIT_ULL(VIRTIO_NET_F_HOST_TSO4) |
+BIT_ULL(VIRTIO_NET_F_HOST_TSO6) |
+BIT_ULL(VIRTIO_NET_F_HOST_ECN) |
+BIT_ULL(VIRTIO_NET_F_HOST_UFO) |
+BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
+BIT_ULL(VIRTIO_NET_F_STATUS) |
+BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
+BIT_ULL(VIRTIO_NET_F_MQ) |
+BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
+BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
+BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
+BIT_ULL(VIRTIO_NET_F_STANDBY);
+
 VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
 {
 VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
@@ -196,6 +221,46 @@ static int vhost_vdpa_get_iova_range(int fd,
 return ret < 0 ? -errno : 0;
 }
 
+static void vhost_vdpa_net_handle_ctrl(VirtIODevice *vdev,
+   const VirtQueueElement *elem)
+{
+struct virtio_net_ctrl_hdr ctrl;
+virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
+size_t s;
+struct iovec in = {
+.iov_base = &status,
+.iov_len = sizeof(status),
+};
+
+s = iov_to_buf(elem->out_sg, elem->out_num, 0, &ctrl, sizeof(ctrl.class));
+if (s != sizeof(ctrl.class)) {
+return;
+}
+
+switch (ctrl.class) {
+case VIRTIO_NET_CTRL_MAC_ADDR_SET:
+case VIRTIO_NET_CTRL_MQ:
+break;
+default:
+return;
+};
+
+s = iov_to_buf(elem->in_sg, elem->in_num, 0, &status, sizeof(status));
+if (s != sizeof(status) || status != VIRTIO_NET_OK) {
+return;
+}
+
+status = VIRTIO_NET_ERR;
+virtio_net_handle_ctrl_iov(vdev, &in, 1, elem->out_sg, elem->out_num);
+if (status != VIRTIO_NET_OK) {
+error_report("Bad CVQ processing in model");
+}
+}
+
+static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
+.used_elem_handler = vhost_vdpa_net_handle_ctrl,
+};
+
 static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
const char *device,
const char *name,
@@ -225,6 +290,9 @@ static NetClientState *net_vhost_vdpa_init(NetClientState 
*peer,
 s->vhost_vdpa.shadow_vqs_enabled = svq;
 s->vhost_vdpa.iova_tree = iova_tree ? vhost_iova_tree_acquire(iova_tree) :
   NULL;
+if (!is_datapath) {
+s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
+}
 ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
 if (ret) {
 if (iova_tree) {
@@ -315,9 +383,15 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char 
*name,
 }
 if (opts->x_svq) {
 struct vhost_vdpa_iova_range iova_range;
-
-if (has_cvq) {
-error_setg(errp, "vdpa svq does not work with cvq");
+uint64_t invalid_dev_features =
+features & ~vdpa_svq_device_features &
+/* Transport are all accepted at this point */
+~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
+ VIRTIO_TRANSPORT_F_END - 
VIRTIO_TRANSPORT_F_START);
+
+if (invalid_dev_features) {
+error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
+   invalid_dev_features);
 goto err_svq;
 }
 vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
-- 
2.27.0




[RFC PATCH v7 03/25] vdpa: Fix bad index calculus at vhost_vdpa_get_vring_base

2022-04-13 Thread Eugenio Pérez
Fixes: 6d0b222666 ("vdpa: Adapt vhost_vdpa_get_vring_base to SVQ")

Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-vdpa.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 9e5fe15d03..1f229ff4cb 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -1172,11 +1172,11 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev 
*dev,
struct vhost_vring_state *ring)
 {
 struct vhost_vdpa *v = dev->opaque;
+int vdpa_idx = ring->index - dev->vq_index;
 int ret;
 
 if (v->shadow_vqs_enabled) {
-VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
-  ring->index);
+VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
 
 /*
  * Setting base as last used idx, so destination will see as available
-- 
2.27.0




[RFC PATCH v7 12/25] vdpa: Extract get features part from vhost_vdpa_get_max_queue_pairs

2022-04-13 Thread Eugenio Pérez
Knowing the device features is also needed for CVQ SVQ. Extract this
logic from vhost_vdpa_get_max_queue_pairs so we can reuse it.

Report errno in case of failure getting them while we're at it.

Signed-off-by: Eugenio Pérez 
---
 net/vhost-vdpa.c | 30 --
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 9261101af2..a8dde49198 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -236,20 +236,24 @@ static NetClientState *net_vhost_vdpa_init(NetClientState 
*peer,
 return nc;
 }
 
-static int vhost_vdpa_get_max_queue_pairs(int fd, int *has_cvq, Error **errp)
+static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
+{
+int ret = ioctl(fd, VHOST_GET_FEATURES, features);
+if (ret) {
+error_setg_errno(errp, errno,
+ "Fail to query features from vhost-vDPA device");
+}
+return ret;
+}
+
+static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features,
+  int *has_cvq, Error **errp)
 {
 unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
 g_autofree struct vhost_vdpa_config *config = NULL;
 __virtio16 *max_queue_pairs;
-uint64_t features;
 int ret;
 
-ret = ioctl(fd, VHOST_GET_FEATURES, &features);
-if (ret) {
-error_setg(errp, "Fail to query features from vhost-vDPA device");
-return ret;
-}
-
 if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) {
 *has_cvq = 1;
 } else {
@@ -279,10 +283,11 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char 
*name,
 NetClientState *peer, Error **errp)
 {
 const NetdevVhostVDPAOptions *opts;
+uint64_t features;
 int vdpa_device_fd;
 g_autofree NetClientState **ncs = NULL;
 NetClientState *nc;
-int queue_pairs, i, has_cvq = 0;
+int queue_pairs, r, i, has_cvq = 0;
 g_autoptr(VhostIOVATree) iova_tree = NULL;
 
 assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
@@ -297,7 +302,12 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char 
*name,
 return -errno;
 }
 
-queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd,
+r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp);
+if (r) {
+return r;
+}
+
+queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features,
  &has_cvq, errp);
 if (queue_pairs < 0) {
 qemu_close(vdpa_device_fd);
-- 
2.27.0




[RFC PATCH v7 13/25] virtio: Make virtqueue_alloc_element non-static

2022-04-13 Thread Eugenio Pérez
So SVQ can allocate elements using it.

Signed-off-by: Eugenio Pérez 
---
 include/hw/virtio/virtio.h | 1 +
 hw/virtio/virtio.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index b31c4507f5..1e85833897 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -195,6 +195,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement 
*elem,
 unsigned int len, unsigned int idx);
 
 void virtqueue_map(VirtIODevice *vdev, VirtQueueElement *elem);
+void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_num);
 void *virtqueue_pop(VirtQueue *vq, size_t sz);
 unsigned int virtqueue_drop_all(VirtQueue *vq);
 void *qemu_get_virtqueue_element(VirtIODevice *vdev, QEMUFile *f, size_t sz);
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 9d637e043e..17cbbb5fca 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1376,7 +1376,7 @@ void virtqueue_map(VirtIODevice *vdev, VirtQueueElement 
*elem)
 false);
 }
 
-static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned 
in_num)
+void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_num)
 {
 VirtQueueElement *elem;
 size_t in_addr_ofs = QEMU_ALIGN_UP(sz, __alignof__(elem->in_addr[0]));
-- 
2.27.0




[RFC PATCH v7 07/25] vhost: Add reference counting to vhost_iova_tree

2022-04-13 Thread Eugenio Pérez
Now that different vqs can have different ASIDs, it's easier to track
them using reference counters.

QEMU's required glib version does not provide them yet, so we've copied
g_rc_box; the implementation can be converted to glib's one when the
minimum version is raised.
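
The pattern is the usual manual acquire/release; a standalone sketch of
the semantics being copied (plain C mirroring g_rc_box, not QEMU code):

  #include <stdlib.h>

  typedef struct { size_t refcnt; /* ...payload... */ } Obj;

  static Obj *obj_new(void)
  {
      Obj *o = calloc(1, sizeof(*o));
      o->refcnt = 1;
      return o;
  }
  static Obj *obj_acquire(Obj *o) { ++o->refcnt; return o; }
  static void obj_release(Obj *o) { if (--o->refcnt == 0) { free(o); } }

  int main(void)
  {
      Obj *tree = obj_new();         /* first user, e.g. the data vqs */
      Obj *cvq = obj_acquire(tree);  /* second user, e.g. the cvq */
      obj_release(cvq);              /* refcnt 2 -> 1, still alive */
      obj_release(tree);             /* refcnt 1 -> 0, freed here */
      return 0;
  }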

Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-iova-tree.h |  5 +++--
 hw/virtio/vhost-iova-tree.c | 21 +++--
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h
index 6a4f24e0f9..2fc825d7b1 100644
--- a/hw/virtio/vhost-iova-tree.h
+++ b/hw/virtio/vhost-iova-tree.h
@@ -16,8 +16,9 @@
 typedef struct VhostIOVATree VhostIOVATree;
 
 VhostIOVATree *vhost_iova_tree_new(uint64_t iova_first, uint64_t iova_last);
-void vhost_iova_tree_delete(VhostIOVATree *iova_tree);
-G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete);
+VhostIOVATree *vhost_iova_tree_acquire(VhostIOVATree *iova_tree);
+void vhost_iova_tree_release(VhostIOVATree *iova_tree);
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_release);
 
 const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree,
 const DMAMap *map);
diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c
index 55fed1fefb..31445cbdfc 100644
--- a/hw/virtio/vhost-iova-tree.c
+++ b/hw/virtio/vhost-iova-tree.c
@@ -28,6 +28,9 @@ struct VhostIOVATree {
 
 /* IOVA address to qemu memory maps. */
 IOVATree *iova_taddr_map;
+
+/* Reference count */
+size_t refcnt;
 };
 
 /**
@@ -44,14 +47,28 @@ VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, 
hwaddr iova_last)
 tree->iova_last = iova_last;
 
 tree->iova_taddr_map = iova_tree_new();
+tree->refcnt = 1;
 return tree;
 }
 
 /**
- * Delete an iova tree
+ * Increases the reference count of the iova tree
+ */
+VhostIOVATree *vhost_iova_tree_acquire(VhostIOVATree *iova_tree)
+{
+++iova_tree->refcnt;
+return iova_tree;
+}
+
+/**
+ * Decrease reference counter of iova tree, freeing if it reaches 0
  */
-void vhost_iova_tree_delete(VhostIOVATree *iova_tree)
+void vhost_iova_tree_release(VhostIOVATree *iova_tree)
 {
+if (--iova_tree->refcnt) {
+return;
+}
+
 iova_tree_destroy(iova_tree->iova_taddr_map);
 g_free(iova_tree);
 }
-- 
2.27.0




[RFC PATCH v7 10/25] vdpa: Fix index calculus at vhost_vdpa_svqs_start

2022-04-13 Thread Eugenio Pérez
Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-vdpa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 27ee678dc9..6b370c918c 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -1019,7 +1019,7 @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
 VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
 struct vhost_vring_addr addr = {
-.index = i,
+.index = dev->vq_index + i,
 };
 int r;
 bool ok = vhost_vdpa_svq_setup(dev, svq, i, &addr);
-- 
2.27.0




[RFC PATCH v7 08/25] vdpa: Add x-svq to NetdevVhostVDPAOptions

2022-04-13 Thread Eugenio Pérez
Finally offering the possibility to enable SVQ from the command line.
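
For example, a sketch of the resulting cmdline usage (the device node
path is illustrative):

  -netdev type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vhost-vdpa0,x-svq=on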

Signed-off-by: Eugenio Pérez 
---
 qapi/net.json|  9 -
 net/vhost-vdpa.c | 48 
 2 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/qapi/net.json b/qapi/net.json
index b92f3f5fb4..92848e4362 100644
--- a/qapi/net.json
+++ b/qapi/net.json
@@ -445,12 +445,19 @@
 # @queues: number of queues to be created for multiqueue vhost-vdpa
 #  (default: 1)
 #
+# @x-svq: Start device with (experimental) shadow virtqueue. (Since 7.1)
+# (default: false)
+#
+# Features:
+# @unstable: Member @x-svq is experimental.
+#
 # Since: 5.1
 ##
 { 'struct': 'NetdevVhostVDPAOptions',
   'data': {
 '*vhostdev': 'str',
-'*queues':   'int' } }
+'*queues':   'int',
+'*x-svq':{'type': 'bool', 'features' : [ 'unstable'] } } }
 
 ##
 # @NetClientDriver:
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 1e9fe47c03..9261101af2 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -128,6 +128,7 @@ static void vhost_vdpa_cleanup(NetClientState *nc)
 {
 VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
 
+g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_release);
 if (s->vhost_net) {
 vhost_net_cleanup(s->vhost_net);
 g_free(s->vhost_net);
@@ -187,13 +188,23 @@ static NetClientInfo net_vhost_vdpa_info = {
 .check_peer_type = vhost_vdpa_check_peer_type,
 };
 
+static int vhost_vdpa_get_iova_range(int fd,
+ struct vhost_vdpa_iova_range *iova_range)
+{
+int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range);
+
+return ret < 0 ? -errno : 0;
+}
+
 static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
-   const char *device,
-   const char *name,
-   int vdpa_device_fd,
-   int queue_pair_index,
-   int nvqs,
-   bool is_datapath)
+   const char *device,
+   const char *name,
+   int vdpa_device_fd,
+   int queue_pair_index,
+   int nvqs,
+   bool is_datapath,
+   bool svq,
+   VhostIOVATree *iova_tree)
 {
 NetClientState *nc = NULL;
 VhostVDPAState *s;
@@ -211,8 +222,14 @@ static NetClientState *net_vhost_vdpa_init(NetClientState 
*peer,
 
 s->vhost_vdpa.device_fd = vdpa_device_fd;
 s->vhost_vdpa.index = queue_pair_index;
+s->vhost_vdpa.shadow_vqs_enabled = svq;
+s->vhost_vdpa.iova_tree = iova_tree ? vhost_iova_tree_acquire(iova_tree) :
+  NULL;
 ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
 if (ret) {
+if (iova_tree) {
+vhost_iova_tree_release(iova_tree);
+}
 qemu_del_net_client(nc);
 return NULL;
 }
@@ -266,6 +283,7 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char 
*name,
 g_autofree NetClientState **ncs = NULL;
 NetClientState *nc;
 int queue_pairs, i, has_cvq = 0;
+g_autoptr(VhostIOVATree) iova_tree = NULL;
 
 assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
 opts = &netdev->u.vhost_vdpa;
@@ -285,19 +303,31 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char 
*name,
 qemu_close(vdpa_device_fd);
 return queue_pairs;
 }
+if (opts->x_svq) {
+struct vhost_vdpa_iova_range iova_range;
+
+if (has_cvq) {
+error_setg(errp, "vdpa svq does not work with cvq");
+goto err_svq;
+}
+vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
+iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last);
+}
 
 ncs = g_malloc0(sizeof(*ncs) * queue_pairs);
 
 for (i = 0; i < queue_pairs; i++) {
 ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
- vdpa_device_fd, i, 2, true);
+ vdpa_device_fd, i, 2, true, opts->x_svq,
+ iova_tree);
 if (!ncs[i])
 goto err;
 }
 
 if (has_cvq) {
 nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
- vdpa_device_fd, i, 1, false);
+ vdpa_device_fd, i, 1, false, opts->x_svq,
+ iova_tree);
 if (!nc)
 goto err;
 }
@@ -308,6 +338,8 @@ err:
 if (i) {
 qemu_del_net_client(ncs[0]);
 }
+
+err_svq:
 qemu_close(vdpa_device_fd);
 
 return -1;
-- 
2.27.0




[RFC PATCH v7 09/25] vhost: move descriptor translation to vhost_svq_vring_write_descs

2022-04-13 Thread Eugenio Pérez
It's done for both in and out descriptors so it's better placed here.

Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-shadow-virtqueue.c | 26 +++---
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c 
b/hw/virtio/vhost-shadow-virtqueue.c
index a2531d5874..f874374651 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -122,17 +122,23 @@ static bool vhost_svq_translate_addr(const 
VhostShadowVirtqueue *svq,
 return true;
 }
 
-static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
-const struct iovec *iovec, size_t num,
-bool more_descs, bool write)
+static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
+const struct iovec *iovec, size_t num,
+bool more_descs, bool write)
 {
 uint16_t i = svq->free_head, last = svq->free_head;
 unsigned n;
 uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
 vring_desc_t *descs = svq->vring.desc;
+bool ok;
 
 if (num == 0) {
-return;
+return true;
+}
+
+ok = vhost_svq_translate_addr(svq, sg, iovec, num);
+if (unlikely(!ok)) {
+return false;
 }
 
 for (n = 0; n < num; n++) {
@@ -150,6 +156,7 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue 
*svq, hwaddr *sg,
 }
 
 svq->free_head = le16_to_cpu(svq->desc_next[last]);
+return true;
 }
 
 static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
@@ -169,21 +176,18 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
 return false;
 }
 
-ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
+ok = vhost_svq_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
+ elem->in_num > 0, false);
 if (unlikely(!ok)) {
 return false;
 }
-vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
-elem->in_num > 0, false);
 
-
-ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
+ok = vhost_svq_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, 
false,
+ true);
 if (unlikely(!ok)) {
 return false;
 }
 
-vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);
-
 /*
  * Put the entry in the available array (but don't update avail->idx until
  * they do sync).
-- 
2.27.0




[RFC PATCH v7 02/25] vdpa: Add missing tracing to batch mapping functions

2022-04-13 Thread Eugenio Pérez
These functions were not traced properly.
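
With this, batch begin and commit show up next to the existing
dma_map/dma_unmap events; they can be enabled with something like
(sketch):

  qemu-system-x86_64 ... -trace 'vhost_vdpa_listener_*'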

Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-vdpa.c | 2 ++
 hw/virtio/trace-events | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 8adf7c0b92..9e5fe15d03 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -129,6 +129,7 @@ static void vhost_vdpa_listener_begin_batch(struct 
vhost_vdpa *v)
 .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
 };
 
+trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
 error_report("failed to write, fd=%d, errno=%d (%s)",
  fd, errno, strerror(errno));
@@ -163,6 +164,7 @@ static void vhost_vdpa_listener_commit(MemoryListener 
*listener)
 msg.type = v->msg_type;
 msg.iotlb.type = VHOST_IOTLB_BATCH_END;
 
+trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
 error_report("failed to write, fd=%d, errno=%d (%s)",
  fd, errno, strerror(errno));
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index a5102eac9e..48d9d5 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -25,6 +25,8 @@ vhost_user_postcopy_waker_nomatch(const char *rb, uint64_t 
rb_offset) "%s + 0x%"
 # vhost-vdpa.c
 vhost_vdpa_dma_map(void *vdpa, int fd, uint32_t msg_type, uint64_t iova, 
uint64_t size, uint64_t uaddr, uint8_t perm, uint8_t type) "vdpa:%p fd: %d 
msg_type: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" uaddr: 0x%"PRIx64" 
perm: 0x%"PRIx8" type: %"PRIu8
 vhost_vdpa_dma_unmap(void *vdpa, int fd, uint32_t msg_type, uint64_t iova, 
uint64_t size, uint8_t type) "vdpa:%p fd: %d msg_type: %"PRIu32" iova: 
0x%"PRIx64" size: 0x%"PRIx64" type: %"PRIu8
+vhost_vdpa_listener_begin_batch(void *v, int fd, uint32_t msg_type, uint8_t 
type)  "vdpa:%p fd: %d msg_type: %"PRIu32" type: %"PRIu8
+vhost_vdpa_listener_commit(void *v, int fd, uint32_t msg_type, uint8_t type)  
"vdpa:%p fd: %d msg_type: %"PRIu32" type: %"PRIu8
 vhost_vdpa_listener_region_add(void *vdpa, uint64_t iova, uint64_t llend, void 
*vaddr, bool readonly) "vdpa: %p iova 0x%"PRIx64" llend 0x%"PRIx64" vaddr: %p 
read-only: %d"
 vhost_vdpa_listener_region_del(void *vdpa, uint64_t iova, uint64_t llend) 
"vdpa: %p iova 0x%"PRIx64" llend 0x%"PRIx64
 vhost_vdpa_add_status(void *dev, uint8_t status) "dev: %p status: 0x%"PRIx8
-- 
2.27.0




[RFC PATCH v7 06/25] vdpa: Send all updates in memory listener commit

2022-04-13 Thread Eugenio Pérez
With the introduction of multiple ASIDs it can happen that many changes
on different listeners come before the commit call. Since kernel
vhost-vdpa still does not support this, send them all in one shot.

This also has one extra advantage: if there is no update to notify, we
save the iotlb_{begin,end} calls.
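
The write() sequence on commit then becomes (sketch):

  VHOST_IOTLB_BATCH_BEGIN
  VHOST_IOTLB_UPDATE / VHOST_IOTLB_INVALIDATE  (one vhost_msg_v2 per
                                                queued change)
  VHOST_IOTLB_BATCH_END

and no write at all if nothing was queued.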

Signed-off-by: Eugenio Pérez 
---
 include/hw/virtio/vhost-vdpa.h |  2 +-
 hw/virtio/vhost-vdpa.c | 69 +-
 2 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index a29dbb3f53..4961acea8b 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -27,7 +27,7 @@ typedef struct vhost_vdpa {
 int device_fd;
 int index;
 uint32_t msg_type;
-bool iotlb_batch_begin_sent;
+GArray *iotlb_updates;
 MemoryListener listener;
 struct vhost_vdpa_iova_range iova_range;
 uint64_t acked_features;
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 1f229ff4cb..27ee678dc9 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -85,6 +85,11 @@ static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr 
iova, hwaddr size,
 msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
 msg.iotlb.type = VHOST_IOTLB_UPDATE;
 
+if (v->dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_IOTLB_BATCH)) {
+g_array_append_val(v->iotlb_updates, msg);
+return 0;
+}
+
trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
 msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);
 
@@ -109,6 +114,11 @@ static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, 
hwaddr iova,
 msg.iotlb.size = size;
 msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
 
+if (v->dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_IOTLB_BATCH)) {
+g_array_append_val(v->iotlb_updates, msg);
+return 0;
+}
+
 trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
msg.iotlb.size, msg.iotlb.type);
 
@@ -121,56 +131,47 @@ static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, 
hwaddr iova,
 return ret;
 }
 
-static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
-{
-int fd = v->device_fd;
-struct vhost_msg_v2 msg = {
-.type = v->msg_type,
-.iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
-};
-
-trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
-if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
-error_report("failed to write, fd=%d, errno=%d (%s)",
- fd, errno, strerror(errno));
-}
-}
-
-static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
-{
-if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
-!v->iotlb_batch_begin_sent) {
-vhost_vdpa_listener_begin_batch(v);
-}
-
-v->iotlb_batch_begin_sent = true;
-}
-
 static void vhost_vdpa_listener_commit(MemoryListener *listener)
 {
 struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
-struct vhost_dev *dev = v->dev;
 struct vhost_msg_v2 msg = {};
 int fd = v->device_fd;
+size_t num = v->iotlb_updates->len;
 
-if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
+if (!num) {
 return;
 }
 
-if (!v->iotlb_batch_begin_sent) {
-return;
+msg.type = v->msg_type;
+msg.iotlb.type = VHOST_IOTLB_BATCH_BEGIN;
+trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
+if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
+error_report("failed to write BEGIN_BATCH, fd=%d, errno=%d (%s)",
+ fd, errno, strerror(errno));
+goto done;
 }
 
-msg.type = v->msg_type;
-msg.iotlb.type = VHOST_IOTLB_BATCH_END;
+for (size_t i = 0; i < num; ++i) {
+struct vhost_msg_v2 *update = &g_array_index(v->iotlb_updates,
+ struct vhost_msg_v2, i);
+if (write(fd, update, sizeof(*update)) != sizeof(*update)) {
+error_report("failed to write dma update, fd=%d, errno=%d (%s)",
+ fd, errno, strerror(errno));
+goto done;
+}
+}
 
+msg.iotlb.type = VHOST_IOTLB_BATCH_END;
 trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
 error_report("failed to write, fd=%d, errno=%d (%s)",
  fd, errno, strerror(errno));
 }
 
-v->iotlb_batch_begin_sent = false;
+done:
+g_array_set_size(v->iotlb_updates, 0);
+return;
+
 }
 
 static void vhost_vdpa_listener_region_add(MemoryListener *listener,
@@ -227,7 +228,6 @@ static void vhost_vdpa_listener_region_add(MemoryListener 
*listener,
 iova = mem_region.iova;
 }
 
-vhost_vdpa_iotlb_batch_begin_once(v);
 ret = vhost_vdpa_dma_map(v, iova, 

[RFC PATCH v7 05/25] hw/virtio: Replace g_memdup() by g_memdup2()

2022-04-13 Thread Eugenio Pérez
From: Philippe Mathieu-Daudé 

Per 
https://discourse.gnome.org/t/port-your-module-from-g-memdup-to-g-memdup2-now/5538

  The old API took the size of the memory to duplicate as a guint,
  whereas most memory functions take memory sizes as a gsize. This
  made it easy to accidentally pass a gsize to g_memdup(). For large
  values, that would lead to a silent truncation of the size from 64
  to 32 bits, and result in a heap area being returned which is
  significantly smaller than what the caller expects. This can likely
  be exploited in various modules to cause a heap buffer overflow.

Replace g_memdup() by the safer g_memdup2() wrapper.
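
A standalone sketch of the hazard on a 64-bit build (plain C with
glib; values illustrative):

  #include <glib.h>
  #include <stdio.h>

  int main(void)
  {
      gsize n = ((gsize)1 << 32) + 16;  /* caller wants 4 GiB + 16 bytes */
      guint truncated = (guint)n;       /* what g_memdup() would receive */
      printf("requested %" G_GSIZE_FORMAT ", old API copies %u\n",
             n, truncated);             /* prints "... copies 16" */
      return 0;
  }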

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/net/virtio-net.c   | 3 ++-
 hw/virtio/virtio-crypto.c | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 1067e72b39..e4748a7e6c 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1443,7 +1443,8 @@ static void virtio_net_handle_ctrl(VirtIODevice *vdev, 
VirtQueue *vq)
 }
 
 iov_cnt = elem->out_num;
-iov2 = iov = g_memdup(elem->out_sg, sizeof(struct iovec) * 
elem->out_num);
+iov2 = iov = g_memdup2(elem->out_sg,
+   sizeof(struct iovec) * elem->out_num);
 s = iov_to_buf(iov, iov_cnt, 0, &ctrl, sizeof(ctrl));
 iov_discard_front(&iov, &iov_cnt, sizeof(ctrl));
 if (s != sizeof(ctrl)) {
diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
index dcd80b904d..0e31e3cc04 100644
--- a/hw/virtio/virtio-crypto.c
+++ b/hw/virtio/virtio-crypto.c
@@ -242,7 +242,7 @@ static void virtio_crypto_handle_ctrl(VirtIODevice *vdev, 
VirtQueue *vq)
 }
 
 out_num = elem->out_num;
-out_iov_copy = g_memdup(elem->out_sg, sizeof(out_iov[0]) * out_num);
+out_iov_copy = g_memdup2(elem->out_sg, sizeof(out_iov[0]) * out_num);
 out_iov = out_iov_copy;
 
 in_num = elem->in_num;
@@ -605,11 +605,11 @@ virtio_crypto_handle_request(VirtIOCryptoReq *request)
 }
 
 out_num = elem->out_num;
-out_iov_copy = g_memdup(elem->out_sg, sizeof(out_iov[0]) * out_num);
+out_iov_copy = g_memdup2(elem->out_sg, sizeof(out_iov[0]) * out_num);
 out_iov = out_iov_copy;
 
 in_num = elem->in_num;
-in_iov_copy = g_memdup(elem->in_sg, sizeof(in_iov[0]) * in_num);
+in_iov_copy = g_memdup2(elem->in_sg, sizeof(in_iov[0]) * in_num);
 in_iov = in_iov_copy;
 
 if (unlikely(iov_to_buf(out_iov, out_num, 0, &req, sizeof(req))
-- 
2.27.0




[RFC PATCH v7 04/25] util: Return void on iova_tree_remove

2022-04-13 Thread Eugenio Pérez
It always returns IOVA_OK so nobody uses it.

Signed-off-by: Eugenio Pérez 
---
 include/qemu/iova-tree.h | 4 +---
 util/iova-tree.c | 4 +---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
index c938fb0793..16bbfdf5f8 100644
--- a/include/qemu/iova-tree.h
+++ b/include/qemu/iova-tree.h
@@ -72,10 +72,8 @@ int iova_tree_insert(IOVATree *tree, const DMAMap *map);
  * provided.  The range does not need to be exactly what has inserted,
  * all the mappings that are included in the provided range will be
  * removed from the tree.  Here map->translated_addr is meaningless.
- *
- * Return: 0 if succeeded, or <0 if error.
  */
-int iova_tree_remove(IOVATree *tree, const DMAMap *map);
+void iova_tree_remove(IOVATree *tree, const DMAMap *map);
 
 /**
  * iova_tree_find:
diff --git a/util/iova-tree.c b/util/iova-tree.c
index 6dff29c1f6..fee530a579 100644
--- a/util/iova-tree.c
+++ b/util/iova-tree.c
@@ -164,15 +164,13 @@ void iova_tree_foreach(IOVATree *tree, iova_tree_iterator 
iterator)
 g_tree_foreach(tree->tree, iova_tree_traverse, iterator);
 }
 
-int iova_tree_remove(IOVATree *tree, const DMAMap *map)
+void iova_tree_remove(IOVATree *tree, const DMAMap *map)
 {
 const DMAMap *overlap;
 
 while ((overlap = iova_tree_find(tree, map))) {
 g_tree_remove(tree->tree, overlap);
 }
-
-return IOVA_OK;
 }
 
 /**
-- 
2.27.0




[RFC PATCH v7 00/25] Net Control VQ support with asid in vDPA SVQ

2022-04-13 Thread Eugenio Pérez
Control virtqueue is used by networking device for accepting various
commands from the driver. It's a must to support multiqueue and other
configurations.

Shadow VirtQueue (SVQ) already makes migration of virtqueue state
possible, effectively intercepting the virtqueues so qemu can track
which regions of memory are dirtied by device action and need
migration. However, this does not cover the networking device state
seen by the driver, which is changed by CVQ messages such as MAC
address updates from the driver.

To solve that, this series uses the SVQ infrastructure to intercept the
networking control messages used by the device. This way, qemu is able
to update the VirtIONet device model and to migrate it.

You can run qemu in two modes after applying this series: intercepting
only the cvq with x-cvq-svq=on, or intercepting all the virtqueues by
adding x-svq=on to the cmdline:

-netdev 
type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vhost-vdpa0,x-cvq-svq=on,x-svq=on

The most updated kernel part of ASID is proposed at [1].

Modes without x-cvq-svq have not been tested with this series. VQ
commands other than set mac or mq are not tested. Some details like
error control are not 100% tested either.

The first 5 patches will be or have already been proposed separately.
Patches 6 and 7 enable some prerequisites. Patch 8 adds a cmdline
parameter to shadow all virtqueues. The rest of the commits introduce
the actual functionality.

Comments are welcome.

Changes from rfc v6:
* Fix bad iotlb updates order when batching was enabled
* Add reference counting to iova_tree so cleaning is simpler.

Changes from rfc v5:
* Fixes bad calculus of cvq end group when MQ is not acked by the guest.

Changes from rfc v4:
* Add missing tracing
* Add multiqueue support
* Use already sent version for replacing g_memdup
* Care with memory management

Changes from rfc v3:
* Fix bad returning of descriptors to SVQ list.

Changes from rfc v2:
* Fix use-after-free.

Changes from rfc v1:
* Rebase to latest master.
* Configure ASID instead of assuming cvq asid != data vqs asid.
* Update device model so (MAC) state can be migrated too.

[1] https://lkml.kernel.org/kvm/20220224212314.1326-1-gda...@xilinx.com/

Eugenio Pérez (24):
  vhost: Track descriptor chain in private at SVQ
  vdpa: Add missing tracing to batch mapping functions
  vdpa: Fix bad index calculus at vhost_vdpa_get_vring_base
  util: Return void on iova_tree_remove
  vdpa: Send all updates in memory listener commit
  vhost: Add reference counting to vhost_iova_tree
  vdpa: Add x-svq to NetdevVhostVDPAOptions
  vhost: move descriptor translation to vhost_svq_vring_write_descs
  vdpa: Fix index calculus at vhost_vdpa_svqs_start
  virtio-net: Expose ctrl virtqueue logic
  vdpa: Extract get features part from vhost_vdpa_get_max_queue_pairs
  virtio: Make virtqueue_alloc_element non-static
  vhost: Add SVQElement
  vhost: Add custom used buffer callback
  vdpa: control virtqueue support on shadow virtqueue
  vhost: Add vhost_iova_tree_find
  vdpa: Add map/unmap operation callback to SVQ
  vhost: Add vhost_svq_inject
  vdpa: add NetClientState->start() callback
  vdpa: Add vhost_vdpa_start_control_svq
  vhost: Update kernel headers
  vhost: Make possible to check for device exclusive vq group
  vdpa: Add asid attribute to vdpa device
  vdpa: Add x-cvq-svq

Philippe Mathieu-Daudé (1):
  hw/virtio: Replace g_memdup() by g_memdup2()

 qapi/net.json|  13 +-
 hw/virtio/vhost-iova-tree.h  |   7 +-
 hw/virtio/vhost-shadow-virtqueue.h   |  52 +++-
 include/hw/virtio/vhost-vdpa.h   |   4 +-
 include/hw/virtio/vhost.h|   6 +
 include/hw/virtio/virtio-net.h   |   3 +
 include/hw/virtio/virtio.h   |   1 +
 include/net/net.h|   2 +
 include/qemu/iova-tree.h |   4 +-
 include/standard-headers/linux/vhost_types.h |  11 +-
 linux-headers/linux/vhost.h  |  25 +-
 hw/net/vhost_net.c   |  13 +-
 hw/net/virtio-net.c  |  82 ++---
 hw/virtio/vhost-iova-tree.c  |  35 ++-
 hw/virtio/vhost-shadow-virtqueue.c   | 265 +---
 hw/virtio/vhost-vdpa.c   | 262 
 hw/virtio/virtio-crypto.c|   6 +-
 hw/virtio/virtio.c   |   2 +-
 net/vhost-vdpa.c | 305 +--
 util/iova-tree.c |   4 +-
 hw/virtio/trace-events   |   8 +-
 21 files changed, 930 insertions(+), 180 deletions(-)

-- 
2.27.0





[RFC PATCH v7 01/25] vhost: Track descriptor chain in private at SVQ

2022-04-13 Thread Eugenio Pérez
Only the first descriptor of each chain was properly enqueued back.

While we're at it, harden SVQ: the device could have access to modify
them, and it definitely will have access when we implement packed vq.
Harden SVQ by maintaining a private copy of the descriptor chain. Other
fields like buffer addresses are already maintained separately.
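
A standalone sketch of the idea (plain C, not QEMU code): the chain is
walked through QEMU's private next array, so whatever the device writes
into the ring's own next fields is never trusted:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      /* QEMU's private copy, initialized like the free list below */
      uint16_t desc_next[8] = { 1, 2, 3, 4, 5, 6, 7, 0 };
      /* walk a 3-descriptor chain starting at head 2 */
      uint16_t i = 2;
      for (int n = 0; n < 3; n++) {
          printf("desc %u\n", i);
          i = desc_next[i];   /* never read vring.desc[i].next back */
      }
      return 0;
  }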

Fixes: 100890f7ca ("vhost: Shadow virtqueue buffers forwarding")

Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-shadow-virtqueue.h |  6 ++
 hw/virtio/vhost-shadow-virtqueue.c | 27 +--
 2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.h 
b/hw/virtio/vhost-shadow-virtqueue.h
index e5e24c536d..c132c994e9 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -53,6 +53,12 @@ typedef struct VhostShadowVirtqueue {
 /* Next VirtQueue element that guest made available */
 VirtQueueElement *next_guest_avail_elem;
 
+/*
+ * Backup next field for each descriptor so we can recover securely, not
+ * needing to trust the device access.
+ */
+uint16_t *desc_next;
+
 /* Next head to expose to the device */
 uint16_t shadow_avail_idx;
 
diff --git a/hw/virtio/vhost-shadow-virtqueue.c 
b/hw/virtio/vhost-shadow-virtqueue.c
index b232803d1b..a2531d5874 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -138,6 +138,7 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue 
*svq, hwaddr *sg,
 for (n = 0; n < num; n++) {
 if (more_descs || (n + 1 < num)) {
 descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
+descs[i].next = cpu_to_le16(svq->desc_next[i]);
 } else {
 descs[i].flags = flags;
 }
@@ -145,10 +146,10 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue 
*svq, hwaddr *sg,
 descs[i].len = cpu_to_le32(iovec[n].iov_len);
 
 last = i;
-i = cpu_to_le16(descs[i].next);
+i = cpu_to_le16(svq->desc_next[i]);
 }
 
-svq->free_head = le16_to_cpu(descs[last].next);
+svq->free_head = le16_to_cpu(svq->desc_next[last]);
 }
 
 static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
@@ -333,13 +334,22 @@ static void 
vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
 svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
 }
 
+static uint16_t vhost_svq_last_desc_of_chain(const VhostShadowVirtqueue *svq,
+ uint16_t num, uint16_t i)
+{
+for (uint16_t j = 0; j < num; ++j) {
+i = le16_to_cpu(svq->desc_next[i]);
+}
+
+return i;
+}
+
 static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
uint32_t *len)
 {
-vring_desc_t *descs = svq->vring.desc;
 const vring_used_t *used = svq->vring.used;
 vring_used_elem_t used_elem;
-uint16_t last_used;
+uint16_t last_used, last_used_chain, num;
 
 if (!vhost_svq_more_used(svq)) {
 return NULL;
@@ -365,7 +375,10 @@ static VirtQueueElement 
*vhost_svq_get_buf(VhostShadowVirtqueue *svq,
 return NULL;
 }
 
-descs[used_elem.id].next = svq->free_head;
+num = svq->ring_id_maps[used_elem.id]->in_num +
+  svq->ring_id_maps[used_elem.id]->out_num;
+last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id);
+svq->desc_next[last_used_chain] = svq->free_head;
 svq->free_head = used_elem.id;
 
 *len = used_elem.len;
@@ -540,8 +553,9 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, 
VirtIODevice *vdev,
 svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
 memset(svq->vring.used, 0, device_size);
 svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num);
+svq->desc_next = g_new0(uint16_t, svq->vring.num);
 for (unsigned i = 0; i < svq->vring.num - 1; i++) {
-svq->vring.desc[i].next = cpu_to_le16(i + 1);
+svq->desc_next[i] = cpu_to_le16(i + 1);
 }
 }
 
@@ -574,6 +588,7 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
 virtqueue_detach_element(svq->vq, next_avail_elem, 0);
 }
 svq->vq = NULL;
+g_free(svq->desc_next);
 g_free(svq->ring_id_maps);
 qemu_vfree(svq->vring.desc);
 qemu_vfree(svq->vring.used);
-- 
2.27.0
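
A minimal, self-contained sketch of the idea in the patch above: the
descriptor chain is threaded through a host-private next[] array, so the
free list never depends on device-writable ring memory. All names here
(RING_NUM, ring_alloc_chain, ...) are illustrative, not QEMU API:

#include <stdint.h>
#include <stdio.h>

#define RING_NUM 8

static uint16_t desc_next[RING_NUM];   /* host-private shadow of desc[i].next */
static uint16_t free_head;

static void ring_init(void)
{
    for (uint16_t i = 0; i < RING_NUM - 1; i++) {
        desc_next[i] = i + 1;
    }
    free_head = 0;
}

/* Take a chain of n descriptors off the free list; returns its head. */
static uint16_t ring_alloc_chain(uint16_t n)
{
    uint16_t head = free_head, last = head;

    for (uint16_t j = 1; j < n; j++) {
        last = desc_next[last];   /* trusted copy, never re-read from the ring */
    }
    free_head = desc_next[last];
    return head;
}

/* Give a used chain of n descriptors back, as vhost_svq_get_buf() does. */
static void ring_free_chain(uint16_t head, uint16_t n)
{
    uint16_t last = head;

    for (uint16_t j = 1; j < n; j++) {
        last = desc_next[last];   /* a walk like vhost_svq_last_desc_of_chain() */
    }
    desc_next[last] = free_head;
    free_head = head;
}

int main(void)
{
    ring_init();
    uint16_t head = ring_alloc_chain(3);
    ring_free_chain(head, 3);
    printf("head=%u free_head=%u\n", head, free_head);   /* head=0 free_head=0 */
    return 0;
}

Even if the device scribbles over vring.desc[], the walks above only touch
host-controlled data, which is exactly what the desc_next[] backup buys.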




Re: [PATCH v5 04/13] mm/shmem: Restrict MFD_INACCESSIBLE memory against RLIMIT_MEMLOCK

2022-04-13 Thread David Hildenbrand
> 
> So this is another situation where the actual backend (TDX, SEV, pKVM, pure 
> software) makes a difference -- depending on exactly what backend we're 
> using, the memory may not be unmoveable.  It might even be swappable (in the 
> potentially distant future).

Right. And on a system without swap we don't particularly care about
mlock, but we might (in most cases) care about fragmentation with
unmovable memory.

> 
> Anyway, here's a concrete proposal, with a bit of handwaving:

Thanks for investing some brainpower.

> 
> We add new cgroup limits:
> 
> memory.unmoveable
> memory.locked
> 
> These can be set to an actual number or they can be set to the special value 
> ROOT_CAP.  If they're set to ROOT_CAP, then anyone in the cgroup with 
> capable(CAP_SYS_RESOURCE) (i.e. the global capability) can allocate movable 
> or locked memory with this (and potentially other) new APIs.  If it's 0, then 
> they can't.  If it's another value, then the memory can be allocated, charged 
> to the cgroup, up to the limit, with no particular capability needed.  The 
> default at boot is ROOT_CAP.  Anyone who wants to configure it differently is 
> free to do so.  This avoids introducing a DoS, makes it easy to run tests 
> without configuring cgroup, and lets serious users set up their cgroups.

I wonder what the implications are for existing user space.

Assume we want to move page pinning (rdma, vfio, io_uring, ...) to the
new model. How can we be sure

a) We don't break existing user space
b) We don't open the doors unnoticed for the admin to go crazy on
   unmovable memory.

Any ideas?

> 
> Nothing is charge per mm.
> 
> To make this fully sensible, we need to know what the backend is for the 
> private memory before allocating any so that we can charge it accordingly.

Right, the support for migration and/or swap defines how to account.

-- 
Thanks,

David / dhildenb
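
To make the proposed semantics concrete, a hedged sketch of the charge
decision. The memory.unmoveable / memory.locked knobs and the ROOT_CAP
sentinel are only proposed above and do not exist in Linux;
has_cap_sys_resource stands in for the kernel's capable(CAP_SYS_RESOURCE)
check:

#include <stdbool.h>

#define LIMIT_ROOT_CAP (-1L)        /* stand-in sentinel for ROOT_CAP */

static bool has_cap_sys_resource;   /* stand-in for capable(CAP_SYS_RESOURCE) */

/* May @request more bytes be charged to a cgroup that already holds
 * @charged bytes, given its configured @limit? */
static bool may_charge(long limit, long charged, long request)
{
    if (limit == LIMIT_ROOT_CAP) {
        return has_cap_sys_resource;    /* the global capability gates it */
    }
    return charged + request <= limit;  /* otherwise charge up to the limit */
}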




Re: [RFC PATCH 0/5] Removal of AioContext lock, bs->parents and ->children: proof of concept

2022-04-13 Thread Kevin Wolf
Am 13.04.2022 um 17:14 hat Emanuele Giuseppe Esposito geschrieben:
> Am 13/04/2022 um 16:51 schrieb Kevin Wolf:
> > Am 13.04.2022 um 15:43 hat Emanuele Giuseppe Esposito geschrieben:
> >> So this is a more concrete and up-to-date header.
> >>
> >> Few things to notice:
> >> - we have a list of AioContext. They are registered once an aiocontext
> >> is created, and deleted when it is destroyed.
> >> This list is helpful because each aiocontext can only modify its own
> >> number of readers, avoiding unnecessary cacheline bouncing
> >>
> >> - if a coroutine changes aiocontext, it's ok with regards to the
> >> per-aiocontext reader counter. As long as the sum is correct, there's no
> >> issue. The problem comes only once the original aiocontext is deleted,
> >> and at that point we need to move the count it held to a shared global
> >> variable, otherwise we risk to lose track of readers.
> > 
> > So the idea is that we can do bdrv_graph_co_rdlock() in one thread and
> > the corresponding bdrv_graph_co_rdunlock() in a different thread?
> > 
> > Would the unlock somehow remember the original thread, or do you use the
> > "sum is correct" argument and allow negative counter values, so you can
> > end up having count +1 in A and -1 in B to represent "no active
> > readers"? If this happens, it's likely to happen many times, so do we
> > have to take integer overflows into account then?
> > 
> >> - All synchronization between the flags explained in this header is of
> >> course handled in the implementation. But for now it would be nice to
> >> have a feedback on the idea/API.
> >>
> >> So in short we need:
> >> - per-aiocontext counter
> >> - global list of aiocontext
> >> - global additional reader counter (in case an aiocontext is deleted)
> >> - global CoQueue
> >> - global has_writer flag
> >> - global QemuMutex to protect the list access
> >>
> >> Emanuele
> >>
> >> #ifndef BLOCK_LOCK_H
> >> #define BLOCK_LOCK_H
> >>
> >> #include "qemu/osdep.h"
> >>
> >> /*
> >>  * register_aiocontext:
> >>  * Add AioContext @ctx to the list of AioContext.
> >>  * This list is used to obtain the total number of readers
> >>  * currently running the graph.
> >>  */
> >> void register_aiocontext(AioContext *ctx);
> >>
> >> /*
> >>  * unregister_aiocontext:
> >>  * Removes AioContext @ctx from the list of AioContext.
> >>  */
> >> void unregister_aiocontext(AioContext *ctx);
> >>
> >> /*
> >>  * bdrv_graph_wrlock:
> >>  * Modify the graph. Nobody else is allowed to access the graph.
> >>  * Set global has_writer to 1, so that subsequent readers will wait
> >>  * in a coroutine queue until the writer is done.
> >>  * Then keep track of the running readers by counting the total
> >>  * number of readers (sum of all aiocontext readers), and wait until
> >>  * they all finish with AIO_WAIT_WHILE.
> >>  */
> >> void bdrv_graph_wrlock(void);
> > 
> > Do we need a coroutine version that yields instead of using
> > AIO_WAIT_WHILE() or are we sure this will only ever be called from
> > non-coroutine contexts?
> 
> writes (graph modifications) are always done under BQL in the main loop.

Yes, I think we're fairly certain about this part.

> Except for a unit test, I don't think a coroutine ever does that.

I'm not sure about this one, though. Didn't you have cases where
bdrv_replace_child_noperm() was called in coroutine context? Or maybe
I'm mixing up things here.

> >> /*
> >>  * bdrv_graph_wrunlock:
> >>  * Write finished, reset global has_writer to 0 and restart
> >>  * all readers that are waiting.
> >>  */
> >> void bdrv_graph_wrunlock(void);
> >>
> >> /*
> >>  * bdrv_graph_co_rdlock:
> >>  * Read the bs graph. Increases the reader counter of the current
> >> aiocontext,
> >>  * and if has_writer is set, it means that the writer is modifying
> >>  * the graph, therefore wait in a coroutine queue.
> >>  * The writer will then wake this coroutine once it is done.
> >>  *
> >>  * This lock cannot be taken recursively.
> >>  */
> >> void coroutine_fn bdrv_graph_co_rdlock(void);
> > 
> > What prevents it from being taken recursively when it's just a counter?
> > (I do see however, that you can't take a reader lock while you have the
> > writer lock or vice versa because it would deadlock.)
> > 
> I actually didn't add the assertion to prevent it from being recursive
> yet, but I think it simplifies everything if it's not recursive
> 
> > Does this being a coroutine_fn mean that we would have to convert QMP
> > command handlers to coroutines so that they can take the rdlock while
> > they don't expect the graph to change? Or should we have a non-coroutine
> > version, too, that works with AIO_WAIT_WHILE()?
> 
> Why convert the QMP command handlers? coroutine_fn was just to signal
> that it can also be called from coroutines, like the ones created by the
> blk_* API.

coroutine_fn means that it can _only_ be called from coroutines (because
it will yield, which doesn't work outside of a coroutine - not sure what
happens, probably just a 

Re: [PATCH] target/i386: do not access beyond the low 128 bits of SSE registers

2022-04-13 Thread Peter Maydell
On Wed, 13 Apr 2022 at 17:09, Paolo Bonzini  wrote:
>
> The i386 target consolidates all vector registers so that instead of
> XMMReg, YMMReg and ZMMReg structs there is a single ZMMReg that can
> fit all of SSE, AVX and AVX512.
>
> When TCG copies data from and to the SSE registers, it uses the
> full 64-byte width.  This is not a correctness issue because TCG
> never lets guest code see beyond the first 128 bits of the ZMM
> registers, however it causes uninitialized stack memory to
> make it to the CPU's migration stream.
>
> Fix it by only copying the low 16 bytes of the ZMMReg union into
> the destination register.
>

> +/*
> + * Copy the relevant parts of a Reg value around. In the case where
> + * sizeof(Reg) > SIZE, these helpers operate only on the lower bytes of
> + * a 64 byte ZMMReg, so we must copy only those and keep the top bytes
> + * untouched in the guest-visible destination register.
> + * Note that the "lower bytes" are placed last in memory on big-endian
> + * hosts, which store the vector backwards in memory.  In that case the
> + * copy *starts* at B(SIZE - 1) and ends at B(0), the opposite of
> + * the little-endian case.
> + */
> +#ifdef HOST_WORDS_BIGENDIAN
> +#define MOVE(d, r) memcpy(&((d).B(SIZE - 1)), &(d).B(SIZE - 1), SIZE)

Still has the typo where it's copying d to d, not r to d.


> +#else
> +#define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE)
> +#endif

Otherwise
Reviewed-by: Peter Maydell 

thanks
-- PMM
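
For reference, the big-endian branch with the typo Peter points out
corrected (copying r to d, not d to d) would presumably read:

#ifdef HOST_WORDS_BIGENDIAN
#define MOVE(d, r) memcpy(&((d).B(SIZE - 1)), &(r).B(SIZE - 1), SIZE)
#else
#define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE)
#endif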



Re: [PATCH v5 04/13] mm/shmem: Restrict MFD_INACCESSIBLE memory against RLIMIT_MEMLOCK

2022-04-13 Thread David Hildenbrand
On 12.04.22 16:36, Jason Gunthorpe wrote:
> On Fri, Apr 08, 2022 at 08:54:02PM +0200, David Hildenbrand wrote:
> 
>> RLIMIT_MEMLOCK was the obvious candidate, but as we discovered in the
>> past already with secretmem, it's not 100% that good of a fit (unmovable
>> is worse than mlocked). But it gets the job done for now at least.
> 
> No, it doesn't. There are too many different interpretations of how
> MEMLOCK is supposed to work
> 
> eg VFIO accounts per-process so hostile users can just fork to go past
> it.
> 
> RDMA is per-process but uses a different counter, so you can double up
> 
> io_uring is per-user and uses a 3rd counter, so it can triple up on
> the above two

Thanks for that summary, very helpful.

> 
>> So I'm open for alternative to limit the amount of unmovable memory we
>> might allocate for user space, and then we could convert seretmem as well.
> 
> I think it has to be cgroup based considering where we are now :\

Most probably. I think the important lessons we learned are that

* mlocked != unmovable.
* RLIMIT_MEMLOCK should most probably never have been abused for
  unmovable memory (especially, long-term pinning)


-- 
Thanks,

David / dhildenb




Re: [PATCH for-7.1 6/8] nbd: move s->state under requests_lock

2022-04-13 Thread Eric Blake
On Tue, Apr 12, 2022 at 09:42:02PM +0200, Paolo Bonzini wrote:
> Remove the confusing, and most likely wrong, atomics.  The only function
> that used to be somewhat in a hot path was nbd_client_connected(),
> but it is not anymore after the previous patches.
> 
> The function nbd_client_connecting_wait() was used mostly to check if
> a request had to be reissued (outside requests_lock), but also
> under requests_lock in nbd_client_connecting_wait().  The two uses have to

"Function A was used mostly..., but also under requests_lock in
function A."  Reading the rest of the patch, I think...[1]

> be separated; for the former we rename it to nbd_client_will_reconnect()
> and make it take s->requests_lock; for the latter the access can simply
> be inlined.  The new name is clearer, and ensures that a missing
> conversion is caught by the compiler.

I take it your experiments with C++ coroutines helped find this ;)

> 
> Signed-off-by: Paolo Bonzini 
> ---
>  block/nbd.c | 88 +
>  1 file changed, 48 insertions(+), 40 deletions(-)

> @@ -226,7 +225,9 @@ static void nbd_teardown_connection(BlockDriverState *bs)
>  s->ioc = NULL;
>  }
>  
> -s->state = NBD_CLIENT_QUIT;
> +WITH_QEMU_LOCK_GUARD(&s->requests_lock) {
> +s->state = NBD_CLIENT_QUIT;
> +}
>  }

This style for protecting s->state at the end of the function takes 3
lines thanks to WITH_QEMU_LOCK_GUARD...[2]

>  
>  static void open_timer_del(BDRVNBDState *s)
> @@ -255,16 +256,13 @@ static void open_timer_init(BDRVNBDState *s, uint64_t expire_time_ns)
>  timer_mod(s->open_timer, expire_time_ns);
>  }
>  
> -static bool nbd_client_connecting(BDRVNBDState *s)
> +static bool nbd_client_will_reconnect(BDRVNBDState *s)

This part of the diff got hard to read, since you mixed shuffling
functions with a rename.  On a closer read, I see that
nbd_client_connecting() was merely moved later[3], while the new name
nbd_client_will_reconnect()...[4]

>  {
> -NBDClientState state = qatomic_load_acquire(&s->state);
> -return state == NBD_CLIENT_CONNECTING_WAIT ||
> -state == NBD_CLIENT_CONNECTING_NOWAIT;
> -}
> -
> -static bool nbd_client_connecting_wait(BDRVNBDState *s)

[4]...is indeed happening to nbd_client_connecting_wait(), as promised
in the commit message.  Which means:

[1]...so it looks like the first 'function A' did indeed want to be
nbd_client_connecting_wait() which got renamed (since
nbd_client_connecting() was moved later in the file), while...[1]

> -{
> -return qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTING_WAIT;
> +/*
> + * Called only after a socket error, so this is not performance sensitive.
> + */
> +QEMU_LOCK_GUARD(&s->requests_lock);
> +return s->state == NBD_CLIENT_CONNECTING_WAIT;
>  }

[2]...while here, you only needed two lines, using QEMU_LOCK_GUARD.
Both styles work, but it seems like we should be consistent, and I
would favor the shorter style when all that is being guarded is a
single line.

>  
>  /*
> @@ -351,15 +349,24 @@ int coroutine_fn nbd_co_do_establish_connection(BlockDriverState *bs,
>  qio_channel_attach_aio_context(s->ioc, bdrv_get_aio_context(bs));
>  
>  /* successfully connected */
> -s->state = NBD_CLIENT_CONNECTED;
> +WITH_QEMU_LOCK_GUARD(&s->requests_lock) {
> +s->state = NBD_CLIENT_CONNECTED;
> +}
>  
>  return 0;
>  }

Another place where the shorter QEMU_LOCK_GUARD() would work.

>  
> +/* Called with s->requests_lock held.  */
> +static bool nbd_client_connecting(BDRVNBDState *s)

[3]...here's where the moved function threw me off.  Perhaps splitting
out a preliminary patch to just move the function before converting it
to be under s->requests_lock, so that the rename of a different
function doesn't cause a hard-to-grok diff, would be wise.

> +{
> +return s->state == NBD_CLIENT_CONNECTING_WAIT ||
> +s->state == NBD_CLIENT_CONNECTING_NOWAIT;
> +}
> +
>  /* Called with s->requests_lock taken.  */
>  static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
>  {
> -bool blocking = nbd_client_connecting_wait(s);
> +bool blocking = s->state == NBD_CLIENT_CONNECTING_WAIT;

[1]...and the second instance of 'function A' in the commit message
should have been nbd_reconnect_attempt().

As messy as the diff was, I still think I understood it.  With the
necessary correction to the commit message according to [1], I could
be comfortable with

Reviewed-by: Eric Blake 

although the suggestion in [3] to split out the function motion to a
separate patch may result in the v2 series looking different enough
that you may want to leave off my R-b to ensure I still review things
carefully.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org
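
For readers comparing the two guard styles discussed in [2], a minimal
sketch using the real macros from QEMU's include/qemu/lockable.h. The
variables stand in for s->requests_lock and s->state, and this compiles
only inside the QEMU tree:

static QemuMutex lock;
static int state;

static void set_state(int v)
{
    WITH_QEMU_LOCK_GUARD(&lock) {   /* three lines for one guarded statement */
        state = v;
    }
}

static int get_state(void)
{
    QEMU_LOCK_GUARD(&lock);         /* two lines: held until the function returns */
    return state;
}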




[PATCH] target/i386: do not access beyond the low 128 bits of SSE registers

2022-04-13 Thread Paolo Bonzini
The i386 target consolidates all vector registers so that instead of
XMMReg, YMMReg and ZMMReg structs there is a single ZMMReg that can
fit all of SSE, AVX and AVX512.

When TCG copies data from and to the SSE registers, it uses the
full 64-byte width.  This is not a correctness issue because TCG
never lets guest code see beyond the first 128 bits of the ZMM
registers, however it causes uninitialized stack memory to
make it to the CPU's migration stream.

Fix it by only copying the low 16 bytes of the ZMMReg union into
the destination register.

Signed-off-by: Paolo Bonzini 
---
 target/i386/ops_sse.h | 75 +++
 1 file changed, 47 insertions(+), 28 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 6f1fc174b3..b415809396 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -22,6 +22,7 @@
 
 #if SHIFT == 0
 #define Reg MMXReg
+#define SIZE 8
 #define XMM_ONLY(...)
 #define B(n) MMX_B(n)
 #define W(n) MMX_W(n)
@@ -30,6 +31,7 @@
 #define SUFFIX _mmx
 #else
 #define Reg ZMMReg
+#define SIZE 16
 #define XMM_ONLY(...) __VA_ARGS__
 #define B(n) ZMM_B(n)
 #define W(n) ZMM_W(n)
@@ -38,6 +40,22 @@
 #define SUFFIX _xmm
 #endif
 
+/*
+ * Copy the relevant parts of a Reg value around. In the case where
+ * sizeof(Reg) > SIZE, these helpers operate only on the lower bytes of
+ * a 64 byte ZMMReg, so we must copy only those and keep the top bytes
+ * untouched in the guest-visible destination register.
+ * Note that the "lower bytes" are placed last in memory on big-endian
+ * hosts, which store the vector backwards in memory.  In that case the
+ * copy *starts* at B(SIZE - 1) and ends at B(0), the opposite of
+ * the little-endian case.
+ */
+#ifdef HOST_WORDS_BIGENDIAN
+#define MOVE(d, r) memcpy(&((d).B(SIZE - 1)), &(d).B(SIZE - 1), SIZE)
+#else
+#define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE)
+#endif
+
 void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
 int shift;
@@ -516,7 +534,7 @@ void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
 r.W(1) = s->W((order >> 2) & 3);
 r.W(2) = s->W((order >> 4) & 3);
 r.W(3) = s->W((order >> 6) & 3);
-*d = r;
+MOVE(*d, r);
 }
 #else
 void helper_shufps(Reg *d, Reg *s, int order)
@@ -527,7 +545,7 @@ void helper_shufps(Reg *d, Reg *s, int order)
 r.L(1) = d->L((order >> 2) & 3);
 r.L(2) = s->L((order >> 4) & 3);
 r.L(3) = s->L((order >> 6) & 3);
-*d = r;
+MOVE(*d, r);
 }
 
 void helper_shufpd(Reg *d, Reg *s, int order)
@@ -536,7 +554,7 @@ void helper_shufpd(Reg *d, Reg *s, int order)
 
 r.Q(0) = d->Q(order & 1);
 r.Q(1) = s->Q((order >> 1) & 1);
-*d = r;
+MOVE(*d, r);
 }
 
 void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
@@ -547,7 +565,7 @@ void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
 r.L(1) = s->L((order >> 2) & 3);
 r.L(2) = s->L((order >> 4) & 3);
 r.L(3) = s->L((order >> 6) & 3);
-*d = r;
+MOVE(*d, r);
 }
 
 void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
@@ -559,7 +577,7 @@ void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
 r.W(2) = s->W((order >> 4) & 3);
 r.W(3) = s->W((order >> 6) & 3);
 r.Q(1) = s->Q(1);
-*d = r;
+MOVE(*d, r);
 }
 
 void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
@@ -571,7 +589,7 @@ void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
 r.W(5) = s->W(4 + ((order >> 2) & 3));
 r.W(6) = s->W(4 + ((order >> 4) & 3));
 r.W(7) = s->W(4 + ((order >> 6) & 3));
-*d = r;
+MOVE(*d, r);
 }
 #endif
 
@@ -937,7 +955,7 @@ void helper_haddps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 r.ZMM_S(1) = float32_add(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status);
 r.ZMM_S(2) = float32_add(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
 r.ZMM_S(3) = float32_add(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
-*d = r;
+MOVE(*d, r);
 }
 
 void helper_haddpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
@@ -946,7 +964,7 @@ void helper_haddpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 
 r.ZMM_D(0) = float64_add(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status);
 r.ZMM_D(1) = float64_add(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
-*d = r;
+MOVE(*d, r);
 }
 
 void helper_hsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
@@ -957,7 +975,7 @@ void helper_hsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 r.ZMM_S(1) = float32_sub(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status);
 r.ZMM_S(2) = float32_sub(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
 r.ZMM_S(3) = float32_sub(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
-*d = r;
+MOVE(*d, r);
 }
 
 void helper_hsubpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
@@ -966,7 +984,7 @@ void helper_hsubpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 
 r.ZMM_D(0) = float64_sub(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status);
 r.ZMM_D(1) = float64_sub(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
-*d = r;
+MOVE(*d, r);
 }
 
 void helper_addsubps(CPUX86State *env, 
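
A self-contained illustration (not QEMU code) of the little-endian case
of MOVE(): only the low SIZE bytes of the wide register are copied, and
the upper bytes of the destination stay untouched. On big-endian hosts,
where the byte accessors run backwards in memory, the copy would start
at the address of B(SIZE - 1) instead; see Peter Maydell's reply for the
r-vs-d typo in that branch.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define SIZE 16                        /* bytes guest SSE code may see */

typedef union {
    uint8_t b[64];                     /* stand-in for a 64-byte ZMMReg */
} FakeReg;

static void move_low(FakeReg *d, const FakeReg *r)
{
    /* little-endian layout: low bytes come first in memory */
    memcpy(&d->b[0], &r->b[0], SIZE);  /* upper 48 bytes stay untouched */
}

int main(void)
{
    FakeReg d, r;

    memset(&d, 0xAA, sizeof(d));
    memset(&r, 0x55, sizeof(r));
    move_low(&d, &r);
    printf("low byte: %02x, first high byte: %02x\n", d.b[0], d.b[16]);
    /* prints "low byte: 55, first high byte: aa" */
    return 0;
}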

Re: [PATCH for-7.1 5/8] nbd: use a QemuMutex to synchronize reconnection with coroutines

2022-04-13 Thread Eric Blake
On Tue, Apr 12, 2022 at 09:42:01PM +0200, Paolo Bonzini wrote:
> The condition for waiting on the s->free_sema queue depends on
> both s->in_flight and s->state.  The latter is currently using
> atomics, but this is quite dubious and probably wrong.
> 
> Because s->state is written in the main thread too, for example by
> the reconnect timer callback, it cannot be protected by a CoMutex.
> Introduce a separate lock that can be used by nbd_co_send_request();
> later on this lock will also be used for s->state.  There will not
> be any contention on the lock unless there is a reconnect, so this
> is not performance sensitive.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  block/nbd.c | 46 +++---
>  1 file changed, 27 insertions(+), 19 deletions(-)
> 
> diff --git a/block/nbd.c b/block/nbd.c
> index 0ff41cb914..c908ea6ae3 100644
> --- a/block/nbd.c
> +++ b/block/nbd.c
> @@ -72,17 +72,22 @@ typedef struct BDRVNBDState {
>  QIOChannel *ioc; /* The current I/O channel */
>  NBDExportInfo info;
>  
> -CoMutex send_mutex;
> +/*
> + * Protects free_sema, in_flight, requests[].coroutine,
> + * reconnect_delay_timer.
> + */
> +QemuMutex requests_lock;
>  CoQueue free_sema;
> -
> -CoMutex receive_mutex;
>  int in_flight;
> +NBDClientRequest requests[MAX_NBD_REQUESTS];
> +QEMUTimer *reconnect_delay_timer;
> +
> +CoMutex send_mutex;
> +CoMutex receive_mutex;
>  NBDClientState state;
>  
> -QEMUTimer *reconnect_delay_timer;
>  QEMUTimer *open_timer;
>  
> -NBDClientRequest requests[MAX_NBD_REQUESTS];

Reordering of the struct makes sense.

> @@ -468,11 +473,10 @@ static int coroutine_fn nbd_co_send_request(BlockDriverState *bs,
>  BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
>  int rc, i = -1;
>  
> -qemu_co_mutex_lock(&s->send_mutex);
> -
> +qemu_mutex_lock(&s->requests_lock);
>  while (s->in_flight == MAX_NBD_REQUESTS ||
> (!nbd_client_connected(s) && s->in_flight > 0)) {
> -qemu_co_queue_wait(&s->free_sema, &s->send_mutex);
> +qemu_co_queue_wait(&s->free_sema, &s->requests_lock);
>  }
>  
>  s->in_flight++;
> @@ -493,14 +497,14 @@ static int coroutine_fn nbd_co_send_request(BlockDriverState *bs,
>  }
>  }
>  
> -g_assert(qemu_in_coroutine());

Why is this assert dropped?  Is it because we've marked the function
with coroutine_fn?  If so, should we drop it earlier in the series,
when you added the label?

Otherwise, the patch makes sense to me.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org
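
Out of line, the CoQueue wait pattern from the hunk above looks like
this. It is only a sketch with stand-in names (the real fields are
s->free_sema and s->requests_lock), it compiles only inside the QEMU
tree, and the qemu_mutex_init()/qemu_co_queue_init() setup is elided:

static QemuMutex lock;
static CoQueue waiters;
static int slots;

static void coroutine_fn take_slot(void)
{
    qemu_mutex_lock(&lock);
    while (slots == 0) {
        /* drops @lock, yields, and re-takes @lock on wakeup */
        qemu_co_queue_wait(&waiters, &lock);
    }
    slots--;
    qemu_mutex_unlock(&lock);
}

static void coroutine_fn release_slot(void)
{
    qemu_mutex_lock(&lock);
    slots++;
    qemu_co_queue_restart_all(&waiters);
    qemu_mutex_unlock(&lock);
}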




Re: [PATCH for-7.1 4/8] nbd: keep send_mutex/free_sema handling outside nbd_co_do_establish_connection

2022-04-13 Thread Eric Blake
On Tue, Apr 12, 2022 at 09:42:00PM +0200, Paolo Bonzini wrote:
> Elevate s->in_flight early so that other incoming requests will wait
> on the CoQueue in nbd_co_send_request; restart them after getting back
> from nbd_reconnect_attempt.  This could be after the reconnect timer or
> nbd_cancel_in_flight have cancelled the attempt, so there is no
> need anymore to cancel the requests there.
> 
> nbd_co_send_request now handles both stopping and restarting pending
> requests after a successful connection, and there is no need to
> hold send_mutex in nbd_co_do_establish_connection.  The current setup
> is confusing because nbd_co_do_establish_connection is called both with
> send_mutex taken and without it.  Before the patch it uses free_sema which
> (at least in theory...) is protected by send_mutex, after the patch it
> does not anymore.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  block/coroutines.h |  4 +--
>  block/nbd.c| 66 ++
>  2 files changed, 33 insertions(+), 37 deletions(-)
> 

> +++ b/block/nbd.c

> @@ -359,25 +354,25 @@ int coroutine_fn nbd_co_do_establish_connection(BlockDriverState *bs,
>  /* called under s->send_mutex */
>  static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
>  {
> -assert(nbd_client_connecting(s));
> -assert(s->in_flight == 0);
> -
> -if (nbd_client_connecting_wait(s) && s->reconnect_delay &&
> -!s->reconnect_delay_timer)
> -{
> -/*
> - * It's first reconnect attempt after switching to
> - * NBD_CLIENT_CONNECTING_WAIT
> - */
> -reconnect_delay_timer_init(s,
> -qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
> -s->reconnect_delay * NANOSECONDS_PER_SECOND);
> -}
> +bool blocking = nbd_client_connecting_wait(s);
>  
>  /*
>   * Now we are sure that nobody is accessing the channel, and no one will
>   * try until we set the state to CONNECTED.
>   */
> +assert(nbd_client_connecting(s));
> +assert(s->in_flight == 1);
> +
> +if (blocking && !s->reconnect_delay_timer) {
> +/*
> + * It's first reconnect attempt after switching to

While moving this, we could add the missing article: "It's the first"

> + * NBD_CLIENT_CONNECTING_WAIT
> + */
> +g_assert(s->reconnect_delay);
> +reconnect_delay_timer_init(s,
> +qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
> +s->reconnect_delay * NANOSECONDS_PER_SECOND);
> +}
>  
>  /* Finalize previous connection if any */
>  if (s->ioc) {
> @@ -388,7 +383,9 @@ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
>  s->ioc = NULL;
>  }
>  
> -nbd_co_do_establish_connection(s->bs, NULL);
> +qemu_co_mutex_unlock(&s->send_mutex);
> +nbd_co_do_establish_connection(s->bs, blocking, NULL);
> +qemu_co_mutex_lock(&s->send_mutex);
>  
>  /*
>   * The reconnect attempt is done (maybe successfully, maybe not), so
> @@ -474,21 +471,21 @@ static int coroutine_fn nbd_co_send_request(BlockDriverState *bs,
>  qemu_co_mutex_lock(&s->send_mutex);
>  
>  while (s->in_flight == MAX_NBD_REQUESTS ||
> -   (!nbd_client_connected(s) && s->in_flight > 0))
> -{
> +   (!nbd_client_connected(s) && s->in_flight > 0)) {

Mixing in a style change here.  Not the end of the world.

The cosmetics are trivial, and the real change of enlarging the scope
of in_flight makes sense to me.

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [RFC PATCH 0/5] Removal of AioContext lock, bs->parents and ->children: proof of concept

2022-04-13 Thread Emanuele Giuseppe Esposito



Am 13/04/2022 um 17:14 schrieb Emanuele Giuseppe Esposito:
> 
> 
> Am 13/04/2022 um 16:51 schrieb Kevin Wolf:
>> Am 13.04.2022 um 15:43 hat Emanuele Giuseppe Esposito geschrieben:
>>> So this is a more concrete and up-to-date header.
>>>
>>> Few things to notice:
>>> - we have a list of AioContext. They are registered once an aiocontext
>>> is created, and deleted when it is destroyed.
>>> This list is helpful because each aiocontext can only modify its own
>>> number of readers, avoiding unnecessary cacheline bouncing
>>>
>>> - if a coroutine changes aiocontext, it's ok with regards to the
>>> per-aiocontext reader counter. As long as the sum is correct, there's no
>>> issue. The problem comes only once the original aiocontext is deleted,
>>> and at that point we need to move the count it held to a shared global
>>> variable, otherwise we risk to lose track of readers.
>>
>> So the idea is that we can do bdrv_graph_co_rdlock() in one thread and
>> the corresponding bdrv_graph_co_rdunlock() in a different thread?
>>
>> Would the unlock somehow remember the original thread, or do you use the
>> "sum is correct" argument and allow negative counter values, so you can
>> end up having count +1 in A and -1 in B to represent "no active
>> readers"? If this happens, it's likely to happen many times, so do we
>> have to take integer overflows into account then?
>>
>>> - All synchronization between the flags explained in this header is of
>>> course handled in the implementation. But for now it would be nice to
>>> have a feedback on the idea/API.
>>>
>>> So in short we need:
>>> - per-aiocontext counter
>>> - global list of aiocontext
>>> - global additional reader counter (in case an aiocontext is deleted)
>>> - global CoQueue
>>> - global has_writer flag
>>> - global QemuMutex to protect the list access
>>>
>>> Emanuele
>>>
>>> #ifndef BLOCK_LOCK_H
>>> #define BLOCK_LOCK_H
>>>
>>> #include "qemu/osdep.h"
>>>
>>> /*
>>>  * register_aiocontext:
>>>  * Add AioContext @ctx to the list of AioContext.
>>>  * This list is used to obtain the total number of readers
>>>  * currently running the graph.
>>>  */
>>> void register_aiocontext(AioContext *ctx);
>>>
>>> /*
>>>  * unregister_aiocontext:
>>>  * Removes AioContext @ctx from the list of AioContext.
>>>  */
>>> void unregister_aiocontext(AioContext *ctx);
>>>
>>> /*
>>>  * bdrv_graph_wrlock:
>>>  * Modify the graph. Nobody else is allowed to access the graph.
>>>  * Set global has_writer to 1, so that subsequent readers will wait
>>>  * in a coroutine queue until the writer is done.
>>>  * Then keep track of the running readers by counting the total
>>>  * number of readers (sum of all aiocontext readers), and wait until
>>>  * they all finish with AIO_WAIT_WHILE.
>>>  */
>>> void bdrv_graph_wrlock(void);
>>
>> Do we need a coroutine version that yields instead of using
>> AIO_WAIT_WHILE() or are we sure this will only ever be called from
>> non-coroutine contexts?
> 
> writes (graph modifications) are always done under BQL in the main loop.
> Except for a unit test, I don't think a coroutine ever does that.

Additional point (1): I am also preparing a series with all the "helpful
fixes" I collected from the other discarded/obsolete series, like
subtree_drain and similar.

So except for the job patches, you can discard all other series.

> 
>>
>>> /*
>>>  * bdrv_graph_wrunlock:
>>>  * Write finished, reset global has_writer to 0 and restart
>>>  * all readers that are waiting.
>>>  */
>>> void bdrv_graph_wrunlock(void);
>>>
>>> /*
>>>  * bdrv_graph_co_rdlock:
>>>  * Read the bs graph. Increases the reader counter of the current
>>> aiocontext,
>>>  * and if has_writer is set, it means that the writer is modifying
>>>  * the graph, therefore wait in a coroutine queue.
>>>  * The writer will then wake this coroutine once it is done.
>>>  *
>>>  * This lock cannot be taken recursively.
>>>  */
>>> void coroutine_fn bdrv_graph_co_rdlock(void);
>>
>> What prevents it from being taken recursively when it's just a counter?
>> (I do see however, that you can't take a reader lock while you have the
>> writer lock or vice versa because it would deadlock.)
>>
> I actually didn't add the assertion to prevent it from being recursive
> yet, but I think it simplifies everything if it's not recursive


Additional point (2): I forgot that with counters there's no easy way to
avoid recursion, so yeah theoretically it can be recursive. Still,
better avoid doing it intentionally though.

> 
>> Does this being a coroutine_fn mean that we would have to convert QMP
>> command handlers to coroutines so that they can take the rdlock while
>> they don't expect the graph to change? Or should we have a non-coroutine
>> version, too, that works with AIO_WAIT_WHILE()?
> 
> Why convert the QMP command handlers? coroutine_fn was just to signal
> that it can also be called from coroutines, like the ones created by the
> blk_* API.
> A reader does not have to be a coroutine. AIO_WAIT_WHILE is not
> 

Re: [RFC PATCH 0/5] Removal of AioContext lock, bs->parents and ->children: proof of concept

2022-04-13 Thread Emanuele Giuseppe Esposito



Am 13/04/2022 um 16:51 schrieb Kevin Wolf:
> Am 13.04.2022 um 15:43 hat Emanuele Giuseppe Esposito geschrieben:
>> So this is a more concrete and up-to-date header.
>>
>> Few things to notice:
>> - we have a list of AioContext. They are registered once an aiocontext
>> is created, and deleted when it is destroyed.
>> This list is helpful because each aiocontext can only modify its own
>> number of readers, avoiding unnecessary cacheline bouncing
>>
>> - if a coroutine changes aiocontext, it's ok with regards to the
>> per-aiocontext reader counter. As long as the sum is correct, there's no
>> issue. The problem comes only once the original aiocontext is deleted,
>> and at that point we need to move the count it held to a shared global
>> variable, otherwise we risk to lose track of readers.
> 
> So the idea is that we can do bdrv_graph_co_rdlock() in one thread and
> the corresponding bdrv_graph_co_rdunlock() in a different thread?
> 
> Would the unlock somehow remember the original thread, or do you use the
> "sum is correct" argument and allow negative counter values, so you can
> end up having count +1 in A and -1 in B to represent "no active
> readers"? If this happens, it's likely to happen many times, so do we
> have to take integer overflows into account then?
> 
>> - All synchronization between the flags explained in this header is of
>> course handled in the implementation. But for now it would be nice to
>> have a feedback on the idea/API.
>>
>> So in short we need:
>> - per-aiocontext counter
>> - global list of aiocontext
>> - global additional reader counter (in case an aiocontext is deleted)
>> - global CoQueue
>> - global has_writer flag
>> - global QemuMutex to protect the list access
>>
>> Emanuele
>>
>> #ifndef BLOCK_LOCK_H
>> #define BLOCK_LOCK_H
>>
>> #include "qemu/osdep.h"
>>
>> /*
>>  * register_aiocontext:
>>  * Add AioContext @ctx to the list of AioContext.
>>  * This list is used to obtain the total number of readers
>>  * currently running the graph.
>>  */
>> void register_aiocontext(AioContext *ctx);
>>
>> /*
>>  * unregister_aiocontext:
>>  * Removes AioContext @ctx from the list of AioContext.
>>  */
>> void unregister_aiocontext(AioContext *ctx);
>>
>> /*
>>  * bdrv_graph_wrlock:
>>  * Modify the graph. Nobody else is allowed to access the graph.
>>  * Set global has_writer to 1, so that subsequent readers will wait
>>  * in a coroutine queue until the writer is done.
>>  * Then keep track of the running readers by counting the total
>>  * number of readers (sum of all aiocontext readers), and wait until
>>  * they all finish with AIO_WAIT_WHILE.
>>  */
>> void bdrv_graph_wrlock(void);
> 
> Do we need a coroutine version that yields instead of using
> AIO_WAIT_WHILE() or are we sure this will only ever be called from
> non-coroutine contexts?

writes (graph modifications) are always done under BQL in the main loop.
Except for a unit test, I don't think a coroutine ever does that.

> 
>> /*
>>  * bdrv_graph_wrunlock:
>>  * Write finished, reset global has_writer to 0 and restart
>>  * all readers that are waiting.
>>  */
>> void bdrv_graph_wrunlock(void);
>>
>> /*
>>  * bdrv_graph_co_rdlock:
>>  * Read the bs graph. Increases the reader counter of the current
>> aiocontext,
>>  * and if has_writer is set, it means that the writer is modifying
>>  * the graph, therefore wait in a coroutine queue.
>>  * The writer will then wake this coroutine once it is done.
>>  *
>>  * This lock cannot be taken recursively.
>>  */
>> void coroutine_fn bdrv_graph_co_rdlock(void);
> 
> What prevents it from being taken recursively when it's just a counter?
> (I do see however, that you can't take a reader lock while you have the
> writer lock or vice versa because it would deadlock.)
> 
I actually didn't add the assertion to prevent it from being recursive
yet, but I think it simplifies everything if it's not recursive

> Does this being a coroutine_fn mean that we would have to convert QMP
> command handlers to coroutines so that they can take the rdlock while
> they don't expect the graph to change? Or should we have a non-coroutine
> version, too, that works with AIO_WAIT_WHILE()?

Why convert the QMP command handlers? coroutine_fn was just to signal
that it can also be called from coroutines, like the ones created by the
blk_* API.
A reader does not have to be a coroutine. AIO_WAIT_WHILE is not
mandatory to allow it to finish, it helps to ensure progress in case
some reader is waiting for something, but other than that is not
necessary IMO.

> Or should this only be taken for very small pieces of code directly
> accessing the BdrvChild objects, and high-level users like QMP commands
> shouldn't even consider themselves readers?
> 

No I think if we focus on small pieces of code we end up having a
million lock/unlock pairs.

>> /*
>>  * bdrv_graph_rdunlock:
>>  * Read terminated, decrease the count of readers in the current aiocontext.
>>  * If the writer is waiting for reads to finish 

Re: [PATCH v2] hw/ppc: change indentation to spaces from TABs

2022-04-13 Thread Guo Zhi
Thanks for your help, I'll pay attention to the commit specification next time.

Guo

- Original Message -
From: "Daniel Henrique Barboza" 
To: "Guo Zhi" , "Cédric Le Goater" , 
"David Gibson" , "Greg Kurz" 
Cc: "qemu-ppc" , "qemu-devel@nongnu.org Developers" 

Sent: Wednesday, April 13, 2022 9:00:15 PM
Subject: Re: [PATCH v2] hw/ppc: change indentation to spaces from TABs

On 4/11/22 23:12, Guo Zhi wrote:
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/374
> 
> There are still some files in the QEMU PPC code base that use TABs for 
> indentation instead of using  spaces. The TABs should be replaced so that we 
> have a consistent coding style.
> 

I forgot to mention about the commit msg. Ideally we want the commit msg
to have shorter lines. If you use 'vim' you can do that by selecting
the commit msg and hitting 'G' and 'W' in non-insert mode.

I've amended it before queueing, so don't worry about it. I also moved the
'Resolves' tag to the end of the commit msg, which I also forgot to mention
about.

> Signed-off-by: Guo Zhi 
> ---


Reviewed-by: Daniel Henrique Barboza 



>   hw/ppc/ppc440_bamboo.c |  6 +++---
>   hw/ppc/spapr_rtas.c| 18 +-
>   include/hw/ppc/ppc.h   | 10 +-
>   3 files changed, 17 insertions(+), 17 deletions(-)
> 
> diff --git a/hw/ppc/ppc440_bamboo.c b/hw/ppc/ppc440_bamboo.c
> index 7fb620b9a0..5ec3a9a17f 100644
> --- a/hw/ppc/ppc440_bamboo.c
> +++ b/hw/ppc/ppc440_bamboo.c
> @@ -3,9 +3,9 @@
>*
>* Copyright 2007 IBM Corporation.
>* Authors:
> - *   Jerone Young 
> - *   Christian Ehrhardt 
> - *   Hollis Blanchard 
> + *  Jerone Young 
> + *  Christian Ehrhardt 
> + *  Hollis Blanchard 
>*
>* This work is licensed under the GNU GPL license version 2 or later.
>*
> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> index d7c04237fe..d58b65e88f 100644
> --- a/hw/ppc/spapr_rtas.c
> +++ b/hw/ppc/spapr_rtas.c
> @@ -474,16 +474,16 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
>   
>   if (spapr->fwnmi_machine_check_interlock != cpu->vcpu_id) {
>   /*
> -  * The vCPU that hit the NMI should invoke "ibm,nmi-interlock"
> + * The vCPU that hit the NMI should invoke "ibm,nmi-interlock"
>* This should be PARAM_ERROR, but Linux calls "ibm,nmi-interlock"
> -  * for system reset interrupts, despite them not being interlocked.
> -  * PowerVM silently ignores this and returns success here. Returning
> -  * failure causes Linux to print the error "FWNMI: nmi-interlock
> -  * failed: -3", although no other apparent ill effects, this is a
> -  * regression for the user when enabling FWNMI. So for now, match
> -  * PowerVM. When most Linux clients are fixed, this could be
> -  * changed.
> -  */
> + * for system reset interrupts, despite them not being interlocked.
> + * PowerVM silently ignores this and returns success here. Returning
> + * failure causes Linux to print the error "FWNMI: nmi-interlock
> + * failed: -3", although no other apparent ill effects, this is a
> + * regression for the user when enabling FWNMI. So for now, match
> + * PowerVM. When most Linux clients are fixed, this could be
> + * changed.
> + */
>   rtas_st(rets, 0, RTAS_OUT_SUCCESS);
>   return;
>   }
> diff --git a/include/hw/ppc/ppc.h b/include/hw/ppc/ppc.h
> index 364f165b4b..02af03ada2 100644
> --- a/include/hw/ppc/ppc.h
> +++ b/include/hw/ppc/ppc.h
> @@ -99,11 +99,11 @@ enum {
>   ARCH_MAC99_U3,
>   };
>   
> -#define FW_CFG_PPC_WIDTH (FW_CFG_ARCH_LOCAL + 0x00)
> -#define FW_CFG_PPC_HEIGHT(FW_CFG_ARCH_LOCAL + 0x01)
> -#define FW_CFG_PPC_DEPTH (FW_CFG_ARCH_LOCAL + 0x02)
> -#define FW_CFG_PPC_TBFREQ(FW_CFG_ARCH_LOCAL + 0x03)
> -#define FW_CFG_PPC_CLOCKFREQ (FW_CFG_ARCH_LOCAL + 0x04)
> +#define FW_CFG_PPC_WIDTH(FW_CFG_ARCH_LOCAL + 0x00)
> +#define FW_CFG_PPC_HEIGHT   (FW_CFG_ARCH_LOCAL + 0x01)
> +#define FW_CFG_PPC_DEPTH(FW_CFG_ARCH_LOCAL + 0x02)
> +#define FW_CFG_PPC_TBFREQ   (FW_CFG_ARCH_LOCAL + 0x03)
> +#define FW_CFG_PPC_CLOCKFREQ(FW_CFG_ARCH_LOCAL + 0x04)
>   #define FW_CFG_PPC_IS_KVM   (FW_CFG_ARCH_LOCAL + 0x05)
>   #define FW_CFG_PPC_KVM_HC   (FW_CFG_ARCH_LOCAL + 0x06)
>   #define FW_CFG_PPC_KVM_PID  (FW_CFG_ARCH_LOCAL + 0x07)



Re: [RFC PATCH 0/5] Removal of AioContext lock, bs->parents and ->children: proof of concept

2022-04-13 Thread Kevin Wolf
Am 13.04.2022 um 15:43 hat Emanuele Giuseppe Esposito geschrieben:
> So this is a more concrete and up-to-date header.
> 
> Few things to notice:
> - we have a list of AioContext. They are registered once an aiocontext
> is created, and deleted when it is destroyed.
> This list is helpful because each aiocontext can only modify its own
> number of readers, avoiding unnecessary cacheline bouncing
> 
> - if a coroutine changes aiocontext, it's ok with regards to the
> per-aiocontext reader counter. As long as the sum is correct, there's no
> issue. The problem comes only once the original aiocontext is deleted,
> and at that point we need to move the count it held to a shared global
> variable, otherwise we risk to lose track of readers.

So the idea is that we can do bdrv_graph_co_rdlock() in one thread and
the corresponding bdrv_graph_co_rdunlock() in a different thread?

Would the unlock somehow remember the original thread, or do you use the
"sum is correct" argument and allow negative counter values, so you can
end up having count +1 in A and -1 in B to represent "no active
readers"? If this happens, it's likely to happen many times, so do we
have to take integer overflows into account then?

> - All synchronization between the flags explained in this header is of
> course handled in the implementation. But for now it would be nice to
> have a feedback on the idea/API.
> 
> So in short we need:
> - per-aiocontext counter
> - global list of aiocontext
> - global additional reader counter (in case an aiocontext is deleted)
> - global CoQueue
> - global has_writer flag
> - global QemuMutex to protect the list access
> 
> Emanuele
> 
> #ifndef BLOCK_LOCK_H
> #define BLOCK_LOCK_H
> 
> #include "qemu/osdep.h"
> 
> /*
>  * register_aiocontext:
>  * Add AioContext @ctx to the list of AioContext.
>  * This list is used to obtain the total number of readers
>  * currently running the graph.
>  */
> void register_aiocontext(AioContext *ctx);
> 
> /*
>  * unregister_aiocontext:
>  * Removes AioContext @ctx from the list of AioContext.
>  */
> void unregister_aiocontext(AioContext *ctx);
> 
> /*
>  * bdrv_graph_wrlock:
>  * Modify the graph. Nobody else is allowed to access the graph.
>  * Set global has_writer to 1, so that subsequent readers will wait
>  * in a coroutine queue until the writer is done.
>  * Then keep track of the running readers by counting the total
>  * number of readers (sum of all aiocontext readers), and wait until
>  * they all finish with AIO_WAIT_WHILE.
>  */
> void bdrv_graph_wrlock(void);

Do we need a coroutine version that yields instead of using
AIO_WAIT_WHILE() or are we sure this will only ever be called from
non-coroutine contexts?

> /*
>  * bdrv_graph_wrunlock:
>  * Write finished, reset global has_writer to 0 and restart
>  * all readers that are waiting.
>  */
> void bdrv_graph_wrunlock(void);
> 
> /*
>  * bdrv_graph_co_rdlock:
>  * Read the bs graph. Increases the reader counter of the current
> aiocontext,
>  * and if has_writer is set, it means that the writer is modifying
>  * the graph, therefore wait in a coroutine queue.
>  * The writer will then wake this coroutine once it is done.
>  *
>  * This lock cannot be taken recursively.
>  */
> void coroutine_fn bdrv_graph_co_rdlock(void);

What prevents it from being taken recursively when it's just a counter?
(I do see however, that you can't take a reader lock while you have the
writer lock or vice versa because it would deadlock.)

Does this being a coroutine_fn mean that we would have to convert QMP
command handlers to coroutines so that they can take the rdlock while
they don't expect the graph to change? Or should we have a non-coroutine
version, too, that works with AIO_WAIT_WHILE()?

Or should this only be taken for very small pieces of code directly
accessing the BdrvChild objects, and high-level users like QMP commands
shouldn't even consider themselves readers?

> /*
>  * bdrv_graph_rdunlock:
>  * Read terminated, decrease the count of readers in the current aiocontext.
>  * If the writer is waiting for reads to finish (has_writer == 1), signal
>  * the writer that we are done via aio_wait_kick() to let it continue.
>  */
> void coroutine_fn bdrv_graph_co_rdunlock(void);
> 
> #endif /* BLOCK_LOCK_H */

I expect that in the final version, we might want to have some sugar
like a WITH_BDRV_GRAPH_RDLOCK_GUARD() macro, but obviously that doesn't
affect the fundamental design.

Kevin
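
To make the "sum of per-AioContext counters" accounting concrete, here
is a hedged sketch of what the writer side could evaluate inside
AIO_WAIT_WHILE(). The names and layout are assumptions, not part of the
proposed header; it relies on the list macros from qemu/queue.h:

typedef struct ReaderCount {
    int readers;                      /* modified only by its own context */
    QLIST_ENTRY(ReaderCount) next;
} ReaderCount;

static QLIST_HEAD(, ReaderCount) contexts = QLIST_HEAD_INITIALIZER(contexts);
static int orphaned_readers;          /* carried over from destroyed contexts */

static int total_readers(void)
{
    ReaderCount *c;
    int sum = orphaned_readers;

    QLIST_FOREACH(c, &contexts, next) {
        sum += c->readers;            /* a single entry may go negative */
    }
    return sum;                       /* only the global sum is meaningful */
}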




Re: [PATCH v7 12/17] vfio-user: IOMMU support for remote device

2022-04-13 Thread Igor Mammedov
On Thu, 31 Mar 2022 08:41:01 -0400
Peter Xu  wrote:

> On Thu, Mar 31, 2022 at 10:47:33AM +0100, Stefan Hajnoczi wrote:
> > On Wed, Mar 30, 2022 at 01:13:03PM -0400, Peter Xu wrote:  
> > > On Wed, Mar 30, 2022 at 05:08:24PM +0100, Stefan Hajnoczi wrote:  
> > > > On Wed, Mar 30, 2022 at 08:53:16AM -0400, Peter Xu wrote:  
> > > > > On Wed, Mar 30, 2022 at 11:04:24AM +0100, Stefan Hajnoczi wrote:  
> > > > > > This makes me wonder whether there is a deeper issue with the
> > > > > > pci_setup_iommu() API: the lack of per-device cleanup callbacks.
> > > > > > Per-device IOMMU resources should be freed when a device is hot
> > > > > > unplugged.
> > > > > > 
> > > > > > From what I can tell this is not the case today:
> > > > > > 
> > > > > > - hw/i386/intel_iommu.c:vtd_find_add_as() allocates and adds device
> > > > > >   address spaces but I can't find where they are removed and freed.
> > > > > >   VTDAddressSpace instances pointed to from vtd_bus->dev_as[] are 
> > > > > > leaked.
> > > > > > 
> > > > > > - hw/i386/amd_iommu.c has similar leaks.  
> > > > > 
> > > > > AFAICT it's because there's no device-specific data cached in the
> > > > > per-device IOMMU address space, at least so far.  IOW, all the data
> > > > > structures allocated here can be re-used when a new device is plugged 
> > > > > in
> > > > > after the old device unplugged.
> > > > > 
> > > > > It's definitely not ideal since after unplug (and before a new device
> > > > > plugged in) the resource is not needed at all so it's kind of wasted, 
> > > > > but
> > > > > it should work functionally.  If to achieve that, some iommu_unplug() 
> > > > > or
> > > > > iommu_cleanup() hook sounds reasonable.  
> > > > 
> > > > I guess the question is whether PCI busses can be hotplugged with
> > > > IOMMUs. If yes, then there is a memory leak that matters for
> > > > intel_iommu.c and amd_iommu.c.  
> > > 
> > > It can't, and we only support one vIOMMU so far for both (commit
> > > 1b3bf13890fd849b26).  Thanks,  
> > 
> > I see, thanks!
> > 
> > Okay, summarizing options for the vfio-user IOMMU:
> > 
> > 1. Use the same singleton approach as existing IOMMUs where the
> >MemoryRegion/AddressSpace are never freed. Don't bother deleting.
> > 
> > 2. Keep the approach in this patch where vfio-user code manually calls a
> >custom delete function (not part of the pci_setup_iommu() API). This
> >is slightly awkward to do without global state and that's what
> >started this discussion.
> > 
> > 3. Introduce an optional pci_setup_iommu() callback:
> > 
> >typedef void (*PCIIOMMUDeviceUnplug)(PCIBus *bus, void *opaque, int devfn);
> > 
> >Solves the awkwardness of option #2. Not needed by existing IOMMU
> >devices.  
> 
> Looks all workable to me.  One tiny thing is if we'd like 3) we may want to
> pass over the PCIDevice* too because in this case IIUC we'd need to double
> check the device class before doing anything - we may not want to call the
> vfio-user callbacks for general emulated devices under the same pci bus.
> 
> I think we could also fetch that from PCIBus.devices[devfn] but that's just
> not as obvious.
> 
> Option 4) is as mentioned previously, that we add another device unplug
> hook that can be registered per-device.  I just didn't think thoroughly on
Can you expand on why a per-device hook is needed?

> how it would interact with the current HotplugHandler design yet.. it looks
> quite similar but so far it's either for the machine type or pci bus, not
> capable of registering on one single device (and it's always a mystery to
> me why we'd rather ignore the per-bus hook if the machine hook
> existed.. that's in qdev_get_hotplug_handler).

The machine hook is there mainly for bus-less devices; if it's not
defined, the code will fall back to the bus handler if one exists.

However, the machine hook can also be used to override the default
hotplug chain to implement a non-trivial plug/unplug flow.
for example see pc_get_hotplug_handler(), corresponding
pc_machine_device_[pre_plug|plug|unplug...]_cb() where
plug/unplug chain is altered for some PCI devices types.
Perhaps the same can be done for vfio.

> 
> Copying Igor too.
> 
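
A hedged sketch of option 3 above: both the hook and the extended setup
function are proposals only (nothing like pci_setup_iommu_with_unplug()
exists in QEMU), and the PCIDevice * parameter reflects Peter's
suggestion:

typedef void (*PCIIOMMUDeviceUnplug)(PCIBus *bus, PCIDevice *pci_dev,
                                     void *opaque, int devfn);

/* hypothetical extension of the existing pci_setup_iommu() API */
void pci_setup_iommu_with_unplug(PCIBus *bus, PCIIOMMUFunc fn,
                                 PCIIOMMUDeviceUnplug unplug, void *opaque);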




Re: FSFreeze on Windows VM

2022-04-13 Thread Konstantin Kostiuk
Hi Shelly,

You can use the guest-ping command to check that the GA is alive. This
command should always work.

Best Regards,
Konstantin Kostiuk.


On Wed, Apr 13, 2022 at 5:21 PM Shelly Kagan  wrote:

> Hi Konstantin,
> Thanks for the quick response.
> I will look into getting a newer GA version.
> Regarding the GA not available, is it relevant to the newer versions? Is
> there a way to make sure the GA is available when running the freeze? when
> it is not available it takes a lot of time to get a response back and it
> causes the freeze to delay.
> Best Regards,
> Shelly
>
> On Wed, Apr 13, 2022 at 4:51 PM Konstantin Kostiuk 
> wrote:
>
>> Hi Shelly,
>>
>> Thanks for your answer. Your version of GuestAgent is too old. The latest
>> version is 103.0.0.
>> Version 100.0.0 has a bug related to wrong error messages. When you try
>> to freeze FS, GA can return an error that the GA is not available.
>>
>> Please try again with the latest version. I think it should work fine. If
>> you still will have problems, will try to debug them.
>>
>> Best Regards,
>> Konstantin Kostiuk.
>>
>>
>> On Wed, Apr 13, 2022 at 10:03 AM Shelly Kagan  wrote:
>>
>>> Hi Konstantin,
>>>
>>> The Guest Agent version
>>> "guestAgentVersion": "100.0.0",
>>> I'm running with a 2019 windows image which I understand the GA is part
>>> of the virtio drivers.
>>> Don't know if the following info also helps but:
>>>   "hostname": "WIN-CUCKQ65DH6K",
>>>   "os": {
>>> "name": "Microsoft Windows",
>>> "kernelRelease": "17763",
>>> "version": "Microsoft Windows Server 2019",
>>> "prettyName": "Windows Server 2019 Standard",
>>> "versionId": "2019",
>>> "kernelVersion": "10.0",
>>> "machine": "x86_64",
>>> "id": "mswindows"
>>>   },
>>>
>>> I think what is more important to me is the GA being not responsive for
>>> the freeze call. I'm using the fsfreeze in a feature I'm implementing, and
>>> since in windows there is a 10sec timeout because of the VSS the fact that
>>> the freeze call returns error that the GA is not available plus that when
>>> it's not available it takes much longer for the freeze call to return is
>>> really time sensitive.
>>>
>>> Thanks
>>>
>>> On Wed, Apr 6, 2022 at 2:29 PM Konstantin Kostiuk 
>>> wrote:
>>>
 Hi Shelly,

 Can you provide your version of Guest Agent? Is it built from upstream
 or some VirtIO-Win release?
 Previously we had some issues related to wrong error messages from
 Guest Agent.

 Best Regards,
 Konstantin Kostiuk.


 On Tue, Mar 29, 2022 at 6:24 PM Shelly Kagan  wrote:

> Hi all,
> I'm having some weird behavior with fsfreeze in windows VM.
> I run the fsfreeze and it returns:
> `error: Guest agent is not responding: Guest agent not available for
> now`
>
> but checking the status returned frozen, rerunning it again returns
> that the command is not enabled (expected since the freeze occurs)
>
> I checked the fsstatus and it keeps returning `frozen` even after 10
> seconds (from my understanding the freeze in windows vm is limited to 10
> seconds by default by the VSS. No way to change this default from my
> understanding, is it true?) Shouldn't the status return thawed if the VSS
> no longer keeps the freeze state?
>
> After a minute I did the thaw and it returned error:
> `error: internal error: unable to execute QEMU agent command
> 'guest-fsfreeze-thaw': couldn't hold writes: fsfreeze is limited up to 10
> seconds:`
>
> but the fsstatus changed to thawed after that call.
>
> My questions are:
> 1. Why would the fsfreeze return error about the guest agent but still
> freeze the fs?
> 2. Why would the guest agent not respond, is there a way to make sure
> it is available before the freeze command? (running the fsstatus command
> before it returned without issues..)
> 3. Is it expected that the fsstatus will return frozen even if
> possibly VSS has already thawed? and that the thaw fails but the status do
> change after to thawed?
>
> Thanks for taking the time to respond and help,
> --
>
> Shelly Kagan
>
> Senior Software Engineer
>
> Red Hat 
> 
>

>>>
>>> --
>>>
>>> Shelly Kagan
>>>
>>> Senior Software Engineer
>>>
>>> Red Hat 
>>> 
>>>
>>
>
> --
>
> Shelly Kagan
>
> Senior Software Engineer
>
> Red Hat 
> 
>
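
A hedged example of Konstantin's suggestion, using libvirt's agent
passthrough (the domain name is illustrative):

virsh qemu-agent-command win2019 '{"execute":"guest-ping"}'

# only freeze once the ping returns {"return":{}}
virsh qemu-agent-command win2019 '{"execute":"guest-fsfreeze-freeze"}'
virsh qemu-agent-command win2019 '{"execute":"guest-fsfreeze-status"}'
virsh qemu-agent-command win2019 '{"execute":"guest-fsfreeze-thaw"}'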


Re: [PATCH v7 12/17] vfio-user: IOMMU support for remote device

2022-04-13 Thread Igor Mammedov
On Fri, 25 Mar 2022 15:19:41 -0400
Jagannathan Raman  wrote:

> Assign separate address space for each device in the remote processes.
> 
> Signed-off-by: Elena Ufimtseva 
> Signed-off-by: John G Johnson 
> Signed-off-by: Jagannathan Raman 
> ---
>  include/hw/remote/iommu.h | 18 
>  hw/remote/iommu.c | 95 +++
>  MAINTAINERS   |  2 +
>  hw/remote/meson.build |  1 +
>  4 files changed, 116 insertions(+)
>  create mode 100644 include/hw/remote/iommu.h
>  create mode 100644 hw/remote/iommu.c
> 
> diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
> new file mode 100644
> index 00..8f850400f1
> --- /dev/null
> +++ b/include/hw/remote/iommu.h
> @@ -0,0 +1,18 @@
> +/**
> + * Copyright © 2022 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#ifndef REMOTE_IOMMU_H
> +#define REMOTE_IOMMU_H
> +
> +#include "hw/pci/pci_bus.h"
> +
> +void remote_configure_iommu(PCIBus *pci_bus);
> +
> +void remote_iommu_del_device(PCIDevice *pci_dev);
> +
> +#endif
> diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
> new file mode 100644
> index 00..13f329b45d
> --- /dev/null
> +++ b/hw/remote/iommu.c
> @@ -0,0 +1,95 @@
> +/**
> + * IOMMU for remote device
> + *
> + * Copyright © 2022 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +
> +#include "hw/remote/iommu.h"
> +#include "hw/pci/pci_bus.h"
> +#include "hw/pci/pci.h"
> +#include "exec/memory.h"
> +#include "exec/address-spaces.h"
> +#include "trace.h"
> +
> +struct RemoteIommuElem {
> +AddressSpace  as;
> +MemoryRegion  mr;
> +};
> +
> +struct RemoteIommuTable {
> +QemuMutex lock;
> +GHashTable *elem_by_bdf;
> +} remote_iommu_table;
> +
> +#define INT2VOIDP(i) (void *)(uintptr_t)(i)
> +
> +static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
> +  void *opaque, int devfn)
> +{
> +struct RemoteIommuTable *iommu_table = opaque;
> +struct RemoteIommuElem *elem = NULL;
> +int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_bus), devfn);
> +
> +elem = g_hash_table_lookup(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf));
> +
> +if (!elem) {
> +g_autofree char *mr_name = g_strdup_printf("vfu-ram-%d", pci_bdf);
> +g_autofree char *as_name = g_strdup_printf("vfu-as-%d", pci_bdf);
> +
> +elem = g_malloc0(sizeof(struct RemoteIommuElem));
> +
> +memory_region_init(&elem->mr, NULL, mr_name, UINT64_MAX);
goes here:
   memory_region_do_init()
       if (!owner) {
           owner = container_get(qdev_get_machine(), "/unattached");
       }

then

> +address_space_init(&elem->as, &elem->mr, as_name);
> +
> +qemu_mutex_lock(&iommu_table->lock);
> +g_hash_table_insert(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf), elem);
> +qemu_mutex_unlock(&iommu_table->lock);
> +}
> +
> +return >as;
> +}
> +
> +static void remote_iommu_del_elem(gpointer data)
> +{
> +struct RemoteIommuElem *elem = data;
> +
> +g_assert(elem);
> +
> +memory_region_unref(&elem->mr);

here we call
  object_unref(mr->owner); 
leaving dangling pointer in owner '(qdev_get_machine(), "/unattached")'
it doesn't look correct

I thought that memory_region_unref() should always be paired with
memory_region_ref().

and looking at memory_region_init(...owner...) history it looks like
owner-less (NULL) regions are not meant to be deleted ever.

> +    address_space_destroy(&elem->as);
> +
> +    g_free(elem);
> +}
> +
> +void remote_iommu_del_device(PCIDevice *pci_dev)
> +{
> +    int pci_bdf;
> +
> +    if (!remote_iommu_table.elem_by_bdf || !pci_dev) {
> +        return;
> +    }
> +
> +    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
> +
> +    qemu_mutex_lock(&remote_iommu_table.lock);
> +    g_hash_table_remove(remote_iommu_table.elem_by_bdf, INT2VOIDP(pci_bdf));
> +    qemu_mutex_unlock(&remote_iommu_table.lock);
> +}
> +
> +void remote_configure_iommu(PCIBus *pci_bus)
> +{
> +    if (!remote_iommu_table.elem_by_bdf) {
> +        remote_iommu_table.elem_by_bdf =
> +            g_hash_table_new_full(NULL, NULL, NULL, remote_iommu_del_elem);
> +        qemu_mutex_init(&remote_iommu_table.lock);
> +    }
> +
> +    pci_setup_iommu(pci_bus, remote_iommu_find_add_as, &remote_iommu_table);
> +}
> diff --git a/MAINTAINERS b/MAINTAINERS
> index e7b0297a63..21694a9698 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -3599,6 +3599,8 @@ F: hw/remote/iohub.c
>  F: include/hw/remote/iohub.h
>  F: subprojects/libvfio-user
>  F: hw/remote/vfio-user-obj.c
> +F: hw/remote/iommu.c
> +F: 
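
As a side note, once remote_configure_iommu() has installed the hook,
the per-device address space is resolved through the generic PCI helper,
ending up in remote_iommu_find_add_as() above. A minimal consumer sketch
(illustrative only; pci_dev/addr/buf/len are placeholders, and the
dma_memory_read() signature is the QEMU 7.0 one with explicit attrs):

    AddressSpace *as = pci_device_iommu_address_space(pci_dev);

    /* DMA now goes through the per-BDF address space created above */
    dma_memory_read(as, addr, buf, len, MEMTXATTRS_UNSPECIFIED);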

Re: FSFreeze on Windows VM

2022-04-13 Thread Shelly Kagan
Hi Konstantin,
Thanks for the quick response.
I will look into getting a newer GA version.
Regarding the GA not being available: does that issue also affect the
newer versions? Is there a way to make sure the GA is available before
running the freeze? When it is not available, it takes a lot of time to
get a response back, which delays the freeze.
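
Something like the probe-first sequence below is what I have in mind (a
sketch only: "WIN-VM" and the 5-second timeout are placeholders of mine,
while guest-ping and the guest-fsfreeze-* commands are the standard
qemu-ga ones):

  # fail fast if the agent is not responding
  virsh qemu-agent-command WIN-VM --timeout 5 '{"execute":"guest-ping"}'

  # freeze only once the probe succeeds, and always pair it with a thaw
  virsh qemu-agent-command WIN-VM '{"execute":"guest-fsfreeze-freeze"}'
  virsh qemu-agent-command WIN-VM '{"execute":"guest-fsfreeze-status"}'
  virsh qemu-agent-command WIN-VM '{"execute":"guest-fsfreeze-thaw"}'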
Best Regards,
Shelly

On Wed, Apr 13, 2022 at 4:51 PM Konstantin Kostiuk 
wrote:

> Hi Shelly,
>
> Thanks for your answer. Your version of GuestAgent is too old. The latest
> version is 103.0.0.
> Version 100.0.0 has a bug related to wrong error messages: when you try to
> freeze the FS, the GA can return an error saying that the GA is not
> available.
>
> Please try again with the latest version. I think it should work fine. If
> you still have problems, we will try to debug them.
>
> Best Regards,
> Konstantin Kostiuk.
>
>
> On Wed, Apr 13, 2022 at 10:03 AM Shelly Kagan  wrote:
>
>> Hi Konstantin,
>>
>> The Guest Agent version
>> "guestAgentVersion": "100.0.0",
>> I'm running a Windows 2019 image, where I understand the GA is part
>> of the virtio drivers.
>> I don't know if the following info also helps, but:
>>   "hostname": "WIN-CUCKQ65DH6K",
>>   "os": {
>> "name": "Microsoft Windows",
>> "kernelRelease": "17763",
>> "version": "Microsoft Windows Server 2019",
>> "prettyName": "Windows Server 2019 Standard",
>> "versionId": "2019",
>> "kernelVersion": "10.0",
>> "machine": "x86_64",
>> "id": "mswindows"
>>   },
>>
>> I think what matters more to me is the GA not responding to the freeze
>> call. I'm using fsfreeze in a feature I'm implementing, and since Windows
>> imposes a 10-second timeout because of VSS, the whole flow is very time
>> sensitive: the freeze call returns an error that the GA is not available,
>> and when it is not available the call also takes much longer to return.
>>
>> Thanks
>>
>> On Wed, Apr 6, 2022 at 2:29 PM Konstantin Kostiuk 
>> wrote:
>>
>>> Hi Shelly,
>>>
>>> Can you provide your version of Guest Agent? Is it built from upstream
>>> or some VirtIO-Win release?
>>> Previously we had some issues related to wrong error messages from Guest
>>> Agent.
>>>
>>> Best Regards,
>>> Konstantin Kostiuk.
>>>
>>>
>>> On Tue, Mar 29, 2022 at 6:24 PM Shelly Kagan  wrote:
>>>
 Hi all,
 I'm having some weird behavior with fsfreeze in windows VM.
 I run the fsfreeze and it returns:
 `error: Guest agent is not responding: Guest agent not available for
 now`

 but checking the status returned frozen, and rerunning the freeze returns
 that the command is not enabled (expected, since the freeze took effect)

 I checked the fsstatus and it keeps returning `frozen` even after 10
 seconds (from my understanding, the freeze in a Windows VM is limited to
 10 seconds by default by VSS, with no way to change this default; is that
 true?). Shouldn't the status return thawed if VSS no longer holds the
 freeze state?

 After a minute I ran the thaw and it returned an error:
 `error: internal error: unable to execute QEMU agent command
 'guest-fsfreeze-thaw': couldn't hold writes: fsfreeze is limited up to 10
 seconds:`

 but the fsstatus changed to thawed after that call.

 My questions are:
 1. Why would the fsfreeze return an error about the guest agent but still
 freeze the FS?
 2. Why would the guest agent not respond? Is there a way to make sure
 it is available before the freeze command? (Running the fsstatus command
 just before returned without issues.)
 3. Is it expected that the fsstatus returns frozen even though VSS may
 have already thawed, and that the thaw fails but the status does change
 to thawed afterwards?

 Thanks for taking the time to respond and help,
 --

 Shelly Kagan

 Senior Software Engineer

 Red Hat

>>>
>>
>> --
>>
>> Shelly Kagan
>>
>> Senior Software Engineer
>>
>> Red Hat
>>
>

-- 

Shelly Kagan

Senior Software Engineer

Red Hat


Re: [PATCH v2 12/39] exec/translator: Pass the locked filepointer to disas_log hook

2022-04-13 Thread Alex Bennée


Richard Henderson  writes:

> We have fetched and locked the logfile in translator_loop.
> Pass the filepointer down to the disas_log hook so that it
> need not be fetched and locked again.
>
> Signed-off-by: Richard Henderson 

Reviewed-by: Alex Bennée 

-- 
Alex Bennée
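
For reference, the resulting hook shape is roughly the following (a
sketch reconstructed from the commit message above, abridged, not the
exact diff):

    /* include/exec/translator.h */
    typedef struct TranslatorOps {
        ...
        /* receives the already-locked log file instead of re-fetching it */
        void (*disas_log)(const DisasContextBase *db, CPUState *cpu,
                          FILE *logfile);
    } TranslatorOps;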


