Re: Unmapping KVM Guest Memory from Host Kernel

2024-03-09 Thread Mike Rapoport
On Fri, Mar 08, 2024 at 03:22:50PM -0800, Sean Christopherson wrote:
> On Fri, Mar 08, 2024, James Gowans wrote:
> > However, memfd_secret doesn’t work out of the box for KVM guest memory; the
> > main reason seems to be that the GUP path is intentionally disabled for
> > memfd_secret, so if we use a memfd_secret backed VMA for a memslot then
> > KVM is not able to fault the memory in. If it’s been pre-faulted in by
> > userspace then it seems to work.
> 
> Huh, that _shouldn't_ work.  The folio_is_secretmem() in gup_pte_range() is
> supposed to prevent the "fast gup" path from getting secretmem pages.

I suspect this works because KVM only calls gup on faults, and if the memory
was pre-faulted via memfd_secret there won't be any faults and hence no gups
from KVM.
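
For illustration only (not code from this thread): a minimal userspace sketch
of the "pre-faulted by userspace" case described above, assuming a kernel with
secretmem enabled and SYS_memfd_secret defined in the system headers:

#define _GNU_SOURCE
#include <stddef.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static void *prefault_secretmem(size_t size)
{
    int fd = syscall(SYS_memfd_secret, 0);
    if (fd < 0 || ftruncate(fd, size) < 0) {
        return NULL;
    }

    unsigned char *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
                              MAP_SHARED, fd, 0);
    if (mem == MAP_FAILED) {
        return NULL;
    }

    /* Touch every page so the PTEs are populated before the range is
     * handed to KVM as a memslot, i.e. KVM never has to fault it in. */
    for (size_t off = 0; off < size; off += 4096) {
        mem[off] = 0;
    }
    return mem;
}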
 
> > With this in mind, what’s the best way to solve getting guest RAM out of
> > the direct map? Is memfd_secret integration with KVM the way to go, or
> > should we build a solution on top of guest_memfd, for example via some
> > flag that causes it to leave memory in the host userspace’s page tables,
> > but removes it from the direct map? 
> 
> memfd_secret obviously gets you a PoC much faster, but in the long term I'm 
> quite
> sure you'll be fighting memfd_secret all the way.  E.g. it's not dumpable, it
> deliberately allocates at 4KiB granularity (though I suspect the bug you found
> means that it can be inadvertently mapped with 2MiB hugepages), it has no line
> of sight to taking userspace out of the equation, etc.
> 
> With guest_memfd on the other hand, everyone contributing to and maintaining 
> it
> has goals that are *very* closely aligned with what you want to do.

I agree with Sean, guest_memfd seems like a better interface to use. It's
integrated by design with KVM and removing guest memory from the direct map
looks like a natural enhancement to guest_memfd. 

Unless I'm missing something, for a fast-and-dirty POC it'll be a one-liner
that adds set_memory_np() to kvm_gmem_get_folio() and then figuring out
what to do with virtio :)
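
For reference, a rough sketch of what that one-liner could look like (a hedged
illustration, not a tested patch: the exact placement inside
kvm_gmem_get_folio() is an assumption, set_memory_np() is the x86 helper, and
nothing here restores the direct map when the folio is freed):

/*
 * Hypothetical POC: right after guest_memfd allocates a folio, drop its
 * pages from the kernel direct map.
 */
set_memory_np((unsigned long)folio_address(folio), folio_nr_pages(folio));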

-- 
Sincerely yours,
Mike.



[PATCH v3 1/2] vhost: Add worker backend callouts

2023-12-04 Thread Mike Christie
This adds the vhost backend callouts for the worker ioctls added in the
6.4 linux kernel commit:

c1ecd8e95007 ("vhost: allow userspace to create workers")

Signed-off-by: Mike Christie 
Reviewed-by: Stefano Garzarella 
Reviewed-by: Stefan Hajnoczi 

---
 hw/virtio/vhost-backend.c | 28 
 include/hw/virtio/vhost-backend.h | 14 ++
 2 files changed, 42 insertions(+)

diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
index 17f3fc6a0823..833804dd40f2 100644
--- a/hw/virtio/vhost-backend.c
+++ b/hw/virtio/vhost-backend.c
@@ -158,6 +158,30 @@ static int vhost_kernel_set_vring_busyloop_timeout(struct 
vhost_dev *dev,
 return vhost_kernel_call(dev, VHOST_SET_VRING_BUSYLOOP_TIMEOUT, s);
 }
 
+static int vhost_kernel_new_worker(struct vhost_dev *dev,
+   struct vhost_worker_state *worker)
+{
+return vhost_kernel_call(dev, VHOST_NEW_WORKER, worker);
+}
+
+static int vhost_kernel_free_worker(struct vhost_dev *dev,
+struct vhost_worker_state *worker)
+{
+return vhost_kernel_call(dev, VHOST_FREE_WORKER, worker);
+}
+
+static int vhost_kernel_attach_vring_worker(struct vhost_dev *dev,
+struct vhost_vring_worker *worker)
+{
+return vhost_kernel_call(dev, VHOST_ATTACH_VRING_WORKER, worker);
+}
+
+static int vhost_kernel_get_vring_worker(struct vhost_dev *dev,
+ struct vhost_vring_worker *worker)
+{
+return vhost_kernel_call(dev, VHOST_GET_VRING_WORKER, worker);
+}
+
 static int vhost_kernel_set_features(struct vhost_dev *dev,
  uint64_t features)
 {
@@ -313,6 +337,10 @@ const VhostOps kernel_ops = {
 .vhost_set_vring_err = vhost_kernel_set_vring_err,
 .vhost_set_vring_busyloop_timeout =
 vhost_kernel_set_vring_busyloop_timeout,
+.vhost_get_vring_worker = vhost_kernel_get_vring_worker,
+.vhost_attach_vring_worker = vhost_kernel_attach_vring_worker,
+.vhost_new_worker = vhost_kernel_new_worker,
+.vhost_free_worker = vhost_kernel_free_worker,
 .vhost_set_features = vhost_kernel_set_features,
 .vhost_get_features = vhost_kernel_get_features,
 .vhost_set_backend_cap = vhost_kernel_set_backend_cap,
diff --git a/include/hw/virtio/vhost-backend.h 
b/include/hw/virtio/vhost-backend.h
index a86d103f8245..70c2e8ffeee5 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -45,6 +45,8 @@ struct vhost_memory;
 struct vhost_vring_file;
 struct vhost_vring_state;
 struct vhost_vring_addr;
+struct vhost_vring_worker;
+struct vhost_worker_state;
 struct vhost_scsi_target;
 struct vhost_iotlb_msg;
 struct vhost_virtqueue;
@@ -85,6 +87,14 @@ typedef int (*vhost_set_vring_err_op)(struct vhost_dev *dev,
   struct vhost_vring_file *file);
 typedef int (*vhost_set_vring_busyloop_timeout_op)(struct vhost_dev *dev,
struct vhost_vring_state 
*r);
+typedef int (*vhost_attach_vring_worker_op)(struct vhost_dev *dev,
+struct vhost_vring_worker *worker);
+typedef int (*vhost_get_vring_worker_op)(struct vhost_dev *dev,
+ struct vhost_vring_worker *worker);
+typedef int (*vhost_new_worker_op)(struct vhost_dev *dev,
+   struct vhost_worker_state *worker);
+typedef int (*vhost_free_worker_op)(struct vhost_dev *dev,
+struct vhost_worker_state *worker);
 typedef int (*vhost_set_features_op)(struct vhost_dev *dev,
  uint64_t features);
 typedef int (*vhost_get_features_op)(struct vhost_dev *dev,
@@ -172,6 +182,10 @@ typedef struct VhostOps {
 vhost_set_vring_call_op vhost_set_vring_call;
 vhost_set_vring_err_op vhost_set_vring_err;
 vhost_set_vring_busyloop_timeout_op vhost_set_vring_busyloop_timeout;
+vhost_new_worker_op vhost_new_worker;
+vhost_free_worker_op vhost_free_worker;
+vhost_get_vring_worker_op vhost_get_vring_worker;
+vhost_attach_vring_worker_op vhost_attach_vring_worker;
 vhost_set_features_op vhost_set_features;
 vhost_get_features_op vhost_get_features;
 vhost_set_backend_cap_op vhost_set_backend_cap;
-- 
2.34.1




[PATCH v3 0/2] vhost-scsi: Support worker ioctls

2023-12-04 Thread Mike Christie
The following patches allow users to configure the vhost worker threads
for vhost-scsi. With vhost-net we get a worker thread per rx/tx virtqueue
pair, but for vhost-scsi we get one worker for all virtqueues. This
becomes a bottleneck after 2 queues are used.

In the upstream linux kernel commit:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/drivers/vhost/vhost.c?id=c1ecd8e9500797748ae4f79657971955d452d69d

we enabled the vhost layer to be able to create a worker thread and
attach it to a virtqueue.

This patchset adds support to vhost-scsi to use these ioctls so we are
no longer limited to the single worker.
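
For reference, a hedged sketch of the raw kernel interface these ioctls expose
(struct and ioctl names as used by the commit above; error handling reduced to
returning a negative errno):

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Create a new vhost worker and bind virtqueue `index` to it. */
static int bind_vq_to_new_worker(int vhost_fd, unsigned int index)
{
    struct vhost_worker_state worker = { 0 };
    struct vhost_vring_worker vq_worker = { 0 };

    if (ioctl(vhost_fd, VHOST_NEW_WORKER, &worker) < 0) {
        return -errno;    /* ENOTTY: kernel lacks the worker ioctls */
    }

    vq_worker.index = index;
    vq_worker.worker_id = worker.worker_id;
    if (ioctl(vhost_fd, VHOST_ATTACH_VRING_WORKER, &vq_worker) < 0) {
        ioctl(vhost_fd, VHOST_FREE_WORKER, &worker);
        return -errno;
    }
    return 0;
}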

v3:
- Warn user if they have set worker_per_virtqueue=true but the kernel
doesn't support it.
v2:
- Make config option a bool instead of an int.





[PATCH v3 2/2] vhost-scsi: Add support for a worker thread per virtqueue

2023-12-04 Thread Mike Christie
This adds support for vhost-scsi to be able to create a worker thread
per virtqueue. Right now for vhost-net we get a worker thread per
tx/rx virtqueue pair which scales nicely as we add more virtqueues and
CPUs, but for scsi we get the single worker thread that's shared by all
virtqueues. When trying to send IO to more than 2 virtqueues the single
thread becomes a bottleneck.

This patch adds a new setting, worker_per_virtqueue, which can be set
to:

false: Existing behavior where we get the single worker thread.
true: Create a worker per IO virtqueue.

Signed-off-by: Mike Christie 
Reviewed-by: Stefan Hajnoczi 

---
 hw/scsi/vhost-scsi.c| 62 +
 include/hw/virtio/virtio-scsi.h |  1 +
 2 files changed, 63 insertions(+)

diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index 3126df9e1d9d..08aa7534df51 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -165,6 +165,59 @@ static const VMStateDescription vmstate_virtio_vhost_scsi 
= {
 .pre_save = vhost_scsi_pre_save,
 };
 
+static int vhost_scsi_set_workers(VHostSCSICommon *vsc, bool per_virtqueue)
+{
+struct vhost_dev *dev = &vsc->dev;
+struct vhost_vring_worker vq_worker;
+struct vhost_worker_state worker;
+int i, ret;
+
+/* Use default worker */
+if (!per_virtqueue || dev->nvqs == VHOST_SCSI_VQ_NUM_FIXED + 1) {
+return 0;
+}
+
+/*
+ * ctl/evt share the first worker since it will be rare for them
+ * to send cmds while IO is running.
+ */
+for (i = VHOST_SCSI_VQ_NUM_FIXED + 1; i < dev->nvqs; i++) {
+memset(&worker, 0, sizeof(worker));
+
+ret = dev->vhost_ops->vhost_new_worker(dev, &worker);
+if (ret == -ENOTTY) {
+/*
+ * worker ioctls are not implemented so just ignore and
+ * continue device setup.
+ */
+warn_report("vhost-scsi: Backend supports a single worker. "
+"Ignoring worker_per_virtqueue=true setting.");
+ret = 0;
+break;
+} else if (ret) {
+break;
+}
+
+memset(&vq_worker, 0, sizeof(vq_worker));
+vq_worker.worker_id = worker.worker_id;
+vq_worker.index = i;
+
+ret = dev->vhost_ops->vhost_attach_vring_worker(dev, &vq_worker);
+if (ret == -ENOTTY) {
+/*
+ * It's a bug for the kernel to have supported the worker creation
+ * ioctl but not attach.
+ */
+dev->vhost_ops->vhost_free_worker(dev, &worker);
+break;
+} else if (ret) {
+break;
+}
+}
+
+return ret;
+}
+
 static void vhost_scsi_realize(DeviceState *dev, Error **errp)
 {
 VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
@@ -232,6 +285,13 @@ static void vhost_scsi_realize(DeviceState *dev, Error 
**errp)
 goto free_vqs;
 }
 
+ret = vhost_scsi_set_workers(vsc, vs->conf.worker_per_virtqueue);
+if (ret < 0) {
+error_setg(errp, "vhost-scsi: vhost worker setup failed: %s",
+   strerror(-ret));
+goto free_vqs;
+}
+
 /* At present, channel and lun both are 0 for bootable vhost-scsi disk */
 vsc->channel = 0;
 vsc->lun = 0;
@@ -297,6 +357,8 @@ static Property vhost_scsi_properties[] = {
  VIRTIO_SCSI_F_T10_PI,
  false),
 DEFINE_PROP_BOOL("migratable", VHostSCSICommon, migratable, false),
+DEFINE_PROP_BOOL("worker_per_virtqueue", VirtIOSCSICommon,
+ conf.worker_per_virtqueue, false),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
index 779568ab5d28..0e9a1867665e 100644
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -51,6 +51,7 @@ typedef struct virtio_scsi_config VirtIOSCSIConfig;
 struct VirtIOSCSIConf {
 uint32_t num_queues;
 uint32_t virtqueue_size;
+bool worker_per_virtqueue;
 bool seg_max_adjust;
 uint32_t max_sectors;
 uint32_t cmd_per_lun;
-- 
2.34.1




Re: [PATCH v2 2/2] vhost-scsi: Add support for a worker thread per virtqueue

2023-11-29 Thread Mike Christie
On 11/29/23 3:30 AM, Stefano Garzarella wrote:
> On Sun, Nov 26, 2023 at 06:28:34PM -0600, Mike Christie wrote:
>> This adds support for vhost-scsi to be able to create a worker thread
>> per virtqueue. Right now for vhost-net we get a worker thread per
>> tx/rx virtqueue pair which scales nicely as we add more virtqueues and
>> CPUs, but for scsi we get the single worker thread that's shared by all
>> virtqueues. When trying to send IO to more than 2 virtqueues the single
>> thread becomes a bottleneck.
>>
>> This patch adds a new setting, workers_per_virtqueue, which can be set
>> to:
>>
>> false: Existing behavior where we get the single worker thread.
>> true: Create a worker per IO virtqueue.
>>
>> Signed-off-by: Mike Christie 
>> ---
>> hw/scsi/vhost-scsi.c    | 60 +
>> include/hw/virtio/virtio-scsi.h |  1 +
>> 2 files changed, 61 insertions(+)
>>
>> diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
>> index 3126df9e1d9d..77eef9474c23 100644
>> --- a/hw/scsi/vhost-scsi.c
>> +++ b/hw/scsi/vhost-scsi.c
>> @@ -165,6 +165,57 @@ static const VMStateDescription 
>> vmstate_virtio_vhost_scsi = {
>>     .pre_save = vhost_scsi_pre_save,
>> };
>>
>> +static int vhost_scsi_set_workers(VHostSCSICommon *vsc, bool per_virtqueue)
>> +{
>> +    struct vhost_dev *dev = &vsc->dev;
>> +    struct vhost_vring_worker vq_worker;
>> +    struct vhost_worker_state worker;
>> +    int i, ret;
>> +
>> +    /* Use default worker */
>> +    if (!per_virtqueue || dev->nvqs == VHOST_SCSI_VQ_NUM_FIXED + 1) {
>> +    return 0;
>> +    }
>> +
>> +    /*
>> + * ctl/evt share the first worker since it will be rare for them
>> + * to send cmds while IO is running.
>> + */
>> +    for (i = VHOST_SCSI_VQ_NUM_FIXED + 1; i < dev->nvqs; i++) {
>> +    memset(&worker, 0, sizeof(worker));
>> +
>> +    ret = dev->vhost_ops->vhost_new_worker(dev, &worker);
>> +    if (ret == -ENOTTY) {
>> +    /*
>> + * worker ioctls are not implemented so just ignore and
>> + * continue device setup.
>> + */
> 
> IIUC here the user has asked to use a worker for each virtqueue, but the
> kernel does not support it so we ignore it.
> 
> Should we at least print a warning?
> 

We should. I'll add it.




[PATCH v2 1/2] vhost: Add worker backend callouts

2023-11-26 Thread Mike Christie
This adds the vhost backend callouts for the worker ioctls added in the
6.4 linux kernel commit:

c1ecd8e95007 ("vhost: allow userspace to create workers")

Signed-off-by: Mike Christie 
---
 hw/virtio/vhost-backend.c | 28 
 include/hw/virtio/vhost-backend.h | 14 ++
 2 files changed, 42 insertions(+)

diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
index 17f3fc6a0823..833804dd40f2 100644
--- a/hw/virtio/vhost-backend.c
+++ b/hw/virtio/vhost-backend.c
@@ -158,6 +158,30 @@ static int vhost_kernel_set_vring_busyloop_timeout(struct 
vhost_dev *dev,
 return vhost_kernel_call(dev, VHOST_SET_VRING_BUSYLOOP_TIMEOUT, s);
 }
 
+static int vhost_kernel_new_worker(struct vhost_dev *dev,
+   struct vhost_worker_state *worker)
+{
+return vhost_kernel_call(dev, VHOST_NEW_WORKER, worker);
+}
+
+static int vhost_kernel_free_worker(struct vhost_dev *dev,
+struct vhost_worker_state *worker)
+{
+return vhost_kernel_call(dev, VHOST_FREE_WORKER, worker);
+}
+
+static int vhost_kernel_attach_vring_worker(struct vhost_dev *dev,
+struct vhost_vring_worker *worker)
+{
+return vhost_kernel_call(dev, VHOST_ATTACH_VRING_WORKER, worker);
+}
+
+static int vhost_kernel_get_vring_worker(struct vhost_dev *dev,
+ struct vhost_vring_worker *worker)
+{
+return vhost_kernel_call(dev, VHOST_GET_VRING_WORKER, worker);
+}
+
 static int vhost_kernel_set_features(struct vhost_dev *dev,
  uint64_t features)
 {
@@ -313,6 +337,10 @@ const VhostOps kernel_ops = {
 .vhost_set_vring_err = vhost_kernel_set_vring_err,
 .vhost_set_vring_busyloop_timeout =
 vhost_kernel_set_vring_busyloop_timeout,
+.vhost_get_vring_worker = vhost_kernel_get_vring_worker,
+.vhost_attach_vring_worker = vhost_kernel_attach_vring_worker,
+.vhost_new_worker = vhost_kernel_new_worker,
+.vhost_free_worker = vhost_kernel_free_worker,
 .vhost_set_features = vhost_kernel_set_features,
 .vhost_get_features = vhost_kernel_get_features,
 .vhost_set_backend_cap = vhost_kernel_set_backend_cap,
diff --git a/include/hw/virtio/vhost-backend.h 
b/include/hw/virtio/vhost-backend.h
index 96ccc18cd33b..9f16d0884e8f 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -33,6 +33,8 @@ struct vhost_memory;
 struct vhost_vring_file;
 struct vhost_vring_state;
 struct vhost_vring_addr;
+struct vhost_vring_worker;
+struct vhost_worker_state;
 struct vhost_scsi_target;
 struct vhost_iotlb_msg;
 struct vhost_virtqueue;
@@ -73,6 +75,14 @@ typedef int (*vhost_set_vring_err_op)(struct vhost_dev *dev,
   struct vhost_vring_file *file);
 typedef int (*vhost_set_vring_busyloop_timeout_op)(struct vhost_dev *dev,
struct vhost_vring_state 
*r);
+typedef int (*vhost_attach_vring_worker_op)(struct vhost_dev *dev,
+struct vhost_vring_worker *worker);
+typedef int (*vhost_get_vring_worker_op)(struct vhost_dev *dev,
+ struct vhost_vring_worker *worker);
+typedef int (*vhost_new_worker_op)(struct vhost_dev *dev,
+   struct vhost_worker_state *worker);
+typedef int (*vhost_free_worker_op)(struct vhost_dev *dev,
+struct vhost_worker_state *worker);
 typedef int (*vhost_set_features_op)(struct vhost_dev *dev,
  uint64_t features);
 typedef int (*vhost_get_features_op)(struct vhost_dev *dev,
@@ -151,6 +161,10 @@ typedef struct VhostOps {
 vhost_set_vring_call_op vhost_set_vring_call;
 vhost_set_vring_err_op vhost_set_vring_err;
 vhost_set_vring_busyloop_timeout_op vhost_set_vring_busyloop_timeout;
+vhost_new_worker_op vhost_new_worker;
+vhost_free_worker_op vhost_free_worker;
+vhost_get_vring_worker_op vhost_get_vring_worker;
+vhost_attach_vring_worker_op vhost_attach_vring_worker;
 vhost_set_features_op vhost_set_features;
 vhost_get_features_op vhost_get_features;
 vhost_set_backend_cap_op vhost_set_backend_cap;
-- 
2.34.1




[PATCH v2 0/2] vhost-scsi: Support worker ioctls

2023-11-26 Thread Mike Christie
The following patches allow users to configure the vhost worker threads
for vhost-scsi. With vhost-net we get a worker thread per rx/tx virtqueue
pair, but for vhost-scsi we get one worker for all virtqueues. This
becomes a bottleneck after 2 queues are used.

In the upstream linux kernel commit:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/drivers/vhost/vhost.c?id=c1ecd8e9500797748ae4f79657971955d452d69d

we enabled the vhost layer to be able to create a worker thread and
attach it to a virtqueue.

This patchset adds support to vhost-scsi to use these ioctls so we are
no longer limited to the single worker.

v2:
- Make config option a bool instead of an int.





[PATCH v2 2/2] vhost-scsi: Add support for a worker thread per virtqueue

2023-11-26 Thread Mike Christie
This adds support for vhost-scsi to be able to create a worker thread
per virtqueue. Right now for vhost-net we get a worker thread per
tx/rx virtqueue pair which scales nicely as we add more virtqueues and
CPUs, but for scsi we get the single worker thread that's shared by all
virtqueues. When trying to send IO to more than 2 virtqueues the single
thread becomes a bottleneck.

This patch adds a new setting, worker_per_virtqueue, which can be set
to:

false: Existing behavior where we get the single worker thread.
true: Create a worker per IO virtqueue.

Signed-off-by: Mike Christie 
---
 hw/scsi/vhost-scsi.c| 60 +
 include/hw/virtio/virtio-scsi.h |  1 +
 2 files changed, 61 insertions(+)

diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index 3126df9e1d9d..77eef9474c23 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -165,6 +165,57 @@ static const VMStateDescription vmstate_virtio_vhost_scsi 
= {
 .pre_save = vhost_scsi_pre_save,
 };
 
+static int vhost_scsi_set_workers(VHostSCSICommon *vsc, bool per_virtqueue)
+{
+struct vhost_dev *dev = &vsc->dev;
+struct vhost_vring_worker vq_worker;
+struct vhost_worker_state worker;
+int i, ret;
+
+/* Use default worker */
+if (!per_virtqueue || dev->nvqs == VHOST_SCSI_VQ_NUM_FIXED + 1) {
+return 0;
+}
+
+/*
+ * ctl/evt share the first worker since it will be rare for them
+ * to send cmds while IO is running.
+ */
+for (i = VHOST_SCSI_VQ_NUM_FIXED + 1; i < dev->nvqs; i++) {
+memset(&worker, 0, sizeof(worker));
+
+ret = dev->vhost_ops->vhost_new_worker(dev, &worker);
+if (ret == -ENOTTY) {
+/*
+ * worker ioctls are not implemented so just ignore and
+ * continue device setup.
+ */
+ret = 0;
+break;
+} else if (ret) {
+break;
+}
+
+memset(&vq_worker, 0, sizeof(vq_worker));
+vq_worker.worker_id = worker.worker_id;
+vq_worker.index = i;
+
+ret = dev->vhost_ops->vhost_attach_vring_worker(dev, &vq_worker);
+if (ret == -ENOTTY) {
+/*
+ * It's a bug for the kernel to have supported the worker creation
+ * ioctl but not attach.
+ */
+dev->vhost_ops->vhost_free_worker(dev, &worker);
+break;
+} else if (ret) {
+break;
+}
+}
+
+return ret;
+}
+
 static void vhost_scsi_realize(DeviceState *dev, Error **errp)
 {
 VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
@@ -232,6 +283,13 @@ static void vhost_scsi_realize(DeviceState *dev, Error 
**errp)
 goto free_vqs;
 }
 
+ret = vhost_scsi_set_workers(vsc, vs->conf.worker_per_virtqueue);
+if (ret < 0) {
+error_setg(errp, "vhost-scsi: vhost worker setup failed: %s",
+   strerror(-ret));
+goto free_vqs;
+}
+
 /* At present, channel and lun both are 0 for bootable vhost-scsi disk */
 vsc->channel = 0;
 vsc->lun = 0;
@@ -297,6 +355,8 @@ static Property vhost_scsi_properties[] = {
  VIRTIO_SCSI_F_T10_PI,
  false),
 DEFINE_PROP_BOOL("migratable", VHostSCSICommon, migratable, false),
+DEFINE_PROP_BOOL("worker_per_virtqueue", VirtIOSCSICommon,
+ conf.worker_per_virtqueue, false),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
index 779568ab5d28..0e9a1867665e 100644
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -51,6 +51,7 @@ typedef struct virtio_scsi_config VirtIOSCSIConfig;
 struct VirtIOSCSIConf {
 uint32_t num_queues;
 uint32_t virtqueue_size;
+bool worker_per_virtqueue;
 bool seg_max_adjust;
 uint32_t max_sectors;
 uint32_t cmd_per_lun;
-- 
2.34.1




Re: [PATCH 2/2] vhost-scsi: Add support for a worker thread per virtqueue

2023-11-15 Thread Mike Christie
On 11/15/23 6:57 AM, Stefan Hajnoczi wrote:
> On Wed, Nov 15, 2023 at 12:43:02PM +0100, Stefano Garzarella wrote:
>> On Mon, Nov 13, 2023 at 06:36:44PM -0600, Mike Christie wrote:
>>> This adds support for vhost-scsi to be able to create a worker thread
>>> per virtqueue. Right now for vhost-net we get a worker thread per
>>> tx/rx virtqueue pair which scales nicely as we add more virtqueues and
>>> CPUs, but for scsi we get the single worker thread that's shared by all
>>> virtqueues. When trying to send IO to more than 2 virtqueues the single
>>> thread becomes a bottleneck.
>>>
>>> This patch adds a new setting, virtqueue_workers, which can be set to:
>>>
>>> 1: Existing behavior where we get the single thread.
>>> -1: Create a worker per IO virtqueue.
>>
>> I find this setting a bit odd. What about a boolean instead?
>>
>> `per_virtqueue_workers`:
>> false: Existing behavior where we get the single thread.
>> true: Create a worker per IO virtqueue.
> 
> Me too, I thought there would be round-robin assignment for 1 <
> worker_cnt < (dev->nvqs - VHOST_SCSI_VQ_NUM_FIXED) but instead only 1
> and -1 have any meaning.
> 
> Do you want to implement round-robin assignment?
> 

It was an int because I originally did round robin but at some point
dropped it. I found that our users at least:

1. Are used to configuring number of virtqueues.
2. In the userspace guest OS are used to checking the queue to CPU
mappings to figure out how their app should optimize itself.

So users would just do a virtqueue per vCPU or, if trying to reduce
mem usage, would do N virtqueues < vCPUs. In both cases they just used a
worker per virtqueue.

However, I left it an int in case someone wanted that in the
future.





Re: [PATCH 2/2] vhost-scsi: Add support for a worker thread per virtqueue

2023-11-15 Thread Mike Christie
On 11/15/23 5:43 AM, Stefano Garzarella wrote:
> On Mon, Nov 13, 2023 at 06:36:44PM -0600, Mike Christie wrote:
>> This adds support for vhost-scsi to be able to create a worker thread
>> per virtqueue. Right now for vhost-net we get a worker thread per
>> tx/rx virtqueue pair which scales nicely as we add more virtqueues and
>> CPUs, but for scsi we get the single worker thread that's shared by all
>> virtqueues. When trying to send IO to more than 2 virtqueues the single
>> thread becomes a bottleneck.
>>
>> This patch adds a new setting, virtqueue_workers, which can be set to:
>>
>> 1: Existing behavior where we get the single thread.
>> -1: Create a worker per IO virtqueue.
> 
> I find this setting a bit odd. What about a boolean instead?
> 
> `per_virtqueue_workers`:
>     false: Existing behavior whre we get the single thread.
>     true: Create a worker per IO virtqueue.

Sounds good.


> 
>>
>> Signed-off-by: Mike Christie 
>> ---
>> hw/scsi/vhost-scsi.c    | 68 +
>> include/hw/virtio/virtio-scsi.h |  1 +
>> 2 files changed, 69 insertions(+)
>>
>> diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
>> index 3126df9e1d9d..5cf669b6563b 100644
>> --- a/hw/scsi/vhost-scsi.c
>> +++ b/hw/scsi/vhost-scsi.c
>> @@ -31,6 +31,9 @@
>> #include "qemu/cutils.h"
>> #include "sysemu/sysemu.h"
>>
>> +#define VHOST_SCSI_WORKER_PER_VQ    -1
>> +#define VHOST_SCSI_WORKER_DEF    1
>> +
>> /* Features supported by host kernel. */
>> static const int kernel_feature_bits[] = {
>>     VIRTIO_F_NOTIFY_ON_EMPTY,
>> @@ -165,6 +168,62 @@ static const VMStateDescription 
>> vmstate_virtio_vhost_scsi = {
>>     .pre_save = vhost_scsi_pre_save,
>> };
>>
>> +static int vhost_scsi_set_workers(VHostSCSICommon *vsc, int workers_cnt)
>> +{
>> +    struct vhost_dev *dev = &vsc->dev;
>> +    struct vhost_vring_worker vq_worker;
>> +    struct vhost_worker_state worker;
>> +    int i, ret;
>> +
>> +    /* Use default worker */
>> +    if (workers_cnt == VHOST_SCSI_WORKER_DEF ||
>> +    dev->nvqs == VHOST_SCSI_VQ_NUM_FIXED + 1) {
>> +    return 0;
>> +    }
>> +
>> +    if (workers_cnt != VHOST_SCSI_WORKER_PER_VQ) {
>> +    return -EINVAL;
>> +    }
>> +
>> +    /*
>> + * ctl/evt share the first worker since it will be rare for them
>> + * to send cmds while IO is running.
>> + */
>> +    for (i = VHOST_SCSI_VQ_NUM_FIXED + 1; i < dev->nvqs; i++) {
>> +    memset(&worker, 0, sizeof(worker));
>> +
>> +    ret = dev->vhost_ops->vhost_new_worker(dev, &worker);
> 
> Should we call vhost_free_worker() in the vhost_scsi_unrealize() or are
> workers automatically freed when `vhostfd` is closed?
> 

All worker threads are freed automatically, just like the default worker
created from VHOST_SET_OWNER is freed on close.




[PATCH 1/2] vhost: Add worker backend callouts

2023-11-13 Thread Mike Christie
This adds the vhost backend callouts for the worker ioctls added in the
6.4 linux kernel commit:

c1ecd8e95007 ("vhost: allow userspace to create workers")

Signed-off-by: Mike Christie 
---
 hw/virtio/vhost-backend.c | 28 
 include/hw/virtio/vhost-backend.h | 14 ++
 2 files changed, 42 insertions(+)

diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
index 17f3fc6a0823..833804dd40f2 100644
--- a/hw/virtio/vhost-backend.c
+++ b/hw/virtio/vhost-backend.c
@@ -158,6 +158,30 @@ static int vhost_kernel_set_vring_busyloop_timeout(struct 
vhost_dev *dev,
 return vhost_kernel_call(dev, VHOST_SET_VRING_BUSYLOOP_TIMEOUT, s);
 }
 
+static int vhost_kernel_new_worker(struct vhost_dev *dev,
+   struct vhost_worker_state *worker)
+{
+return vhost_kernel_call(dev, VHOST_NEW_WORKER, worker);
+}
+
+static int vhost_kernel_free_worker(struct vhost_dev *dev,
+struct vhost_worker_state *worker)
+{
+return vhost_kernel_call(dev, VHOST_FREE_WORKER, worker);
+}
+
+static int vhost_kernel_attach_vring_worker(struct vhost_dev *dev,
+struct vhost_vring_worker *worker)
+{
+return vhost_kernel_call(dev, VHOST_ATTACH_VRING_WORKER, worker);
+}
+
+static int vhost_kernel_get_vring_worker(struct vhost_dev *dev,
+ struct vhost_vring_worker *worker)
+{
+return vhost_kernel_call(dev, VHOST_GET_VRING_WORKER, worker);
+}
+
 static int vhost_kernel_set_features(struct vhost_dev *dev,
  uint64_t features)
 {
@@ -313,6 +337,10 @@ const VhostOps kernel_ops = {
 .vhost_set_vring_err = vhost_kernel_set_vring_err,
 .vhost_set_vring_busyloop_timeout =
 vhost_kernel_set_vring_busyloop_timeout,
+.vhost_get_vring_worker = vhost_kernel_get_vring_worker,
+.vhost_attach_vring_worker = vhost_kernel_attach_vring_worker,
+.vhost_new_worker = vhost_kernel_new_worker,
+.vhost_free_worker = vhost_kernel_free_worker,
 .vhost_set_features = vhost_kernel_set_features,
 .vhost_get_features = vhost_kernel_get_features,
 .vhost_set_backend_cap = vhost_kernel_set_backend_cap,
diff --git a/include/hw/virtio/vhost-backend.h 
b/include/hw/virtio/vhost-backend.h
index 96ccc18cd33b..9f16d0884e8f 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -33,6 +33,8 @@ struct vhost_memory;
 struct vhost_vring_file;
 struct vhost_vring_state;
 struct vhost_vring_addr;
+struct vhost_vring_worker;
+struct vhost_worker_state;
 struct vhost_scsi_target;
 struct vhost_iotlb_msg;
 struct vhost_virtqueue;
@@ -73,6 +75,14 @@ typedef int (*vhost_set_vring_err_op)(struct vhost_dev *dev,
   struct vhost_vring_file *file);
 typedef int (*vhost_set_vring_busyloop_timeout_op)(struct vhost_dev *dev,
struct vhost_vring_state 
*r);
+typedef int (*vhost_attach_vring_worker_op)(struct vhost_dev *dev,
+struct vhost_vring_worker *worker);
+typedef int (*vhost_get_vring_worker_op)(struct vhost_dev *dev,
+ struct vhost_vring_worker *worker);
+typedef int (*vhost_new_worker_op)(struct vhost_dev *dev,
+   struct vhost_worker_state *worker);
+typedef int (*vhost_free_worker_op)(struct vhost_dev *dev,
+struct vhost_worker_state *worker);
 typedef int (*vhost_set_features_op)(struct vhost_dev *dev,
  uint64_t features);
 typedef int (*vhost_get_features_op)(struct vhost_dev *dev,
@@ -151,6 +161,10 @@ typedef struct VhostOps {
 vhost_set_vring_call_op vhost_set_vring_call;
 vhost_set_vring_err_op vhost_set_vring_err;
 vhost_set_vring_busyloop_timeout_op vhost_set_vring_busyloop_timeout;
+vhost_new_worker_op vhost_new_worker;
+vhost_free_worker_op vhost_free_worker;
+vhost_get_vring_worker_op vhost_get_vring_worker;
+vhost_attach_vring_worker_op vhost_attach_vring_worker;
 vhost_set_features_op vhost_set_features;
 vhost_get_features_op vhost_get_features;
 vhost_set_backend_cap_op vhost_set_backend_cap;
-- 
2.34.1




[PATCH 0/2] vhost-scsi: Support worker ioctls

2023-11-13 Thread Mike Christie
The following patches allow users to configure the vhost worker threads
for vhost-scsi. With vhost-net we get a worker thread per rx/tx virtqueue
pair, but for vhost-scsi we get one worker for all virtqueues. This
becomes a bottleneck after 2 queues are used.

In the upstream linux kernel commit:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/drivers/vhost/vhost.c?id=c1ecd8e9500797748ae4f79657971955d452d69d

we enabled the vhost layer to be able to create a worker thread and
attach it to a virtqueue.

This patchset adds support to vhost-scsi to use these ioctls so we are
no longer limited to the single worker.





[PATCH 2/2] vhost-scsi: Add support for a worker thread per virtqueue

2023-11-13 Thread Mike Christie
This adds support for vhost-scsi to be able to create a worker thread
per virtqueue. Right now for vhost-net we get a worker thread per
tx/rx virtqueue pair which scales nicely as we add more virtqueues and
CPUs, but for scsi we get the single worker thread that's shared by all
virtqueues. When trying to send IO to more than 2 virtqueues the single
thread becomes a bottleneck.

This patch adds a new setting, virtqueue_workers, which can be set to:

1: Existing behavior where we get the single thread.
-1: Create a worker per IO virtqueue.

Signed-off-by: Mike Christie 
---
 hw/scsi/vhost-scsi.c| 68 +
 include/hw/virtio/virtio-scsi.h |  1 +
 2 files changed, 69 insertions(+)

diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index 3126df9e1d9d..5cf669b6563b 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -31,6 +31,9 @@
 #include "qemu/cutils.h"
 #include "sysemu/sysemu.h"
 
+#define VHOST_SCSI_WORKER_PER_VQ    -1
+#define VHOST_SCSI_WORKER_DEF    1
+
 /* Features supported by host kernel. */
 static const int kernel_feature_bits[] = {
 VIRTIO_F_NOTIFY_ON_EMPTY,
@@ -165,6 +168,62 @@ static const VMStateDescription vmstate_virtio_vhost_scsi 
= {
 .pre_save = vhost_scsi_pre_save,
 };
 
+static int vhost_scsi_set_workers(VHostSCSICommon *vsc, int workers_cnt)
+{
+struct vhost_dev *dev = &vsc->dev;
+struct vhost_vring_worker vq_worker;
+struct vhost_worker_state worker;
+int i, ret;
+
+/* Use default worker */
+if (workers_cnt == VHOST_SCSI_WORKER_DEF ||
+dev->nvqs == VHOST_SCSI_VQ_NUM_FIXED + 1) {
+return 0;
+}
+
+if (workers_cnt != VHOST_SCSI_WORKER_PER_VQ) {
+return -EINVAL;
+}
+
+/*
+ * ctl/evt share the first worker since it will be rare for them
+ * to send cmds while IO is running.
+ */
+for (i = VHOST_SCSI_VQ_NUM_FIXED + 1; i < dev->nvqs; i++) {
+memset(&worker, 0, sizeof(worker));
+
+ret = dev->vhost_ops->vhost_new_worker(dev, &worker);
+if (ret == -ENOTTY) {
+/*
+ * worker ioctls are not implemented so just ignore and
+ * continue device setup.
+ */
+ret = 0;
+break;
+} else if (ret) {
+break;
+}
+
+memset(&vq_worker, 0, sizeof(vq_worker));
+vq_worker.worker_id = worker.worker_id;
+vq_worker.index = i;
+
+ret = dev->vhost_ops->vhost_attach_vring_worker(dev, &vq_worker);
+if (ret == -ENOTTY) {
+/*
+ * It's a bug for the kernel to have supported the worker creation
+ * ioctl but not attach.
+ */
+dev->vhost_ops->vhost_free_worker(dev, &worker);
+break;
+} else if (ret) {
+break;
+}
+}
+
+return ret;
+}
+
 static void vhost_scsi_realize(DeviceState *dev, Error **errp)
 {
 VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
@@ -232,6 +291,13 @@ static void vhost_scsi_realize(DeviceState *dev, Error 
**errp)
 goto free_vqs;
 }
 
+ret = vhost_scsi_set_workers(vsc, vs->conf.virtqueue_workers);
+if (ret < 0) {
+error_setg(errp, "vhost-scsi: vhost worker setup failed: %s",
+   strerror(-ret));
+goto free_vqs;
+}
+
 /* At present, channel and lun both are 0 for bootable vhost-scsi disk */
 vsc->channel = 0;
 vsc->lun = 0;
@@ -297,6 +363,8 @@ static Property vhost_scsi_properties[] = {
  VIRTIO_SCSI_F_T10_PI,
  false),
 DEFINE_PROP_BOOL("migratable", VHostSCSICommon, migratable, false),
+DEFINE_PROP_INT32("virtqueue_workers", VirtIOSCSICommon,
+  conf.virtqueue_workers, VHOST_SCSI_WORKER_DEF),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
index 779568ab5d28..f70624ece564 100644
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -51,6 +51,7 @@ typedef struct virtio_scsi_config VirtIOSCSIConfig;
 struct VirtIOSCSIConf {
 uint32_t num_queues;
 uint32_t virtqueue_size;
+int virtqueue_workers;
 bool seg_max_adjust;
 uint32_t max_sectors;
 uint32_t cmd_per_lun;
-- 
2.34.1




Re: [PATCH v2 09/20] parallels: Make mark_used() and mark_unused() global functions

2023-10-21 Thread Mike Maslenkin
On Thu, Oct 19, 2023 at 5:23 PM Alexander Ivanov
 wrote:
>
> We will need these functions in parallels-ext.c too. Let them be global
> functions parallels_mark_used() and parallels_mark_unused().
>
> Signed-off-by: Alexander Ivanov 
> ---
>  block/parallels.c | 22 --
>  block/parallels.h |  5 +
>  2 files changed, 17 insertions(+), 10 deletions(-)
>
> diff --git a/block/parallels.c b/block/parallels.c
> index a22ab7f2fc..2ee2b42038 100644
> --- a/block/parallels.c
> +++ b/block/parallels.c
> @@ -178,8 +178,8 @@ static void parallels_set_bat_entry(BDRVParallelsState *s,
>  bitmap_set(s->bat_dirty_bmap, bat_entry_off(index) / s->bat_dirty_block, 
> 1);
>  }
>
> -static int mark_used(BlockDriverState *bs, unsigned long *bitmap,
> - uint32_t bitmap_size, int64_t off, uint32_t count)
> +int parallels_mark_used(BlockDriverState *bs, unsigned long *bitmap,
> +uint32_t bitmap_size, int64_t off, uint32_t count)
>  {
>  BDRVParallelsState *s = bs->opaque;
>  uint32_t cluster_index = host_cluster_index(s, off);
> @@ -195,8 +195,8 @@ static int mark_used(BlockDriverState *bs, unsigned long 
> *bitmap,
>  return 0;
>  }
>
> -static int mark_unused(BlockDriverState *bs, unsigned long *bitmap,
> -   uint32_t bitmap_size, int64_t off, uint32_t count)
> +int parallels_mark_unused(BlockDriverState *bs, unsigned long *bitmap,
> +  uint32_t bitmap_size, int64_t off, uint32_t count)
>  {
>  BDRVParallelsState *s = bs->opaque;
>  uint32_t cluster_index = host_cluster_index(s, off);
> @@ -249,7 +249,8 @@ static int parallels_fill_used_bitmap(BlockDriverState 
> *bs)
>  continue;
>  }
>
> -err2 = mark_used(bs, s->used_bmap, s->used_bmap_size, host_off, 1);
> +err2 = parallels_mark_used(bs, s->used_bmap, s->used_bmap_size,
> +   host_off, 1);
>  if (err2 < 0 && err == 0) {
>  err = err2;
>  }
> @@ -326,7 +327,8 @@ int64_t parallels_allocate_host_clusters(BlockDriverState 
> *bs,
>  }
>  }
>
> -ret = mark_used(bs, s->used_bmap, s->used_bmap_size, host_off, 
> *clusters);
> +ret = parallels_mark_used(bs, s->used_bmap, s->used_bmap_size,
> +  host_off, *clusters);
>  if (ret < 0) {
>  /* Image consistency is broken. Alarm! */
>  return ret;
> @@ -393,8 +395,8 @@ allocate_clusters(BlockDriverState *bs, int64_t 
> sector_num,
>
>  qemu_vfree(buf);
>  if (ret < 0) {
> -mark_unused(bs, s->used_bmap, s->used_bmap_size,
> -host_off, to_allocate);
> +parallels_mark_unused(bs, s->used_bmap, s->used_bmap_size,
> +  host_off, to_allocate);
>  return ret;
>  }
>  }
> @@ -868,7 +870,7 @@ parallels_check_duplicate(BlockDriverState *bs, 
> BdrvCheckResult *res,
>  continue;
>  }
>
> -ret = mark_used(bs, bitmap, bitmap_size, host_off, 1);
> +ret = parallels_mark_used(bs, bitmap, bitmap_size, host_off, 1);
>  assert(ret != -E2BIG);
>  if (ret == 0) {
>  continue;
> @@ -928,7 +930,7 @@ parallels_check_duplicate(BlockDriverState *bs, 
> BdrvCheckResult *res,
>   * considered, and the bitmap size doesn't change. This specifically
>   * means that -E2BIG is OK.
>   */
> -ret = mark_used(bs, bitmap, bitmap_size, host_off, 1);
> +ret = parallels_mark_used(bs, bitmap, bitmap_size, host_off, 1);
>  if (ret == -EBUSY) {
>  res->check_errors++;
>  goto out_repair_bat;
> diff --git a/block/parallels.h b/block/parallels.h
> index 3e4f397502..4e7aa6b80f 100644
> --- a/block/parallels.h
> +++ b/block/parallels.h
> @@ -90,6 +90,11 @@ typedef struct BDRVParallelsState {
>  Error *migration_blocker;
>  } BDRVParallelsState;
>
> +int parallels_mark_used(BlockDriverState *bs, unsigned long *bitmap,
> +uint32_t bitmap_size, int64_t off, uint32_t count);
> +int parallels_mark_unused(BlockDriverState *bs, unsigned long *bitmap,
> +  uint32_t bitmap_size, int64_t off, uint32_t count);
> +
>  int64_t parallels_allocate_host_clusters(BlockDriverState *bs,
>   int64_t *clusters);
>
> --
> 2.34.1
>
>

Just a note: parallels_mark_unused() could be initially declared as
global just because after patch 3/20 there can be a compilation warning:
warning: unused function 'mark_unused' [-Wunused-function]
:)

I do not have a strong opinion about how to avoid such a compilation
warning in the middle of the patch series.
The simplest and most straightforward way is to declare this function as
static in patch 4.

I do not have any other objections to the series except the misplaced
NULL assignment.

Regards,
Mike.



Re: [PATCH v2 01/20] parallels: Set s->used_bmap to NULL in parallels_free_used_bitmap()

2023-10-21 Thread Mike Maslenkin
On Thu, Oct 19, 2023 at 4:06 PM Alexander Ivanov
 wrote:
>
> After the used bitmap is freed, s->used_bmap points to the freed memory. If
> we try to free the used bitmap one more time, it leads to a double free error.
>
> Set s->used_bmap to NULL to avoid the double free error.
>
> Signed-off-by: Alexander Ivanov 
> ---
>  block/parallels.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/block/parallels.c b/block/parallels.c
> index 6b46623241..ba1fdde259 100644
> --- a/block/parallels.c
> +++ b/block/parallels.c
> @@ -244,6 +244,7 @@ static void parallels_free_used_bitmap(BlockDriverState 
> *bs)
>  {
>  BDRVParallelsState *s = bs->opaque;
>  s->used_bmap_size = 0;
> +s->used_bmap = NULL;
>  g_free(s->used_bmap);
>  }

Shouldn't it be added after the g_free() call?
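
For reference, the reordered version being suggested would presumably look
like this (a sketch based on the hunk above, not a tested patch):

static void parallels_free_used_bitmap(BlockDriverState *bs)
{
    BDRVParallelsState *s = bs->opaque;

    s->used_bmap_size = 0;
    g_free(s->used_bmap);
    s->used_bmap = NULL; /* clear after freeing so a repeated call is a no-op */
}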

Regards,
Mike.



Re: [PATCH v2 10/20] parallels: Add dirty bitmaps saving

2023-10-21 Thread Mike Maslenkin
On Thu, Oct 19, 2023 at 4:05 PM Alexander Ivanov
 wrote:
>
> Now dirty bitmaps can be loaded, but there is no way to save them. Add code
> for dirty bitmap storage.
>
> Signed-off-by: Alexander Ivanov 
> ---
>  block/parallels-ext.c | 167 ++
>  block/parallels.c |  16 +++-
>  block/parallels.h |   5 ++
>  3 files changed, 186 insertions(+), 2 deletions(-)
>
> diff --git a/block/parallels-ext.c b/block/parallels-ext.c
> index 8a109f005a..0a632a2331 100644
> --- a/block/parallels-ext.c
> +++ b/block/parallels-ext.c
> @@ -24,6 +24,7 @@
>   */
>
>  #include "qemu/osdep.h"
> +#include "qemu/error-report.h"
>  #include "qapi/error.h"
>  #include "block/block-io.h"
>  #include "block/block_int.h"
> @@ -301,3 +302,169 @@ out:
>
>  return ret;
>  }
> +
> +static void parallels_save_bitmap(BlockDriverState *bs, BdrvDirtyBitmap 
> *bitmap,
> +  uint8_t **buf, int *buf_size)
> +{
> +BDRVParallelsState *s = bs->opaque;
> +ParallelsFeatureHeader *fh;
> +ParallelsDirtyBitmapFeature *bh;
> +uint64_t *l1_table, l1_size, granularity, limit;
> +int64_t bm_size, ser_size, offset, buf_used;
> +int64_t alloc_size = 1;
> +const char *name;
> +uint8_t *bm_buf;
> +QemuUUID uuid;
> +int ret = 0;
> +
> +if (!bdrv_dirty_bitmap_get_persistence(bitmap) ||
> +bdrv_dirty_bitmap_inconsistent(bitmap)) {
> +return;
> +}
> +
> +bm_size = bdrv_dirty_bitmap_size(bitmap);
> +granularity = bdrv_dirty_bitmap_granularity(bitmap);
> +limit = bdrv_dirty_bitmap_serialization_coverage(s->cluster_size, 
> bitmap);
> +ser_size = bdrv_dirty_bitmap_serialization_size(bitmap, 0, bm_size);
> +l1_size = DIV_ROUND_UP(ser_size, s->cluster_size);
> +
> +buf_used = l1_size * 8 + sizeof(*fh) + sizeof(*bh);
> +/* Check if there is enough space for the final section */
> +if (*buf_size - buf_used < sizeof(*fh)) {
> +return;
> +}
> +
> +name = bdrv_dirty_bitmap_name(bitmap);
> +ret = qemu_uuid_parse(name, &uuid);
> +if (ret < 0) {
> +error_report("Can't save dirty bitmap: ID parsing error: '%s'", 
> name);
> +return;
> +}
> +
> +fh = (ParallelsFeatureHeader *)*buf;
> +bh = (ParallelsDirtyBitmapFeature *)(*buf + sizeof(*fh));
> +l1_table = (uint64_t *)((uint8_t *)bh + sizeof(*bh));
> +
> +fh->magic = cpu_to_le64(PARALLELS_DIRTY_BITMAP_FEATURE_MAGIC);
> +fh->data_size = cpu_to_le32(l1_size * 8 + sizeof(*bh));
> +
> +bh->l1_size = cpu_to_le32(l1_size);
> +bh->size = cpu_to_le64(bm_size >> BDRV_SECTOR_BITS);
> +bh->granularity = cpu_to_le32(granularity >> BDRV_SECTOR_BITS);
> +memcpy(bh->id, &uuid, sizeof(uuid));
> +
> +bm_buf = qemu_blockalign(bs, s->cluster_size);
> +
> +offset = 0;
> +while ((offset = bdrv_dirty_bitmap_next_dirty(bitmap, offset, bm_size)) 
> >= 0) {
> +uint64_t idx = offset / limit;
> +int64_t cluster_off, end, write_size;
> +
> +offset = QEMU_ALIGN_DOWN(offset, limit);
> +end = MIN(bm_size, offset + limit);
> +write_size = bdrv_dirty_bitmap_serialization_size(bitmap, offset,
> +  end - offset);
> +assert(write_size <= s->cluster_size);
> +
> +bdrv_dirty_bitmap_serialize_part(bitmap, bm_buf, offset, end - 
> offset);
> +if (write_size < s->cluster_size) {
> +memset(bm_buf + write_size, 0, s->cluster_size - write_size);
> +}
> +
> +cluster_off = parallels_allocate_host_clusters(bs, &alloc_size);
> +if (cluster_off <= 0) {
> +goto end;
> +}
> +
> +ret = bdrv_pwrite(bs->file, cluster_off, s->cluster_size, bm_buf, 0);
> +if (ret < 0) {
> +memset(&fh->magic, 0, sizeof(fh->magic));
> +parallels_mark_unused(bs, s->used_bmap, s->used_bmap_size,
> +  cluster_off, 1);
> +goto end;
> +}
> +
> +l1_table[idx] = cpu_to_le64(cluster_off >> BDRV_SECTOR_BITS);
> +offset = end;
> +}
> +
> +*buf_size -= buf_used;
> +*buf += buf_used;
> +
> +end:
> +qemu_vfree(bm_buf);
> +}
> +
> +void parallels_store_persistent_dirty_bitmaps(BlockDriverState *bs,
> +  Error **errp)
> +{
> +BDRVParallelsState *s = bs->opaque;
> +BdrvDirtyBitmap *bitmap;
> +ParallelsFormatExtensionHeader *eh;
> +int remaining = s->cluster_size;
> +uint8_t *buf, *pos;
> +int64_t header_off, alloc_size = 1;
> +g_autofree uint8_t *hash = NULL;
> +size_t hash_len = 0;
> +int ret;
> +
> +s->header->ext_off = 0;
> +
> +if (!bdrv_has_named_bitmaps(bs)) {
> +return;
> +}
> +
> +buf = qemu_blockalign0(bs, s->cluster_size);
> +
> +eh = (ParallelsFormatExtensionHeader *)buf;
> +pos = buf + sizeof(*eh);
> +
> +eh->magic = 

[PATCH] tcg: drop unused tcg_temp_free define

2023-10-14 Thread Mike Frysinger
Use of the API was removed a while back, but the define wasn't.

Signed-off-by: Mike Frysinger 
---
 include/tcg/tcg-op.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
index 80cfcf8104b6..3ead59e4594d 100644
--- a/include/tcg/tcg-op.h
+++ b/include/tcg/tcg-op.h
@@ -52,7 +52,6 @@ static inline void tcg_gen_insn_start(target_ulong pc, 
target_ulong a1,
 typedef TCGv_i32 TCGv;
 #define tcg_temp_new() tcg_temp_new_i32()
 #define tcg_global_mem_new tcg_global_mem_new_i32
-#define tcg_temp_free tcg_temp_free_i32
 #define tcgv_tl_temp tcgv_i32_temp
 #define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i32
 #define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i32
@@ -60,7 +59,6 @@ typedef TCGv_i32 TCGv;
 typedef TCGv_i64 TCGv;
 #define tcg_temp_new() tcg_temp_new_i64()
 #define tcg_global_mem_new tcg_global_mem_new_i64
-#define tcg_temp_free tcg_temp_free_i64
 #define tcgv_tl_temp tcgv_i64_temp
 #define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i64
 #define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i64
-- 
2.39.0




Re: [PATCH 15/19] parallels: Remove unnecessary data_end field

2023-10-07 Thread Mike Maslenkin
On Sat, Oct 7, 2023 at 5:30 PM Alexander Ivanov
 wrote:
>
>
>
> On 10/7/23 13:21, Mike Maslenkin wrote:
> > On Sat, Oct 7, 2023 at 1:18 PM Alexander Ivanov
> >   wrote:
> >>
> >> On 10/6/23 21:43, Mike Maslenkin wrote:
> >>> On Mon, Oct 2, 2023 at 12:01 PM Alexander Ivanov
> >>>   wrote:
> >>>> Since we have used bitmap, field data_end in BDRVParallelsState is
> >>>> redundant and can be removed.
> >>>>
> >>>> Add parallels_data_end() helper and remove data_end handling.
> >>>>
> >>>> Signed-off-by: Alexander Ivanov
> >>>> ---
> >>>>block/parallels.c | 33 +
> >>>>block/parallels.h |  1 -
> >>>>2 files changed, 13 insertions(+), 21 deletions(-)
> >>>>
> >>>> diff --git a/block/parallels.c b/block/parallels.c
> >>>> index 48ea5b3f03..80a7171b84 100644
> >>>> --- a/block/parallels.c
> >>>> +++ b/block/parallels.c
> >>>> @@ -265,6 +265,13 @@ static void 
> >>>> parallels_free_used_bitmap(BlockDriverState *bs)
> >>>>g_free(s->used_bmap);
> >>>>}
> >>>>
> >>>> +static int64_t parallels_data_end(BDRVParallelsState *s)
> >>>> +{
> >>>> +int64_t data_end = s->data_start * BDRV_SECTOR_SIZE;
> >>>> +data_end += s->used_bmap_size * s->cluster_size;
> >>>> +return data_end;
> >>>> +}
> >>>> +
> >>>>int64_t parallels_allocate_host_clusters(BlockDriverState *bs,
> >>>> int64_t *clusters)
> >>>>{
> >>>> @@ -275,7 +282,7 @@ int64_t 
> >>>> parallels_allocate_host_clusters(BlockDriverState *bs,
> >>>>
> >>>>first_free = find_first_zero_bit(s->used_bmap, s->used_bmap_size);
> >>>>if (first_free == s->used_bmap_size) {
> >>>> -host_off = s->data_end * BDRV_SECTOR_SIZE;
> >>>> +host_off = parallels_data_end(s);
> >>>>prealloc_clusters = *clusters + s->prealloc_size / s->tracks;
> >>>>bytes = prealloc_clusters * s->cluster_size;
> >>>>
> >>>> @@ -297,9 +304,6 @@ int64_t 
> >>>> parallels_allocate_host_clusters(BlockDriverState *bs,
> >>>>s->used_bmap = bitmap_zero_extend(s->used_bmap, 
> >>>> s->used_bmap_size,
> >>>>  new_usedsize);
> >>>>s->used_bmap_size = new_usedsize;
> >>>> -if (host_off + bytes > s->data_end * BDRV_SECTOR_SIZE) {
> >>>> -s->data_end = (host_off + bytes) / BDRV_SECTOR_SIZE;
> >>>> -}
> >>>>} else {
> >>>>next_used = find_next_bit(s->used_bmap, s->used_bmap_size, 
> >>>> first_free);
> >>>>
> >>>> @@ -315,8 +319,7 @@ int64_t 
> >>>> parallels_allocate_host_clusters(BlockDriverState *bs,
> >>>> * branch. In the other case we are likely re-using hole. 
> >>>> Preallocate
> >>>> * the space if required by the prealloc_mode.
> >>>> */
> >>>> -if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE &&
> >>>> -host_off < s->data_end * BDRV_SECTOR_SIZE) {
> >>>> +if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) {
> >>>>ret = bdrv_pwrite_zeroes(bs->file, host_off, bytes, 0);
> >>>>if (ret < 0) {
> >>>>return ret;
> >>>> @@ -757,13 +760,7 @@ parallels_check_outside_image(BlockDriverState *bs, 
> >>>> BdrvCheckResult *res,
> >>>>}
> >>>>}
> >>>>
> >>>> -if (high_off == 0) {
> >>>> -res->image_end_offset = s->data_end << BDRV_SECTOR_BITS;
> >>>> -} else {
> >>>> -res->image_end_offset = high_off + s->cluster_size;
> >>>> -s->data_end = res->image_end_offset >> BDRV_SECTOR_BITS;
> >>>> -}
> >>>> -
> >>>> +res->image_end_offs

Re: [PATCH 15/19] parallels: Remove unnecessary data_end field

2023-10-07 Thread Mike Maslenkin
On Sat, Oct 7, 2023 at 1:18 PM Alexander Ivanov
 wrote:
>
>
>
> On 10/6/23 21:43, Mike Maslenkin wrote:
> > On Mon, Oct 2, 2023 at 12:01 PM Alexander Ivanov
> >  wrote:
> >> Since we have used bitmap, field data_end in BDRVParallelsState is
> >> redundant and can be removed.
> >>
> >> Add parallels_data_end() helper and remove data_end handling.
> >>
> >> Signed-off-by: Alexander Ivanov 
> >> ---
> >>   block/parallels.c | 33 +
> >>   block/parallels.h |  1 -
> >>   2 files changed, 13 insertions(+), 21 deletions(-)
> >>
> >> diff --git a/block/parallels.c b/block/parallels.c
> >> index 48ea5b3f03..80a7171b84 100644
> >> --- a/block/parallels.c
> >> +++ b/block/parallels.c
> >> @@ -265,6 +265,13 @@ static void 
> >> parallels_free_used_bitmap(BlockDriverState *bs)
> >>   g_free(s->used_bmap);
> >>   }
> >>
> >> +static int64_t parallels_data_end(BDRVParallelsState *s)
> >> +{
> >> +int64_t data_end = s->data_start * BDRV_SECTOR_SIZE;
> >> +data_end += s->used_bmap_size * s->cluster_size;
> >> +return data_end;
> >> +}
> >> +
> >>   int64_t parallels_allocate_host_clusters(BlockDriverState *bs,
> >>int64_t *clusters)
> >>   {
> >> @@ -275,7 +282,7 @@ int64_t 
> >> parallels_allocate_host_clusters(BlockDriverState *bs,
> >>
> >>   first_free = find_first_zero_bit(s->used_bmap, s->used_bmap_size);
> >>   if (first_free == s->used_bmap_size) {
> >> -host_off = s->data_end * BDRV_SECTOR_SIZE;
> >> +host_off = parallels_data_end(s);
> >>   prealloc_clusters = *clusters + s->prealloc_size / s->tracks;
> >>   bytes = prealloc_clusters * s->cluster_size;
> >>
> >> @@ -297,9 +304,6 @@ int64_t 
> >> parallels_allocate_host_clusters(BlockDriverState *bs,
> >>   s->used_bmap = bitmap_zero_extend(s->used_bmap, 
> >> s->used_bmap_size,
> >> new_usedsize);
> >>   s->used_bmap_size = new_usedsize;
> >> -if (host_off + bytes > s->data_end * BDRV_SECTOR_SIZE) {
> >> -s->data_end = (host_off + bytes) / BDRV_SECTOR_SIZE;
> >> -}
> >>   } else {
> >>   next_used = find_next_bit(s->used_bmap, s->used_bmap_size, 
> >> first_free);
> >>
> >> @@ -315,8 +319,7 @@ int64_t 
> >> parallels_allocate_host_clusters(BlockDriverState *bs,
> >>* branch. In the other case we are likely re-using hole. 
> >> Preallocate
> >>* the space if required by the prealloc_mode.
> >>*/
> >> -if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE &&
> >> -host_off < s->data_end * BDRV_SECTOR_SIZE) {
> >> +if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) {
> >>   ret = bdrv_pwrite_zeroes(bs->file, host_off, bytes, 0);
> >>   if (ret < 0) {
> >>   return ret;
> >> @@ -757,13 +760,7 @@ parallels_check_outside_image(BlockDriverState *bs, 
> >> BdrvCheckResult *res,
> >>   }
> >>   }
> >>
> >> -if (high_off == 0) {
> >> -res->image_end_offset = s->data_end << BDRV_SECTOR_BITS;
> >> -} else {
> >> -res->image_end_offset = high_off + s->cluster_size;
> >> -s->data_end = res->image_end_offset >> BDRV_SECTOR_BITS;
> >> -}
> >> -
> >> +res->image_end_offset = parallels_data_end(s);
> >>   return 0;
> >>   }
> >>
> >> @@ -806,7 +803,6 @@ parallels_check_leak(BlockDriverState *bs, 
> >> BdrvCheckResult *res,
> >>   res->check_errors++;
> >>   return ret;
> >>   }
> >> -s->data_end = res->image_end_offset >> BDRV_SECTOR_BITS;
> >>
> >>   parallels_free_used_bitmap(bs);
> >>   ret = parallels_fill_used_bitmap(bs);
> >> @@ -1361,8 +1357,7 @@ static int parallels_open(BlockDriverState *bs, 
> >> QDict *options, int flags,
> >>   }
> >>
> >>   s->data_start = data_start;
> >> -s->data_end = s->data_start

Re: [PATCH 15/19] parallels: Remove unnecessary data_end field

2023-10-06 Thread Mike Maslenkin
kDriverState *bs)
>  end_off = (end_off + 1) * s->cluster_size;
>  }
>  end_off += s->data_start * BDRV_SECTOR_SIZE;
> -s->data_end = end_off / BDRV_SECTOR_SIZE;
>  return bdrv_truncate(bs->file, end_off, true, PREALLOC_MODE_OFF, 0, 
> NULL);
>  }
>
> diff --git a/block/parallels.h b/block/parallels.h
> index 18b4f8068e..a6a048d890 100644
> --- a/block/parallels.h
> +++ b/block/parallels.h
> @@ -79,7 +79,6 @@ typedef struct BDRVParallelsState {
>  unsigned int bat_size;
>
>  int64_t  data_start;
> -int64_t  data_end;
>  uint64_t prealloc_size;
>  ParallelsPreallocMode prealloc_mode;
>
> --
> 2.34.1
>

Is this the intended behavior?

Run:
1. ./qemu-img create -f parallels $TEST_IMG 1T
2. dd if=/dev/zero of=$TEST_IMG oseek=12  bs=1M count=128 conv=notrunc
3. ./qemu-img check  $TEST_IMG
   No errors were found on the image.
   Image end offset: 150994944

Without this patch `qemu-img check` reports:
   ERROR space leaked at the end of the image 145752064

  139 leaked clusters were found on the image.
  This means waste of disk space, but no harm to data.
  Image end offset: 5242880

Note: there is another issue caused by the previous commits.
g_free() asserts in parallels_free_used_bitmap() because
s->used_bmap is NULL.

To reproduce this crash at a revision before or without patch 15/19, run these commands:
1. ./qemu-img create -f parallels $TEST_IMG 1T
2. dd if=/dev/zero of=$TEST_IMG oseek=12  bs=1M count=128 conv=notrunc
3. ./qemu-img check -r leaks $TEST_IMG

Regards,
Mike.



Re: [PATCH v2 1/1] qemu-img: do not erase destination file in qemu-img dd command

2023-10-01 Thread Mike Maslenkin
I thought about "conv=notrunc", but my main concern is changed virtual
disk metadata.
It depends on how qemu-img used.
May be I followed to wrong pattern, but pros and cons of adding "conv"
parameter was not in my mind in scope of the first patch version.
I see 4 obvious ways of using `qemu-img dd`:
1. Copy virtual disk data between images of same format. I think disk
geometry must be preserved in this case.
2. Copy virtual disk data between different formats. It is a valid
pattern? May be `qemu-img convert` should to be used instead?
3. Merge snapshots to specified disk image, i.e read current state and
write it to new disk image.
4. Copy virtual disk data to raw binary file. Actually this patch
breaks 'dd' behavior for this case when source image is less (in terms
of logical blocks) than existed raw binary file.
May be for this case condition can be improved to smth like
   if (strcmp(fmt, "raw") || !g_file_test(out.filename,
G_FILE_TEST_EXISTS)) . And parameter "conv=notrunc" may be implemented
additionally for this case.

Three of above do not require  "conv=" parameter from my point of view.

I would be glad to hear other opinions.

Regards,
Mike.


On Sun, Oct 1, 2023 at 3:25 PM Denis V. Lunev  wrote:
>
> On 9/30/23 22:31, Mike Maslenkin wrote:
> > Add a check that destination file exists and do not call bdrv_create for
> > this case.
> >
> > Currently `qemu-img dd` command destroys content of destination file.
> > Effectively this means that parameters (geometry) of destination image
> > file are changing. This can be undesirable behavior for user especially
> > if format of destination image does not support resizing.
> >
> > Steps to reproduce:
> >1. Create empty disk image with some non default size.
> > `qemu-img  create -f qcow2 $DEST_IMG 3T`
> >   Remember that `qemu-img info $DEST_IMG` returns:
> > virtual size: 3 TiB (3298534883328 bytes)
> > disk size: 240 KiB
> > cluster_size: 65536
> >2. Run `qemu-img dd -O qcow2 of=$DEST_IMG if=$SRC_IMG bs=1M count=100`
> >3. Check `qemu-img info $DEST_IMG` output:
> > virtual size: 100 MiB (104857600 bytes)
> > disk size: 112 MiB
> > cluster_size: 65536
> >
> > Parameters of $DEST_IMG were changed. Actually `qemu-img dd` has created
> > a new disk based on current default geometry for particular format.
> > For example for "parallels" format default BAT for 256GB disk is written
> > to empty file prior writing disk image data.
> >
> > With this patch virtual disk metadata and geometry of a destination image
> > are preserved. As another visible change of `qemu-img dd` behavior is that
> > if destination image is less than source it can finish with error (similar
> > to "dd" utility):
> >qemu-img: error while writing to output image file: Input/output error
> >
> > Signed-off-by: Mike Maslenkin 
> > ---
> >diff from v1: removed additional fprintf call leaved in patch by accident
> > ---
> >   qemu-img.c | 17 ++---
> >   1 file changed, 10 insertions(+), 7 deletions(-)
> >
> > diff --git a/qemu-img.c b/qemu-img.c
> > index a48edb71015c..1a83c14212fb 100644
> > --- a/qemu-img.c
> > +++ b/qemu-img.c
> > @@ -5150,13 +5150,15 @@ static int img_dd(int argc, char **argv)
> >   size - in.bsz * in.offset, &error_abort);
> >   }
> >
> > -ret = bdrv_create(drv, out.filename, opts, &local_err);
> > -if (ret < 0) {
> > -error_reportf_err(local_err,
> > -  "%s: error while creating output image: ",
> > -  out.filename);
> > -ret = -1;
> > -goto out;
> > +if (!g_file_test(out.filename, G_FILE_TEST_EXISTS)) {
> > +ret = bdrv_create(drv, out.filename, opts, &local_err);
> > +if (ret < 0) {
> > +error_reportf_err(local_err,
> > +   "%s: error while creating output image: ",
> > +   out.filename);
> > +ret = -1;
> > +goto out;
> > +}
> >   }
> >
> >   /* TODO, we can't honour --image-opts for the target,
> may be it would be worth to follow conventional
> 'dd' approach, i.e. add conv=nocreat option which
> will do the trick?
>
> Den



[PATCH v2 1/1] qemu-img: do not erase destination file in qemu-img dd command

2023-09-30 Thread Mike Maslenkin
Add a check that destination file exists and do not call bdrv_create for
this case.

Currently `qemu-img dd` command destroys content of destination file.
Effectively this means that parameters (geometry) of destination image
file are changing. This can be undesirable behavior for user especially
if format of destination image does not support resizing.

Steps to reproduce:
  1. Create empty disk image with some non default size.
   `qemu-img  create -f qcow2 $DEST_IMG 3T`
 Remember that `qemu-img info $DEST_IMG` returns:
   virtual size: 3 TiB (3298534883328 bytes)
   disk size: 240 KiB
   cluster_size: 65536
  2. Run `qemu-img dd -O qcow2 of=$DEST_IMG if=$SRC_IMG bs=1M count=100`
  3. Check `qemu-img info $DEST_IMG` output:
   virtual size: 100 MiB (104857600 bytes)
   disk size: 112 MiB
   cluster_size: 65536

Parameters of $DEST_IMG were changed. Actually `qemu-img dd` has created
a new disk based on current default geometry for particular format.
For example for "parallels" format default BAT for 256GB disk is written
to empty file prior writing disk image data.

With this patch virtual disk metadata and geometry of a destination image
are preserved. As another visible change of `qemu-img dd` behavior is that
if destination image is less than source it can finish with error (similar
to "dd" utility):
  qemu-img: error while writing to output image file: Input/output error

Signed-off-by: Mike Maslenkin 
---
  diff from v1: removed additional fprintf call leaved in patch by accident
---
 qemu-img.c | 17 ++---
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index a48edb71015c..1a83c14212fb 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -5150,13 +5150,15 @@ static int img_dd(int argc, char **argv)
 size - in.bsz * in.offset, &error_abort);
 }
 
-ret = bdrv_create(drv, out.filename, opts, &local_err);
-if (ret < 0) {
-error_reportf_err(local_err,
-  "%s: error while creating output image: ",
-  out.filename);
-ret = -1;
-goto out;
+if (!g_file_test(out.filename, G_FILE_TEST_EXISTS)) {
+ret = bdrv_create(drv, out.filename, opts, &local_err);
+if (ret < 0) {
+error_reportf_err(local_err,
+   "%s: error while creating output image: ",
+   out.filename);
+ret = -1;
+goto out;
+}
 }
 
 /* TODO, we can't honour --image-opts for the target,
-- 
2.32.0 (Apple Git-132)




[PATCH 1/1] qemu-img: do not erase destination file in qemu-img dd command

2023-09-30 Thread Mike Maslenkin
Add a check that destination file exists and do not call bdrv_create for
this case.

Currently `qemu-img dd` command destroys content of destination file.
Effectively this means that parameters (geometry) of destination image
file are changing. This can be undesirable behavior for user especially
if format of destination image does not support resizing.

Steps to reproduce:
  1. Create empty disk image with some non default size.
   `qemu-img  create -f qcow2 $DEST_IMG 3T`
 Remember that `qemu-img info $DEST_IMG` returns:
   virtual size: 3 TiB (3298534883328 bytes)
   disk size: 240 KiB
   cluster_size: 65536
  2. Run `qemu-img dd -O qcow2 of=$DEST_IMG if=$SRC_IMG bs=1M count=100`
  3. Check `qemu-img info $DEST_IMG` output:
   virtual size: 100 MiB (104857600 bytes)
   disk size: 112 MiB
   cluster_size: 65536

Parameters of $DEST_IMG were changed. Actually `qemu-img dd` has created
a new disk based on current default geometry for particular format.
For example for "parallels" format default BAT for 256GB disk is written
to empty file prior writing disk image data.

With this patch virtual disk metadata and geometry of a destination image
are preserved. As another visible change of `qemu-img dd` behavior is that
if destination image is less than source it can finish with error (similar
to "dd" utility):
  qemu-img: error while writing to output image file: Input/output error

Signed-off-by: Mike Maslenkin 
---
 qemu-img.c | 17 ++---
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index a48edb71015c..1a83c14212fb 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -5150,13 +5150,16 @@ static int img_dd(int argc, char **argv)
 size - in.bsz * in.offset, &error_abort);
 }
 
-ret = bdrv_create(drv, out.filename, opts, &local_err);
-if (ret < 0) {
-error_reportf_err(local_err,
-  "%s: error while creating output image: ",
-  out.filename);
-ret = -1;
-goto out;
+if (!g_file_test(out.filename, G_FILE_TEST_EXISTS)) {
+ret = bdrv_create(drv, out.filename, opts, &local_err);
+fprintf (stderr, "Recreating image file\n");
+if (ret < 0) {
+error_reportf_err(local_err,
+   "%s: error while creating output image: ",
+   out.filename);
+ret = -1;
+goto out;
+}
 }
 
 /* TODO, we can't honour --image-opts for the target,
-- 
2.32.0 (Apple Git-132)




Re: [PATCH 06/21] parallels: refactor path when we need to re-check image in parallels_open

2023-09-17 Thread Mike Maslenkin
This patch generates a warning.

On Fri, Sep 15, 2023 at 9:41 PM Denis V. Lunev  wrote:
>
> More conditions follows thus the check should be more scalable.
>
> Signed-off-by: Denis V. Lunev 
> ---
>  block/parallels.c | 19 ---
>  1 file changed, 8 insertions(+), 11 deletions(-)
>
> diff --git a/block/parallels.c b/block/parallels.c
> index 8f223bfd89..aa29df9f77 100644
> --- a/block/parallels.c
> +++ b/block/parallels.c
> @@ -1065,7 +1065,7 @@ static int parallels_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  int ret, size, i;
>  int64_t file_nb_sectors, sector;
>  uint32_t data_start;
> -bool data_off_is_correct;
> +bool need_check = false;
>
>  ret = parallels_opts_prealloc(bs, options, errp);
>  if (ret < 0) {
> @@ -1133,11 +1133,12 @@ static int parallels_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  s->bat_bitmap = (uint32_t *)(s->header + 1);
>
>  if (le32_to_cpu(ph.inuse) == HEADER_INUSE_MAGIC) {
> -s->header_unclean = true;
> +need_check = s->header_unclean = true;
>  }
>
> -data_off_is_correct = parallels_test_data_off(s, file_nb_sectors,
> -  &data_start);
> +need_check = need_check ||
> + !parallels_test_data_off(s, file_nb_sectors, &data_start);
> +

../block/parallels.c:1139:18: warning: variable 'data_start' is used
uninitialized whenever '||' condition is true
[-Wsometimes-uninitialized]
need_check = need_check ||
 ^~
../block/parallels.c:1142:21: note: uninitialized use occurs here
s->data_start = data_start;
^~
../block/parallels.c:1139:18: note: remove the '||' if its condition
is always false
need_check = need_check ||
 ^
../block/parallels.c:1067:24: note: initialize the variable
'data_start' to silence this warning
uint32_t data_start;
   ^
= 0
1 warning generated.
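
Presumably the helper has to be evaluated unconditionally so that data_start
is always initialized, e.g. something like this (just a sketch, not tested):

    /* use |= instead of || so that the helper is always called and
     * data_start is always initialized */
    need_check |= !parallels_test_data_off(s, file_nb_sectors, &data_start);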


Regards,
Mike.



Re: [PATCH 18/21] parallels: naive implementation of parallels_co_pdiscard

2023-09-17 Thread Mike Maslenkin
I got a warning after this patch:

../block/parallels.c:541:25: warning: 'guarded_by' attribute only
applies to non-static data members and global variables
[-Wignored-attributes]
static int coroutine_fn GRAPH_RDLOCK_PTR
^
/Users/mg/sources/qemu/include/block/graph-lock.h:85:26: note:
expanded from macro 'GRAPH_RDLOCK_PTR'
#define GRAPH_RDLOCK_PTR TSA_GUARDED_BY(graph_lock)
 ^
/Users/mg/sources/qemu/include/qemu/clang-tsa.h:48:31: note: expanded
from macro 'TSA_GUARDED_BY'
#define TSA_GUARDED_BY(x) TSA(guarded_by(x))
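
If I read include/block/graph-lock.h correctly, GRAPH_RDLOCK_PTR
(TSA_GUARDED_BY) is meant for pointer variables, and functions are annotated
with GRAPH_RDLOCK instead, so presumably the declaration wants to be:

    static int coroutine_fn GRAPH_RDLOCK
    parallels_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)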

Regards,
Mike.


On Fri, Sep 15, 2023 at 9:42 PM Denis V. Lunev  wrote:
>
> * Discarding with backing stores is not supported by the format.
> * There is no buffering/queueing of the discard operation.
> * Only operations aligned to the cluster are supported.
>
> Signed-off-by: Denis V. Lunev 
> ---
>  block/parallels.c | 47 +++
>  1 file changed, 47 insertions(+)
>
> diff --git a/block/parallels.c b/block/parallels.c
> index 76aedfd7c4..83cb8d6722 100644
> --- a/block/parallels.c
> +++ b/block/parallels.c
> @@ -537,6 +537,52 @@ parallels_co_readv(BlockDriverState *bs, int64_t 
> sector_num, int nb_sectors,
>  return ret;
>  }
>
> +
> +static int coroutine_fn GRAPH_RDLOCK_PTR
> +parallels_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
> +{
> +int ret = 0;
> +uint32_t cluster, count;
> +BDRVParallelsState *s = bs->opaque;
> +
> +/*
> + * The image does not support ZERO mark inside the BAT, which means that
> + * stale data could be exposed from the backing file.
> + */
> +if (bs->backing) {
> +return -ENOTSUP;
> +}
> +
> +if (!QEMU_IS_ALIGNED(offset, s->cluster_size)) {
> +return -ENOTSUP;
> +} else if (!QEMU_IS_ALIGNED(bytes, s->cluster_size)) {
> +return -ENOTSUP;
> +}
> +
> +cluster = offset / s->cluster_size;
> +count = bytes / s->cluster_size;
> +
> +qemu_co_mutex_lock(&s->lock);
> +for (; count > 0; cluster++, count--) {
> +int64_t host_off = bat2sect(s, cluster) << BDRV_SECTOR_BITS;
> +if (host_off == 0) {
> +continue;
> +}
> +
> +ret = bdrv_co_pdiscard(bs->file, cluster * s->cluster_size,
> +   s->cluster_size);
> +if (ret < 0) {
> +goto done;
> +}
> +
> +parallels_set_bat_entry(s, cluster, 0);
> +bitmap_clear(s->used_bmap, host_cluster_index(s, host_off), 1);
> +}
> +done:
> +qemu_co_mutex_unlock(&s->lock);
> +return ret;
> +}
> +
>  static void parallels_check_unclean(BlockDriverState *bs,
>  BdrvCheckResult *res,
>  BdrvCheckMode fix)
> @@ -1409,6 +1455,7 @@ static BlockDriver bdrv_parallels = {
>  .bdrv_co_create = parallels_co_create,
>  .bdrv_co_create_opts= parallels_co_create_opts,
>  .bdrv_co_check  = parallels_co_check,
> +.bdrv_co_pdiscard   = parallels_co_pdiscard,
>  };
>
>  static void bdrv_parallels_init(void)
> --
> 2.34.1
>



Re: [PATCH 03/21] parallels: invent parallels_opts_prealloc() helper to parse prealloc opts

2023-09-17 Thread Mike Maslenkin
This is not introduced by this patch,
but it looks like a qemu_opts_del(opts) call is missing; a small sketch follows below.
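
Something along these lines, i.e. the function from the patch below, but with
every exit path funneled through a label that frees the opts (just a sketch,
not tested):

    static int parallels_opts_prealloc(BlockDriverState *bs, QDict *options,
                                       Error **errp)
    {
        char *buf;
        int64_t bytes;
        BDRVParallelsState *s = bs->opaque;
        Error *local_err = NULL;
        int ret = -EINVAL;
        QemuOpts *opts = qemu_opts_create(&parallels_runtime_opts, NULL, 0, errp);

        if (!opts) {
            return -ENOMEM;
        }
        if (!qemu_opts_absorb_qdict(opts, options, errp)) {
            goto done;
        }

        bytes = qemu_opt_get_size_del(opts, PARALLELS_OPT_PREALLOC_SIZE, 0);
        s->prealloc_size = bytes >> BDRV_SECTOR_BITS;

        buf = qemu_opt_get_del(opts, PARALLELS_OPT_PREALLOC_MODE);
        s->prealloc_mode = qapi_enum_parse(&prealloc_mode_lookup, buf,
                                           PRL_PREALLOC_MODE_FALLOCATE,
                                           &local_err);
        g_free(buf);
        if (local_err != NULL) {
            error_propagate(errp, local_err);
            goto done;
        }
        ret = 0;
    done:
        qemu_opts_del(opts);    /* free the options on every exit path */
        return ret;
    }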

On Fri, Sep 15, 2023 at 9:41 PM Denis V. Lunev  wrote:
>
> This patch creates above mentioned helper and moves its usage to the
> beginning of parallels_open(). This simplifies parallels_open() a bit.
>
> The patch also ensures that we store prealloc_size on block driver state
> always in sectors. This makes code cleaner and avoids wrong opinion at
> the assignment that the value is in bytes.
>
> Signed-off-by: Denis V. Lunev 
> ---
>  block/parallels.c | 65 +++
>  1 file changed, 38 insertions(+), 27 deletions(-)
>
> diff --git a/block/parallels.c b/block/parallels.c
> index 428f72de1c..1d5409f2ba 100644
> --- a/block/parallels.c
> +++ b/block/parallels.c
> @@ -1025,6 +1025,38 @@ static int parallels_update_header(BlockDriverState 
> *bs)
>  return bdrv_pwrite_sync(bs->file, 0, size, s->header, 0);
>  }
>
> +
> +static int parallels_opts_prealloc(BlockDriverState *bs, QDict *options,
> +   Error **errp)
> +{
> +char *buf;
> +int64_t bytes;
> +BDRVParallelsState *s = bs->opaque;
> +Error *local_err = NULL;
> +QemuOpts *opts = qemu_opts_create(&parallels_runtime_opts, NULL, 0, 
> errp);
> +if (!opts) {
> +return -ENOMEM;
> +}
> +
> +if (!qemu_opts_absorb_qdict(opts, options, errp)) {
> +return -EINVAL;
> +}
> +
> +bytes = qemu_opt_get_size_del(opts, PARALLELS_OPT_PREALLOC_SIZE, 0);
> +s->prealloc_size = bytes >> BDRV_SECTOR_BITS;

qemu_opt_get_size_del returns uint64_t, so what's a reason to declare
"bytes" variable  as int64_t
and then shift it to the right?  I see here it can not be negative,
but it's a common to use signed values and not to add explicit check
before shifting to right In this file
I takes time to ensure that initial values are not negative.
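
Just to illustrate the point: keeping the unsigned type would side-step the
question entirely (sketch only):

    uint64_t bytes = qemu_opt_get_size_del(opts, PARALLELS_OPT_PREALLOC_SIZE, 0);
    s->prealloc_size = bytes >> BDRV_SECTOR_BITS;   /* no sign to worry about */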

Regards,
Mike.



> +buf = qemu_opt_get_del(opts, PARALLELS_OPT_PREALLOC_MODE);
> +/* prealloc_mode can be downgraded later during allocate_clusters */
> +s->prealloc_mode = qapi_enum_parse(&prealloc_mode_lookup, buf,
> +   PRL_PREALLOC_MODE_FALLOCATE,
> +   &local_err);
> +g_free(buf);
> +if (local_err != NULL) {
> +error_propagate(errp, local_err);
> +return -EINVAL;
> +}
> +return 0;
> +}
> +
>  static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
>Error **errp)
>  {
> @@ -1033,11 +1065,13 @@ static int parallels_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  int ret, size, i;
>  int64_t file_nb_sectors, sector;
>  uint32_t data_start;
> -QemuOpts *opts = NULL;
> -Error *local_err = NULL;
> -char *buf;
>  bool data_off_is_correct;
>
> +ret = parallels_opts_prealloc(bs, options, errp);
> +if (ret < 0) {
> +return ret;
> +}
> +
>  ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
>  if (ret < 0) {
>  return ret;
> @@ -1078,6 +1112,7 @@ static int parallels_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  ret = -EFBIG;
>  goto fail;
>  }
> +s->prealloc_size = MAX(s->tracks, s->prealloc_size);
>  s->cluster_size = s->tracks << BDRV_SECTOR_BITS;
>
>  s->bat_size = le32_to_cpu(ph.bat_entries);
> @@ -1117,29 +1152,6 @@ static int parallels_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  s->header_size = size;
>  }
>
> -opts = qemu_opts_create(&parallels_runtime_opts, NULL, 0, errp);
> -if (!opts) {
> -goto fail_options;
> -}
> -
> -if (!qemu_opts_absorb_qdict(opts, options, errp)) {
> -goto fail_options;
> -}
> -
> -s->prealloc_size =
> -qemu_opt_get_size_del(opts, PARALLELS_OPT_PREALLOC_SIZE, 0);
> -s->prealloc_size = MAX(s->tracks, s->prealloc_size >> BDRV_SECTOR_BITS);
> -buf = qemu_opt_get_del(opts, PARALLELS_OPT_PREALLOC_MODE);
> -/* prealloc_mode can be downgraded later during allocate_clusters */
> -s->prealloc_mode = qapi_enum_parse(&prealloc_mode_lookup, buf,
> -   PRL_PREALLOC_MODE_FALLOCATE,
> -   &local_err);
> -g_free(buf);
> -if (local_err != NULL) {
> -error_propagate(errp, local_err);
> -goto fail_options;
> -}
> -
>  if (ph.ext_off) {
>  if (flags & BDRV_O_RDWR) {
>  /*
> @@ -1214,7 +1226,6 @@ static int parallels_open(BlockDriverState *bs, QDict 
> *options, int flags,
>
>  fail_format:
>  error_setg(errp, "Image not in Parallels format");
> -fail_options:
>  ret = -EINVAL;
>  fail:
>  /*
> --
> 2.34.1
>



Re: Lost partition tables on ide-hd + ahci drive

2023-08-26 Thread Mike Maslenkin
On Wed, Aug 23, 2023 at 12:17 PM Fiona Ebner  wrote:
>
> Am 23.08.23 um 10:47 schrieb Fiona Ebner:
> > Am 17.02.23 um 22:22 schrieb Mike Maslenkin:
> >> I can not tell anything about dma-reentracy issues, but yes, i would
> >> start to look at check_cmd() function call sequence.
> >> The most interesting is why Sector Count = 1. I thought about race
> >> with IDE reset where registers initialized with
> >> value SATA_SIGNATURE_DISK = 0x0101, but this means LBA=1 as well...
> >>
> >
> > You got it! Since we got another report (after half a year of nothing)
> > and also because of Simon's mail, I gave it another shot too and was
> > finally able to reproduce the issue (with our patched QEMU 8.0, but
> > patches shouldn't affect IDE code). See below for the traces that
> > confirm your theory. The reason the write goes to sector 0 and not 1 is
> > because ide_dma_cb() uses sector_num = ide_get_sector(s); and that will
> > evaluate to 0 after a reset.
> >
> > So the issue is indeed that ide_dma_cb can get called with an IDEState
> > just after that state was reset. Can we somehow wait for pending
> > requests before proceeding with the reset, or can we force an error
> > return for callbacks that are still pending during reset?
> >
>
> I noticed that ide_bus_reset() does the reset first and then cancels the
> aiocb. Maybe it's already enough to switch those around?
>
> Best Regards,
> Fiona

Great job! Patch looks good to me.

Since the reason is known now, it may be easier to reproduce the original
case again, but with NCQ disabled.
There is no command line argument for that, so QEMU has to be rebuilt
without announcing the HOST_CAP_NCQ capability (see the sketch below).
I'd expect this to greatly increase the chances of catching the original
corruption.
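
From memory (untested, the exact line will differ between versions), it
should be enough to drop HOST_CAP_NCQ from the capability register that is
set up in ahci_reg_init() in hw/ide/ahci.c:

    s->control_regs.cap = (s->ports - 1) |
                          (AHCI_NUM_COMMAND_SLOTS << 8) |
                          (AHCI_SUPPORTED_SPEED_GEN1 << AHCI_SUPPORTED_SPEED) |
                          HOST_CAP_AHCI | HOST_CAP_64;   /* HOST_CAP_NCQ dropped */

With that, the guest driver should fall back to non-NCQ DMA commands.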

Best Regards,
Mike.



Re: [PATCH 0/3] hw/ufs: fix compilation warnings

2023-08-02 Thread Mike Maslenkin
Hello All,

I'm ok with that.

Regards,
Mike.

On Wed, Aug 2, 2023 at 3:52 AM Jeuk Kim  wrote:
>
> On 8/2/2023 6:03 AM, Philippe Mathieu-Daudé wrote:
> > Hi Mike,
> >
> > On 28/7/23 01:34, Mike Maslenkin wrote:
> >> This patchset contains a trivial compilation fixes for UFS support
> >> applied to block-next tree.
> >
> > Since the series isn't merged, it would be clearer to send
> > a v9 of "hw/ufs: Add Universal Flash Storage (UFS) support"
> > with the fixes squashed in (there is still time).
> >
> > Regards,
> >
> > Phil.
> >
>
> Hi Phil,
> Thanks for your comment.
> If Mike is okay, I'll send v9 of "hw/ufs: Add Universal Flash Storage
> UFS) support" with the fixes.
>
> To Mike,
> Is it okay with you if I make a patch v9, incorporating your fixes?



[PATCH 3/3] hw/ufs: change ufs_process_db signature

2023-07-27 Thread Mike Maslenkin
Actually UTRLDBR is a 32-bit register. There is no need to pass a 64-bit
value to the ufs_process_db() function.

Cc: Jeuk Kim 
Cc: Stefan Hajnoczi 
Signed-off-by: Mike Maslenkin 
---
 hw/ufs/ufs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/ufs/ufs.c b/hw/ufs/ufs.c
index b0656e47598e..af57ba6df02c 100644
--- a/hw/ufs/ufs.c
+++ b/hw/ufs/ufs.c
@@ -256,7 +256,7 @@ static void ufs_irq_check(UfsHc *u)
 }
 }
 
-static void ufs_process_db(UfsHc *u, uint64_t val)
+static void ufs_process_db(UfsHc *u, uint32_t val)
 {
 uint32_t slot;
 uint32_t nutrs = u->params.nutrs;
-- 
2.32.0 (Apple Git-132)




[PATCH 2/3] hw/ufs: fix compilation warning

2023-07-27 Thread Mike Maslenkin
This patch fixes a compilation warning: the argument to ufs_process_db()
is passed to find_first_bit(), which expects an unsigned long value.

The exact warnings are:

warning: incompatible pointer types passing 'uint64_t *' (aka 'unsigned
long long *') to parameter of type 'const unsigned long *'
[-Wincompatible-pointer-types]
slot = find_first_bit(, nutrs);
  ^~~~
warning: incompatible pointer types passing 'uint64_t *' (aka 'unsigned
long long *') to parameter of type 'const unsigned long *'
[-Wincompatible-pointer-types]
slot = find_next_bit(, nutrs, slot + 1);
 ^~~~

Cc: Jeuk Kim 
Cc: Stefan Hajnoczi 
Signed-off-by: Mike Maslenkin 
---
 hw/ufs/ufs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/ufs/ufs.c b/hw/ufs/ufs.c
index af32366c8504..b0656e47598e 100644
--- a/hw/ufs/ufs.c
+++ b/hw/ufs/ufs.c
@@ -267,7 +267,7 @@ static void ufs_process_db(UfsHc *u, uint64_t val)
 return;
 }
 
-slot = find_first_bit(, nutrs);
+slot = find_first_bit((unsigned long *) , nutrs);
 
 while (slot < nutrs) {
 req = >req_list[slot];
@@ -283,7 +283,7 @@ static void ufs_process_db(UfsHc *u, uint64_t val)
 
 trace_ufs_process_db(slot);
 req->state = UFS_REQUEST_READY;
-slot = find_next_bit(, nutrs, slot + 1);
+slot = find_next_bit((unsigned long *) , nutrs, slot + 1);
 }
 
 qemu_bh_schedule(u->doorbell_bh);
-- 
2.32.0 (Apple Git-132)




[PATCH 1/3] hw/ufs: fix compilation warning

2023-07-27 Thread Mike Maslenkin
This patch fixes a compilation warning: implicit conversion from enumeration
type 'enum UfsRequestState' to different enumeration type 'UfsReqResult'
(aka 'enum UfsReqResult') [-Wenum-conversion]

ufs_exec_scsi_cmd() returns a value from UfsReqResult enum.

Cc: Jeuk Kim 
Cc: Stefan Hajnoczi 
Signed-off-by: Mike Maslenkin 
---
 hw/ufs/ufs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/ufs/ufs.c b/hw/ufs/ufs.c
index 1760e0f88d70..af32366c8504 100644
--- a/hw/ufs/ufs.c
+++ b/hw/ufs/ufs.c
@@ -514,10 +514,10 @@ static UfsReqResult ufs_exec_scsi_cmd(UfsRequest *req)
 if (!is_wlun(lun)) {
 if (lun >= u->device_desc.number_lu) {
 trace_ufs_err_scsi_cmd_invalid_lun(lun);
-return UFS_REQUEST_ERROR;
+return UFS_REQUEST_FAIL;
 } else if (u->lus[lun] == NULL) {
 trace_ufs_err_scsi_cmd_invalid_lun(lun);
-return UFS_REQUEST_ERROR;
+return UFS_REQUEST_FAIL;
 }
 }
 
-- 
2.32.0 (Apple Git-132)




[PATCH 0/3] hw/ufs: fix compilation warnings

2023-07-27 Thread Mike Maslenkin
This patchset contains trivial compilation fixes for the UFS support
applied to the block-next tree.

Cc: Jeuk Kim 
Cc: Stefan Hajnoczi 
Signed-off-by: Mike Maslenkin 




Re: [PATCH] Revert "virtio-scsi: Send "REPORTED LUNS CHANGED" sense data upon disk hotplug events"

2023-07-11 Thread Mike Christie
What was the issue you are seeing?

Was it something like: you get the UA, we retry, and then on one of the
retries the sense is not set up correctly, so the scsi error handler
runs? That fails and the device goes offline?

If you turn on scsi debugging you would see:


[  335.445922] sd 0:0:0:0: [sda] tag#15 Add. Sense: Reported luns data has 
changed
[  335.445922] sd 0:0:0:0: [sda] tag#16 00 00 00 00 00 00 00 00 00 00 00 00 00 
00 00 00
[  335.445925] sd 0:0:0:0: [sda] tag#16 00 00 00 00 00 00 00 00 00 00 00 00 00 
00 00 00
[  335.445929] sd 0:0:0:0: [sda] tag#17 Done: FAILED Result: hostbyte=DID_OK 
driverbyte=DRIVER_OK cmd_age=0s
[  335.445932] sd 0:0:0:0: [sda] tag#17 CDB: Write(10) 2a 00 00 db 4f c0 00 00 
20 00
[  335.445934] sd 0:0:0:0: [sda] tag#17 00 00 00 00 00 00 00 00 00 00 00 00 00 
00 00 00
[  335.445936] sd 0:0:0:0: [sda] tag#17 00 00 00 00 00 00 00 00 00 00 00 00 00 
00 00 00
[  335.445938] sd 0:0:0:0: [sda] tag#17 00 00 00 00 00 00 00 00 00 00 00 00 00 
00 00 00
[  335.445940] sd 0:0:0:0: [sda] tag#17 00 00 00 00 00 00 00 00 00 00 00 00 00 
00 00 00
[  335.445942] sd 0:0:0:0: [sda] tag#17 00 00 00 00 00 00 00 00 00 00 00 00 00 
00 00 00
[  335.445945] sd 0:0:0:0: [sda] tag#17 00 00 00 00 00 00 00 00 00 00 00 00 00 
00 00 00
[  335.451447] scsi host0: scsi_eh_0: waking up 0/2/2
[  335.451453] scsi host0: Total of 2 commands on 1 devices require eh work
[  335.451457] sd 0:0:0:0: [sda] tag#16 scsi_eh_0: requesting sense


I don't know the qemu scsi code well, but I scanned the code for my co-worker
and my guess was commit 8cc5583abe6419e7faaebc9fbd109f34f4c850f2 had a race in 
it.

How is locking done when it is a bus-level UA but there are multiple devices
on the bus?

Is it possible that devA is clearing the sense on devB? For example, thread1 for
devA is doing scsi_clear_unit_attention, but thread2 for devB has seen that
bus->unit_attention is set, so it sets the req ops to reqops_unit_attention. But when
we run reqops_unit_attention.send_command, scsi_unit_attention does not see
req->bus->unit_attention set anymore, so we get a CC with no sense.

If the Linux kernel SCSI layer sees a CC with no sense, then we fire the SCSI
error handler, as seen above in the logs.
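
To spell out the interleaving I am guessing at (purely hypothetical, using
the function names above):

    /* thread1 (devA)                      thread2 (devB)
     *
     *                                     sees bus->unit_attention set,
     *                                     req ops = &reqops_unit_attention
     * scsi_clear_unit_attention()
     *   clears bus->unit_attention
     *                                     send_command -> scsi_unit_attention()
     *                                       bus->unit_attention no longer set
     *                                       -> CHECK CONDITION with no sense
     */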


On 7/11/23 12:06 PM, Stefano Garzarella wrote:
> CCing `./scripts/get_maintainer.pl -f drivers/scsi/virtio_scsi.c`,
> since I found a few things in the virtio-scsi driver...
> 
> FYI we have seen that Linux has problems with a QEMU patch for the
> virtio-scsi device (details at the bottom of this email in the revert
> commit message and BZ).
> 
> 
> This is what I found when I looked at the Linux code:
> 
> In scsi_report_sense() in linux/drivers/scsi/scsi_error.c linux calls
> scsi_report_lun_change() that set `sdev_target->expecting_lun_change =
> 1` when we receive a UNIT ATTENTION with REPORT LUNS CHANGED
> (sshdr->asc == 0x3f && sshdr->ascq == 0x0e).
> 
> When `sdev_target->expecting_lun_change = 1` is set and we call
> scsi_check_sense(), for example to check the next UNIT ATTENTION, it
> will return NEEDS_RETRY, that I think will cause the issues we are
> seeing.
> 
> `sdev_target->expecting_lun_change` is reset only in
> scsi_decide_disposition() when `REPORT_LUNS` command returns with
> SAM_STAT_GOOD.
> That command is issued in scsi_report_lun_scan() called by
> __scsi_scan_target(), called for example by scsi_scan_target(),
> scsi_scan_host(), etc.
> 
> So, checking QEMU, we send VIRTIO_SCSI_EVT_RESET_RESCAN during hotplug
> and VIRTIO_SCSI_EVT_RESET_REMOVED during hotunplug. In both cases now we
> send also the UNIT ATTENTION.
> 
> In the virtio-scsi driver, when we receive VIRTIO_SCSI_EVT_RESET_RESCAN
> (hotplug) we call scsi_scan_target() or scsi_add_device(). Both of them
> will call __scsi_scan_target() at some points, sending `REPORT_LUNS`
> command to the device. This does not happen for
> VIRTIO_SCSI_EVT_RESET_REMOVED (hotunplug). Indeed if I remove the
> UNIT ATTENTION from the hotunplug in QEMU, everything works well.
> 
> So, I tried to add a scan also for VIRTIO_SCSI_EVT_RESET_REMOVED:
> 
> diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
> index bd5633667d01..c57658a63097 100644
> --- a/drivers/scsi/virtio_scsi.c
> +++ b/drivers/scsi/virtio_scsi.c
> @@ -291,6 +291,7 @@ static void virtscsi_handle_transport_reset(struct 
> virtio_scsi *vscsi,
>     }
>     break;
>     case VIRTIO_SCSI_EVT_RESET_REMOVED:
> +   scsi_scan_host(shost);
>     sdev = scsi_device_lookup(shost, 0, target, lun);
>     if (sdev) {
>     scsi_remove_device(sdev);
> 
> This somehow helps, now linux only breaks if the plug/unplug frequency
> is really high. If I put a 5 second sleep between plug/unplug events, it
> doesn't break (at least for the duration of my test which has been
> running for about 30 minutes, before it used to break after about a
> minute).
> 
> Another thing I noticed is that in QEMU maybe we should set the UNIT
> ATTENTION 

Re: [PATCH qemu v5] aspeed add montblanc bmc reference from fuji

2023-07-05 Thread Mike Choi
Hi Sittisak,

Minipack3 is not open-sourced yet, and we are unlikely to be able to upstream 
detailed data.


  1.  What are these FRUID data for? Is it for testing?
  2.  What other options do we have, since we are not able to upstream the FRUID 
data? (It is still OK to upstream the system configuration, but NOT the 
*_fruid data arrays)

Thanks,
Mike


From: Cédric Le Goater 
Date: Tuesday, July 4, 2023 at 7:07 AM
To: Sittisak Sinprem , Bin Huang , 
Tao Ren , Mike Choi 
Cc: qemu-devel@nongnu.org , qemu-...@nongnu.org 
, peter.mayd...@linaro.org , 
and...@aj.id.au , Joel Stanley , 
qemu-sta...@nongnu.org , srika...@celestica.com 
, ssu...@celestica.com , 
thangavel...@celestica.com , kgen...@celestica.com 
, anandaram...@celestica.com 
Subject: Re: [PATCH qemu v5] aspeed add montblanc bmc reference from fuji

On 7/4/23 15:27, Sittisak Sinprem wrote:
> Hi Meta Team,
>
> the FRU EEPROM content, I think for now detail still be confidential,
> Please confirm, Can we add the description in Qemu upstream following 
> Cedric's request?

We don't need all the details, and not the confidential part of course.

C.

>
> On Tue, Jul 4, 2023 at 6:19 PM Cédric Le Goater  <mailto:c...@kaod.org>> wrote:
>
> On 7/4/23 13:06, ~ssinprem wrote:
>  > From: Sittisak Sinprem  <mailto:ssinp...@celestica.com>>
>  >
>  > - I2C list follow I2C Tree v1.6 20230320
>  > - fru eeprom data use FB FRU format version 4
>  >
>  > Signed-off-by: Sittisak Sinprem  <mailto:ssinp...@celestica.com>>
>
> You shoot too fast :) Please add some description for the EEPROM contents.
> What they enable when the OS/FW boots is good to know for QEMU.
>
> Thanks,
>
> C.
>
>
>  > ---
>  >   docs/system/arm/aspeed.rst |  1 +
>  >   hw/arm/aspeed.c| 65 
> ++
>  >   hw/arm/aspeed_eeprom.c | 50 +
>  >   hw/arm/aspeed_eeprom.h |  7 
>  >   4 files changed, 123 insertions(+)
>  >
>  > diff --git a/docs/system/arm/aspeed.rst b/docs/system/arm/aspeed.rst
>  > index 80538422a1..5e0824f48b 100644
>  > --- a/docs/system/arm/aspeed.rst
>  > +++ b/docs/system/arm/aspeed.rst
>  > @@ -33,6 +33,7 @@ AST2600 SoC based machines :
>  >   - ``tacoma-bmc``   OpenPOWER Witherspoon POWER9 AST2600 BMC
>  >   - ``rainier-bmc``  IBM Rainier POWER10 BMC
>  >   - ``fuji-bmc`` Facebook Fuji BMC
>  > +- ``montblanc-bmc``Facebook Montblanc BMC
>  >   - ``bletchley-bmc``Facebook Bletchley BMC
>  >   - ``fby35-bmc``Facebook fby35 BMC
>  >   - ``qcom-dc-scm-v1-bmc``   Qualcomm DC-SCM V1 BMC
>  > diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
>  > index 9fca644d92..bbb7a3392c 100644
>  > --- a/hw/arm/aspeed.c
>  > +++ b/hw/arm/aspeed.c
>  > @@ -189,6 +189,10 @@ struct AspeedMachineState {
>  >   #define FUJI_BMC_HW_STRAP10x
>  >   #define FUJI_BMC_HW_STRAP20x
>  >
>  > +/* Montblanc hardware value */
>  > +#define MONTBLANC_BMC_HW_STRAP10x
>  > +#define MONTBLANC_BMC_HW_STRAP20x
>  > +
>  >   /* Bletchley hardware value */
>  >   /* TODO: Leave same as EVB for now. */
>  >   #define BLETCHLEY_BMC_HW_STRAP1 AST2600_EVB_HW_STRAP1
>  > @@ -925,6 +929,41 @@ static void fuji_bmc_i2c_init(AspeedMachineState 
> *bmc)
>  >   }
>  >   }
>  >
>  > +static void montblanc_bmc_i2c_init(AspeedMachineState *bmc)
>  > +{
 >  > +AspeedSoCState *soc = &bmc->soc;
>  > +I2CBus *i2c[16] = {};
>  > +
>  > +for (int i = 0; i < 16; i++) {
 >  > +i2c[i] = aspeed_i2c_get_bus(&soc->i2c, i);
>  > +}
>  > +
>  > +/* Ref from Minipack3_I2C_Tree_V1.6 20230320 */
>  > +at24c_eeprom_init_rom(i2c[3], 0x56, 8192, montblanc_scm_fruid,
>  > +  montblanc_scm_fruid_len);
>  > +at24c_eeprom_init_rom(i2c[6], 0x53, 8192, montblanc_fcm_fruid,
>  > +  montblanc_fcm_fruid_len);
>  > +
>  > +/* CPLD and FPGA */
>  > +at24c_eeprom_init(i2c[1], 0x35, 256);  /* SCM CPLD */
>  > +at24c_eeprom_init(i2c[5], 0x35, 256);  /* COMe CPLD TODO: need to 
> update

Re: [RFC PATCH 00/19] hugetlb support for KVM guest_mem

2023-06-16 Thread Mike Kravetz
On 06/06/23 19:03, Ackerley Tng wrote:
> Hello,
> 
> This patchset builds upon a soon-to-be-published WIP patchset that Sean
> published at https://github.com/sean-jc/linux/tree/x86/kvm_gmem_solo, 
> mentioned
> at [1].
> 
> The tree can be found at:
> https://github.com/googleprodkernel/linux-cc/tree/gmem-hugetlb-rfc-v1
> 
> In this patchset, hugetlb support for KVM's guest_mem (aka gmem) is 
> introduced,
> allowing VM private memory (for confidential computing) to be backed by 
> hugetlb
> pages.
> 
> guest_mem provides userspace with a handle, with which userspace can allocate
> and deallocate memory for confidential VMs without mapping the memory into
> userspace.

Hello Ackerley,

I am not sure if you are aware or, have been following the hugetlb HGM
discussion in this thread:
https://lore.kernel.org/linux-mm/20230306191944.GA15773@monkey/

There we are trying to decide if HGM should be added to hugetlb, or if
perhaps a new filesystem/driver/allocator should be created.  The concern
is added complexity to hugetlb as well as core mm special casing.  Note
that HGM is addressing issues faced by existing hugetlb users.

Your proposal here suggests modifying hugetlb so that it can be used in
a new way (use case) by KVM's guest_mem.  As such it really seems like
something that should be done in a separate filesystem/driver/allocator.
You will likely not get much support for modifying hugetlb.

-- 
Mike Kravetz

> Why use hugetlb instead of introducing a new allocator, like gmem does for 4K
> and transparent hugepages?
> 
> + hugetlb provides the following useful functionality, which would otherwise
>   have to be reimplemented:
> + Allocation of hugetlb pages at boot time, including
> + Parsing of kernel boot parameters to configure hugetlb
> + Tracking of usage in hstate
> + gmem will share the same system-wide pool of hugetlb pages, so users
>   don't have to have separate pools for hugetlb and gmem
> + Page accounting with subpools
> + hugetlb pages are tracked in subpools, which gmem uses to reserve
>   pages from the global hstate
> + Memory charging
> + hugetlb provides code that charges memory to cgroups
> + Reporting: hugetlb usage and availability are available at 
> /proc/meminfo,
>   etc
> 
> The first 11 patches in this patchset is a series of refactoring to decouple
> hugetlb and hugetlbfs.
> 
> The central thread binding the refactoring is that some functions (like
> inode_resv_map(), inode_subpool(), inode_hstate(), etc) rely on a hugetlbfs
> concept, that the resv_map, subpool, hstate, are in a specific field in a
> hugetlb inode.
> 
> Refactoring to parametrize functions by hstate, subpool, resv_map will allow
> hugetlb to be used by gmem and in other places where these data structures
> aren't necessarily stored in the same positions in the inode.
> 
> The refactoring proposed here is just the minimum required to get a
> proof-of-concept working with gmem. I would like to get opinions on this
> approach before doing further refactoring. (See TODOs)
> 
> TODOs:
> 
> + hugetlb/hugetlbfs refactoring
> + remove_inode_hugepages() no longer needs to be exposed, it is hugetlbfs
>   specific and used only in inode.c
> + remove_mapping_hugepages(), remove_inode_single_folio(),
>   hugetlb_unreserve_pages() shouldn't need to take inode as a parameter
> + Updating inode->i_blocks can be refactored to a separate function 
> and
>   called from hugetlbfs and gmem
> + alloc_hugetlb_folio_from_subpool() shouldn't need to be parametrized by
>   vma
> + hugetlb_reserve_pages() should be refactored to be symmetric with
>   hugetlb_unreserve_pages()
> + It should be parametrized by resv_map
> + alloc_hugetlb_folio_from_subpool() could perhaps use
>   hugetlb_reserve_pages()?
> + gmem
> + Figure out if resv_map should be used by gmem at all
> + Probably needs more refactoring to decouple resv_map from hugetlb
>   functions
> 
> Questions for the community:
> 
> 1. In this patchset, every gmem file backed with hugetlb is given a new
>subpool. Is that desirable?
> + In hugetlbfs, a subpool always belongs to a mount, and hugetlbfs has one
>   mount per hugetlb size (2M, 1G, etc)
> + memfd_create(MFD_HUGETLB) effectively returns a full hugetlbfs file, so 
> it
>   (rightfully) uses the hugetlbfs kernel mounts and their subpools
> + I gave each file a subpool mostly to speed up implementation and still 
> be
>   able to reserve hugetlb pages from the global hstate based on the gmem
>   file size.
> + gmem, unlike hugetlbfs, isn't meant to be a

[PATCH] udmabuf: revert 'Add support for mapping hugepages (v4)'

2023-06-08 Thread Mike Kravetz
This effectively reverts commit 16c243e99d33 ("udmabuf: Add support
for mapping hugepages (v4)").  Recently, Junxiao Chang found a BUG
with page map counting as described here [1].  This issue pointed out
that the udmabuf driver was making direct use of subpages of hugetlb
pages.  This is not a good idea, and no other mm code attempts such use.
In addition to the mapcount issue, this also causes issues with hugetlb
vmemmap optimization and page poisoning.

For now, remove hugetlb support.

If udmabuf wants to be used on hugetlb mappings, it should be changed to
only use complete hugetlb pages.  This will require different alignment
and size requirements on the UDMABUF_CREATE API.

[1] 
https://lore.kernel.org/linux-mm/20230512072036.1027784-1-junxiao.ch...@intel.com/

Fixes: 16c243e99d33 ("udmabuf: Add support for mapping hugepages (v4)")
Cc: 
Signed-off-by: Mike Kravetz 
---
 drivers/dma-buf/udmabuf.c | 47 +--
 1 file changed, 6 insertions(+), 41 deletions(-)

diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c
index 01f2e86f3f7c..12cf6bb2e3ce 100644
--- a/drivers/dma-buf/udmabuf.c
+++ b/drivers/dma-buf/udmabuf.c
@@ -12,7 +12,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
@@ -207,9 +206,7 @@ static long udmabuf_create(struct miscdevice *device,
struct udmabuf *ubuf;
struct dma_buf *buf;
pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit;
-   struct page *page, *hpage = NULL;
-   pgoff_t subpgoff, maxsubpgs;
-   struct hstate *hpstate;
+   struct page *page;
int seals, ret = -EINVAL;
u32 i, flags;
 
@@ -245,7 +242,7 @@ static long udmabuf_create(struct miscdevice *device,
if (!memfd)
goto err;
mapping = memfd->f_mapping;
-   if (!shmem_mapping(mapping) && !is_file_hugepages(memfd))
+   if (!shmem_mapping(mapping))
goto err;
seals = memfd_fcntl(memfd, F_GET_SEALS, 0);
if (seals == -EINVAL)
@@ -256,48 +253,16 @@ static long udmabuf_create(struct miscdevice *device,
goto err;
pgoff = list[i].offset >> PAGE_SHIFT;
pgcnt = list[i].size   >> PAGE_SHIFT;
-   if (is_file_hugepages(memfd)) {
-   hpstate = hstate_file(memfd);
-   pgoff = list[i].offset >> huge_page_shift(hpstate);
-   subpgoff = (list[i].offset &
-   ~huge_page_mask(hpstate)) >> PAGE_SHIFT;
-   maxsubpgs = huge_page_size(hpstate) >> PAGE_SHIFT;
-   }
for (pgidx = 0; pgidx < pgcnt; pgidx++) {
-   if (is_file_hugepages(memfd)) {
-   if (!hpage) {
-   hpage = find_get_page_flags(mapping, 
pgoff,
-   
FGP_ACCESSED);
-   if (!hpage) {
-   ret = -EINVAL;
-   goto err;
-   }
-   }
-   page = hpage + subpgoff;
-   get_page(page);
-   subpgoff++;
-   if (subpgoff == maxsubpgs) {
-   put_page(hpage);
-   hpage = NULL;
-   subpgoff = 0;
-   pgoff++;
-   }
-   } else {
-   page = shmem_read_mapping_page(mapping,
-  pgoff + pgidx);
-   if (IS_ERR(page)) {
-   ret = PTR_ERR(page);
-   goto err;
-   }
+   page = shmem_read_mapping_page(mapping, pgoff + pgidx);
+   if (IS_ERR(page)) {
+   ret = PTR_ERR(page);
+   goto err;
}
ubuf->pages[pgbuf++] = page;
}
fput(memfd);
memfd = NULL;
-   if (hpage) {
-   put_page(hpage);
-   hpage = NULL;
-   }
}
 
exp_info.ops  = &udmabuf_ops;
-- 
2.40.1




Re: [PATCH v4 3/5] parallels: Add checking and repairing duplicate offsets in BAT

2023-04-28 Thread Mike Maslenkin
There is another issue with the host_cluster_index() function.
After this patchset is applied, `qemu-img check -f parallels some_disk`
aborts for an empty (just created) disk image.
The problem is that host_cluster_index() returns 0 and then
bitmap_new(0) raises an abort.

For a default empty disk s->header->data_off = 2048 and
res->image_end_offset = 1048576, so with these values substituted
host_cluster_index() effectively computes:

static uint32_t host_cluster_index(BDRVParallelsState *s, int64_t off /* 1048576 */)
{
    off = 1048576 - (le32_to_cpu(2048) << 9);   /* = 0 */
    return 0 / 1048576;                         /* = 0 */
}

Could you check this case?
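
One trivial way out would be to bail out early when the image has no data
clusters yet, e.g. (untested):

    bitmap_size = host_cluster_index(s, res->image_end_offset);
    if (bitmap_size == 0) {
        /* no allocated clusters, nothing to check for duplicates */
        return 0;
    }
    bitmap = bitmap_new(bitmap_size);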

Regards,
Mike.

On Thu, Apr 27, 2023 at 3:29 PM Alexander Ivanov
 wrote:
>
> Good point. Thank you.
>
> Best regards,
> Alexander Ivanov
>
> On 4/26/23 23:56, Mike Maslenkin wrote:
> > On Mon, Apr 24, 2023 at 12:44 PM Alexander Ivanov
> >  wrote:
> >> Cluster offsets must be unique among all the BAT entries. Find duplicate
> >> offsets in the BAT and fix it by copying the content of the relevant
> >> cluster to a newly allocated cluster and set the new cluster offset to the
> >> duplicated entry.
> >>
> >> Add host_cluster_index() helper to deduplicate the code.
> >>
> >> Move parallels_fix_leak() call to parallels_co_check() to fix both types
> >> of leak: real corruption and a leak produced by allocate_clusters()
> >> during deduplication.
> >>
> >> Signed-off-by: Alexander Ivanov 
> >> ---
> >>   block/parallels.c | 134 --
> >>   1 file changed, 129 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/block/parallels.c b/block/parallels.c
> >> index ec89ed894b..3b992e8173 100644
> >> --- a/block/parallels.c
> >> +++ b/block/parallels.c
> >> @@ -136,6 +136,12 @@ static int cluster_remainder(BDRVParallelsState *s, 
> >> int64_t sector_num,
> >>   return MIN(nb_sectors, ret);
> >>   }
> >>
> >> +static uint32_t host_cluster_index(BDRVParallelsState *s, int64_t off)
> >> +{
> >> +off -= s->header->data_off << BDRV_SECTOR_BITS;
> >> +return off / s->cluster_size;
> >> +}
> >> +
> > I guess  there should be:
> > off -= le32_to_cpu(s->header->data_off) << BDRV_SECTOR_BITS
> >
> > Regards,
> > Mike.
>



Re: [PATCH v4 3/5] parallels: Add checking and repairing duplicate offsets in BAT

2023-04-27 Thread Mike Maslenkin
Sorry for the noise again , but I have another note

On Mon, Apr 24, 2023 at 12:44 PM Alexander Ivanov
 wrote:
>
> Cluster offsets must be unique among all the BAT entries. Find duplicate
> offsets in the BAT and fix it by copying the content of the relevant
> cluster to a newly allocated cluster and set the new cluster offset to the
> duplicated entry.
>
> Add host_cluster_index() helper to deduplicate the code.
>
> Move parallels_fix_leak() call to parallels_co_check() to fix both types
> of leak: real corruption and a leak produced by allocate_clusters()
> during deduplication.
>
> Signed-off-by: Alexander Ivanov 
> ---
>  block/parallels.c | 134 --
>  1 file changed, 129 insertions(+), 5 deletions(-)
>
> diff --git a/block/parallels.c b/block/parallels.c
> index ec89ed894b..3b992e8173 100644
> --- a/block/parallels.c
> +++ b/block/parallels.c
> @@ -136,6 +136,12 @@ static int cluster_remainder(BDRVParallelsState *s, 
> int64_t sector_num,
>  return MIN(nb_sectors, ret);
>  }
>
> +static uint32_t host_cluster_index(BDRVParallelsState *s, int64_t off)
> +{
> +off -= s->header->data_off << BDRV_SECTOR_BITS;
> +return off / s->cluster_size;
> +}
> +
>  static int64_t block_status(BDRVParallelsState *s, int64_t sector_num,
>  int nb_sectors, int *pnum)
>  {
> @@ -533,7 +539,6 @@ parallels_check_leak(BlockDriverState *bs, 
> BdrvCheckResult *res,
>  {
>  BDRVParallelsState *s = bs->opaque;
>  int64_t count, leak_size;
> -int ret;
>
>  leak_size = parallels_get_leak_size(bs, res);
>  if (leak_size < 0) {
> @@ -550,16 +555,123 @@ parallels_check_leak(BlockDriverState *bs, 
> BdrvCheckResult *res,
>  fix & BDRV_FIX_LEAKS ? "Repairing" : "ERROR", leak_size);
>
>  if (fix & BDRV_FIX_LEAKS) {
> -ret = parallels_fix_leak(bs, res);
> -if (ret < 0) {
> -return ret;
> -}
>  res->leaks_fixed += count;
>  }
>
>  return 0;
>  }
>
> +static int parallels_check_duplicate(BlockDriverState *bs,
> + BdrvCheckResult *res,
> + BdrvCheckMode *fix)
> +{
> +BDRVParallelsState *s = bs->opaque;
> +QEMUIOVector qiov;
> +int64_t off, sector;
> +unsigned long *bitmap;
> +uint32_t i, bitmap_size, cluster_index;
> +int n, ret = 0;
> +uint64_t *buf = NULL;
> +
> +/*
> + * Create a bitmap of used clusters.
> + * If a bit is set, there is a BAT entry pointing to this cluster.
> + * Loop through the BAT entries, check bits relevant to an entry offset.
> + * If bit is set, this entry is duplicated. Otherwise set the bit.
> + *
> + * We shouldn't worry about newly allocated clusters outside the image
> + * because they are created higher then any existing cluster pointed by
> + * a BAT entry.
> + */
> +bitmap_size = host_cluster_index(s, res->image_end_offset);
> +bitmap = bitmap_new(bitmap_size);
> +
> +buf = qemu_memalign(4096, s->cluster_size);
> +qemu_iovec_init(&qiov, 0);
> +qemu_iovec_add(&qiov, buf, s->cluster_size);
> +
> +for (i = 0; i < s->bat_size; i++) {
> +off = bat2sect(s, i) << BDRV_SECTOR_BITS;
> +if (off == 0) {
> +continue;
> +}
> +
> +cluster_index = host_cluster_index(s, off);
> +if (test_bit(cluster_index, bitmap)) {
> +/* this cluster duplicates another one */
> +fprintf(stderr,
> +"%s duplicate offset in BAT entry %u\n",
> +*fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
> +
> +res->corruptions++;
> +
> +if (*fix & BDRV_FIX_ERRORS) {
> +/*
> + * Reset the entry and allocate a new cluster
> + * for the relevant guest offset. In this way we let
> + * the lower layer to place the new cluster properly.
> + * Copy the original cluster to the allocated one.
> + */
> +parallels_set_bat_entry(s, i, 0);
> +
> +ret = bdrv_co_pread(bs->file, off, s->cluster_size, buf, 0);
> +if (ret < 0) {
> +res->check_errors++;
> +goto out;
> +}
> +
> +sector = (i * s->cluster_size) >> BDRV_SECTOR_BITS;
> +sector = allocate_clusters(bs, sector, s->tracks, &n);
> +if (sector < 0) {
> +res->check_errors++;
> +ret = sector;
> +goto out;
> +}

I cannot understand how an index into the BAT table is related to s->cluster_size.
Probably "cluster_index" should be used there instead?
Anyway, it looks like both variants cause a uint32 truncation as a result of
(i * s->cluster_size) or (cluster_index * s->cluster_size); see the illustration below.
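
I.e. something like this, where the 32-bit multiplication wraps before the
result is widened (illustrative numbers only, assuming s->cluster_size is a
32-bit type here):

    uint32_t i = 0x10000;              /* BAT entry index */
    uint32_t cluster_size = 0x100000;  /* 1 MiB cluster   */
    int64_t sector;

    sector = (i * cluster_size) >> BDRV_SECTOR_BITS;           /* wraps to 0 */
    sector = ((int64_t)i * cluster_size) >> BDRV_SECTOR_BITS;  /* 0x8000000  */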

Regards,
Mike.



Re: [PATCH v4 3/5] parallels: Add checking and repairing duplicate offsets in BAT

2023-04-26 Thread Mike Maslenkin
On Mon, Apr 24, 2023 at 12:44 PM Alexander Ivanov
 wrote:
>
> Cluster offsets must be unique among all the BAT entries. Find duplicate
> offsets in the BAT and fix it by copying the content of the relevant
> cluster to a newly allocated cluster and set the new cluster offset to the
> duplicated entry.
>
> Add host_cluster_index() helper to deduplicate the code.
>
> Move parallels_fix_leak() call to parallels_co_check() to fix both types
> of leak: real corruption and a leak produced by allocate_clusters()
> during deduplication.
>
> Signed-off-by: Alexander Ivanov 
> ---
>  block/parallels.c | 134 --
>  1 file changed, 129 insertions(+), 5 deletions(-)
>
> diff --git a/block/parallels.c b/block/parallels.c
> index ec89ed894b..3b992e8173 100644
> --- a/block/parallels.c
> +++ b/block/parallels.c
> @@ -136,6 +136,12 @@ static int cluster_remainder(BDRVParallelsState *s, 
> int64_t sector_num,
>  return MIN(nb_sectors, ret);
>  }
>
> +static uint32_t host_cluster_index(BDRVParallelsState *s, int64_t off)
> +{
> +off -= s->header->data_off << BDRV_SECTOR_BITS;
> +return off / s->cluster_size;
> +}
> +

I guess there should be:
off -= le32_to_cpu(s->header->data_off) << BDRV_SECTOR_BITS

Regards,
Mike.



Re: [PULL 19/54] acpi: pc: isa bridge: use AcpiDevAmlIf interface to build ISA device descriptors

2023-04-13 Thread Mike Maslenkin
Hibernate is disabled by default, that is why "shutdown /h" fails.
It can be enabled with 'powercfg.exe /h /size 100; powercfg.exe /h on'.

Anyway, my hypothesis can be checked easily: just run the VM with a
changed RAM size (±1). This should drop the hibernation state.

BTW, I couldn't reproduce the problem either.

On Thu, Apr 13, 2023 at 3:10 PM Fiona Ebner  wrote:
>
> Am 13.04.23 um 13:46 schrieb Mike Maslenkin:
> > Sorry for the noise, but just curious, how did you shutdown Windows?
> > Did you use 'shutdown /s' or just press power button?
> > Could it be that Windows was actually hibernated.
> > So, when you try to boot it on the new (old) QEMU version with changed
> > PCI topology, this could make it upset.
> > I observed similar behaviour in case of Windows for ARM, but there was
> > true GSOD afterwards.
> > When windows is starting again its hibernated state dropped and all goes 
> > fine.
> >
> > Best Regards,
> > Mike
>
> I think I either pressed the shutdown button in our UI, which sends
> system_powerdown via QMP or via "Shut down" in the Windows start menu.
> Hibernation is surely something I need to consider (next time), so thank
> you for the hint, but if it were that, I'd be surprised at why it got
> stuck even with QEMU 6.2.0 today.
>
> If I try "shutdown /h" explicitly, I get "The request is not
> supported.(50)".
>
> Best Regards,
> Fiona
>



Re: [PULL 19/54] acpi: pc: isa bridge: use AcpiDevAmlIf interface to build ISA device descriptors

2023-04-13 Thread Mike Maslenkin
Sorry for the noise, but just curious: how did you shut down Windows?
Did you use 'shutdown /s' or just press the power button?
Could it be that Windows was actually hibernated?
So, when you try to boot it on the new (old) QEMU version with a changed
PCI topology, this could make it upset.
I observed similar behaviour with Windows for ARM, but there was a
true GSOD afterwards.
When Windows starts again, its hibernated state is dropped and everything
goes fine.

Best Regards,
Mike


On Thu, Apr 13, 2023 at 1:34 PM Fiona Ebner  wrote:
>
> Am 12.04.23 um 14:18 schrieb Igor Mammedov:
> > On Thu, 30 Mar 2023 13:58:22 +0200
> > Fiona Ebner  wrote:
> >
> >> Am 30.03.23 um 10:22 schrieb Igor Mammedov:
> >>> On Tue, 28 Mar 2023 14:58:21 +0200
> >>> Fiona Ebner  wrote:
> >>>
> >>>>
> >>>> Hi,
> >>>> while trying to reproduce another issue, I ended up with a Windows 10
> >>>> guest that would boot with QEMU 7.0, but get stuck after the Windows
> >>>> logo/spinning circles with QEMU 7.1 (also with 8.0.0-rc1). Machine type
> >>>> is pc-i440fx-6.2[0]. Bisecting led to this commit.
> >>>>
> >>>> It only happens the first time the VM is booted, killing the process and
> >>>> re-trying always worked afterwards. So it's not a big deal and might
> >>>> just be some ACPI-related Windows quirk. But I thought I should ask here
> >>>> to be sure.
> >>>>
> >>>> For bisecting, I restored the disk state after each attempt. While
> >>>> getting stuck sometimes took 3-4 attempts, I tested about 10 times until
> >>>> I declared a commit good, and re-tested the commit before this one 15
> >>>> times, so I'm pretty sure this is the one where the issue started 
> >>>> appearing.
> >>>>
> >>>> So, anything that could potentially be wrong with the commit or is this
> >>>> most likely just some Windows quirk/bug we can't do much about?
> >>>>
> >>>> If you need more information, please let me know!
> >>>
> >>> Please describe in more detail your setup/steps where it reproduces
> >>> (incl. Windows version/build, used QEMU CLI) so I could try to reproduce 
> >>> it locally.
> >>>
> >>> (in past there were issues with German version that some where
> >>> experience but not reproducible on my side, that resolved with
> >>> upgrading to newer QEMU (if I recall correctly issue was opened
> >>> on QEMU's gitlab tracker))
> >>>
> >>
> >> Windows 10 Education
> >> Version 1809
> >> Build 17763.1
> >>
> >> It's not the German ISO, I used default settings (except location
> >> Austria and German keymap) and I don't think I did anything other than
> >> shutdown after the install was over.
> >>
> >> The command line is below. I did use our patched QEMU builds when I got
> >> into the situation, but I don't think they touch anything ACPI-related
> >> and bisecting was done without our patches on top.
> >>
> >> I tried to reproduce the situation again from scratch today, but wasn't
> >> able to. I do still have the problematic disk (snapshot) where the issue
> >> occurs as an LVM-Thin volume. If you'd like to have access to that,
> >> please send me a direct mail and we can discuss the details there.
> >
> > I couldn't reproduce the issue on my host either.
> > If you still have access to 'broken' disk image, you can try to enable
> > kernel debug mode in guest and try to attach with debugger to it to see
> > where it is stuck.
> >
> > quick instructions how to do it:
> >  https://gitlab.com/qemu-project/qemu/-/issues/774#note_1270248862
> > or read more extensive MS docs on topic.
> >
>
> Hmm, I guess I won't be able to enable kernel debug mode without losing
> the problematic state of the image. The VM only gets stuck during the
> first boot attempt.
>
> Still, I wanted to give it a shot in the hope I can trigger it again
> when shutting down with QEMU 6.2.0 and booting with QEMU 7.1.0. I made a
> copy of the VM intending to use it as the debug host, but didn't get the
> COM port to show up in the guest with
> -serial unix:/tmp/com1,server,nowait
> I checked in the Device Manager with "Show hidden devices" enabled.
>
> Anyway, when starting the original problematic VM again, it now also got
> stuck (visually, in the same place) with QEMU 6.2.0! But only until I
> rebooted my host, which made it working with QEMU 6.2.0 again. So I'd
> say this commit has nothing to do with the issue after all, just made it
> more likely to trigger for me. And also seems less likely to be a QEMU
> issue now :)
>
> Best Regards,
> Fiona
>
>



How to write a zIPL section in IPL2 record on a raw disk

2023-04-11 Thread Mike Stramba
I have a CKD file, created with the Hercules dasdinit program.
It's a (Hercules) 3350 uncompressed file.
I've put a small "hello world" IPL program on it, and it works fine with
Hercules

When I try to run it with qemu-system-s390x I get :

LOADPARM=[]
Using virtio-blk.
Using guessed DASD geometry.
Using ECKD scheme (block size  4096), CDL
No zIPL section in IPL2 record.   <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
zIPL load failed.
Could not find a suitable boot device (none specified)
Failed to load OS from hard disk

The qemu script is :
qemu-system-s390x -m 16 -drive format=raw,file=test-ipl.3350.un,if=virtio
-nographic

I found this link re:  the zipl command.
https://www.ibm.com/docs/en/linux-on-systems?topic=u-modes-syntax-overview

Where is that command and/or its source?

Mike


[PATCH] memory: avoid unnecessary iteration when updating ioeventfds

2023-02-28 Thread Longpeng(Mike)
From: Longpeng 

When updating ioeventfds, we need to iterate over all address spaces and
over all flat ranges of each address space. There is so much redundant
processing that a FlatView may be iterated many times during one commit
(memory_region_transaction_commit).

We can mark a FlatView as UPDATED, skip it in the next iteration, and clear
the UPDATED flag at the end of the commit. The overhead can be significantly
reduced.

For example, for a VM with 16 vdpa net devices, each with 65 vectors, this
reduces the time spent in memory_region_transaction_commit by 95%.

Signed-off-by: Longpeng 
---
 include/exec/memory.h |  2 ++
 softmmu/memory.c  | 28 +++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 2e602a2fad..974eabf765 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -1093,6 +1093,8 @@ struct FlatView {
 unsigned nr_allocated;
 struct AddressSpaceDispatch *dispatch;
 MemoryRegion *root;
+#define FLATVIEW_FLAG_IOEVENTFD_UPDATED (1 << 0)
+unsigned flags;
 };
 
 static inline FlatView *address_space_to_flatview(AddressSpace *as)
diff --git a/softmmu/memory.c b/softmmu/memory.c
index 9d64efca26..71ff996712 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -815,6 +815,15 @@ FlatView *address_space_get_flatview(AddressSpace *as)
 return view;
 }
 
+static void address_space_reset_view_flags(AddressSpace *as, unsigned mask)
+{
+FlatView *view = address_space_get_flatview(as);
+
+if (view->flags & mask) {
+view->flags &= ~mask;
+}
+}
+
 static void address_space_update_ioeventfds(AddressSpace *as)
 {
 FlatView *view;
@@ -825,6 +834,12 @@ static void address_space_update_ioeventfds(AddressSpace 
*as)
 AddrRange tmp;
 unsigned i;
 
+view = address_space_get_flatview(as);
+if (view->flags & FLATVIEW_FLAG_IOEVENTFD_UPDATED) {
+return;
+}
+view->flags |= FLATVIEW_FLAG_IOEVENTFD_UPDATED;
+
 /*
  * It is likely that the number of ioeventfds hasn't changed much, so use
  * the previous size as the starting value, with some headroom to avoid
@@ -833,7 +848,6 @@ static void address_space_update_ioeventfds(AddressSpace 
*as)
 ioeventfd_max = QEMU_ALIGN_UP(as->ioeventfd_nb, 4);
 ioeventfds = g_new(MemoryRegionIoeventfd, ioeventfd_max);
 
-view = address_space_get_flatview(as);
 FOR_EACH_FLAT_RANGE(fr, view) {
 for (i = 0; i < fr->mr->ioeventfd_nb; ++i) {
 tmp = addrrange_shift(fr->mr->ioeventfds[i].addr,
@@ -1086,6 +1100,15 @@ void memory_region_transaction_begin(void)
 ++memory_region_transaction_depth;
 }
 
+static inline void address_space_update_ioeventfds_finish(void)
+{
+AddressSpace *as;
+
+QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
+address_space_reset_view_flags(as, FLATVIEW_FLAG_IOEVENTFD_UPDATED);
+}
+}
+
 void memory_region_transaction_commit(void)
 {
 AddressSpace *as;
@@ -1106,12 +1129,14 @@ void memory_region_transaction_commit(void)
 }
 memory_region_update_pending = false;
 ioeventfd_update_pending = false;
+address_space_update_ioeventfds_finish();
 MEMORY_LISTENER_CALL_GLOBAL(commit, Forward);
 } else if (ioeventfd_update_pending) {
 QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
 address_space_update_ioeventfds(as);
 }
 ioeventfd_update_pending = false;
+address_space_update_ioeventfds_finish();
 }
}
 }
@@ -3076,6 +3101,7 @@ void address_space_init(AddressSpace *as, MemoryRegion 
*root, const char *name)
 as->name = g_strdup(name ? name : "anonymous");
 address_space_update_topology(as);
 address_space_update_ioeventfds(as);
+address_space_reset_view_flags(as, FLATVIEW_FLAG_IOEVENTFD_UPDATED);
 }
 
 static void do_address_space_destroy(AddressSpace *as)
-- 
2.23.0




[PATCH v1 3/3] virtio-pci: defer to commit kvm irq routing when enable msi/msix

2023-02-28 Thread Longpeng(Mike)
From: Longpeng 

All unmasked vectors will be set up in msix_set_vector_notifiers(), which
is a time-consuming operation because each vector needs to be submitted to
KVM once. It's even worse if the VM has several devices and each device
has dozens of vectors.

We can defer and commit the vectors in a batch, just like commit dc580d51f7
("vfio: defer to commit kvm irq routing when enable msi/msix").

This can reduce the time spent in virtio_pci_set_guest_notifiers() by 80%.

Signed-off-by: Longpeng 
---
 hw/virtio/virtio-pci.c | 113 -
 include/hw/virtio/virtio.h |   1 +
 2 files changed, 99 insertions(+), 15 deletions(-)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 5fd02b7cb8..13f9c31009 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -51,15 +51,22 @@
 
 /* Protected by the BQL */
 static KVMRouteChange virtio_pci_route_change;
+static unsigned virtio_pci_route_change_depth;
 
 static inline void virtio_pci_begin_route_changes(void)
 {
-virtio_pci_route_change = kvm_irqchip_begin_route_changes(kvm_state);
+if (!virtio_pci_route_change_depth) {
+virtio_pci_route_change = kvm_irqchip_begin_route_changes(kvm_state);
+}
+virtio_pci_route_change_depth++;
 }
 
 static inline void virtio_pci_commit_route_changes(void)
 {
-kvm_irqchip_commit_route_changes(&virtio_pci_route_change);
+virtio_pci_route_change_depth--;
+if (!virtio_pci_route_change_depth) {
+kvm_irqchip_commit_route_changes(&virtio_pci_route_change);
+}
 }
 
 static void virtio_pci_bus_new(VirtioBusState *bus, size_t bus_size,
@@ -976,6 +983,88 @@ static void 
kvm_virtio_pci_vector_config_release(VirtIOPCIProxy *proxy)
 kvm_virtio_pci_vector_release_one(proxy, VIRTIO_CONFIG_IRQ_IDX);
 }
 
+static int virtio_pci_vector_do_unmask(VirtIOPCIProxy *proxy,
+   unsigned int queue_no,
+   unsigned int vector,
+   EventNotifier *n)
+{
+VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+int ret = 0;
+
+/*
+ * If guest supports masking, irqfd is already setup, unmask it.
+ * Otherwise, set it up now.
+ */
+if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) {
+k->guest_notifier_mask(vdev, queue_no, false);
+/* Test after unmasking to avoid losing events. */
+if (k->guest_notifier_pending &&
+k->guest_notifier_pending(vdev, queue_no)) {
+event_notifier_set(n);
+}
+} else {
+ret = kvm_virtio_pci_irqfd_use(proxy, n, vector);
+}
+
+return ret;
+}
+
+static void virtio_pci_prepare_kvm_msi_virq_batch(VirtIOPCIProxy *proxy)
+{
+VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+
+assert(!vdev->defer_kvm_irq_routing);
+vdev->defer_kvm_irq_routing = true;
+virtio_pci_begin_route_changes();
+}
+
+static void virtio_pci_commit_kvm_msi_virq_batch(VirtIOPCIProxy *proxy)
+{
+VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+PCIDevice *dev = &proxy->pci_dev;
+VirtQueue *vq;
+EventNotifier *n;
+int vector, index;
+int ret;
+
+assert(vdev->defer_kvm_irq_routing);
+virtio_pci_commit_route_changes();
+vdev->defer_kvm_irq_routing = false;
+
+if (!msix_enabled(dev)) {
+return;
+}
+
+/* Unmask all unmasked vectors */
+for (vector = 0; vector < dev->msix_entries_nr; vector++) {
+if (msix_is_masked(dev, vector)) {
+continue;
+}
+
+vq = virtio_vector_first_queue(vdev, vector);
+while (vq) {
+index = virtio_get_queue_index(vq);
+if (!virtio_queue_get_num(vdev, index)) {
+break;
+}
+if (index < proxy->nvqs_with_notifiers) {
+n = virtio_queue_get_guest_notifier(vq);
+ret = virtio_pci_vector_do_unmask(proxy, index, vector, n);
+assert(ret >= 0);
+}
+vq = virtio_vector_next_queue(vq);
+}
+
+if (vector == vdev->config_vector) {
+n = virtio_config_get_guest_notifier(vdev);
+ret = virtio_pci_vector_do_unmask(proxy, VIRTIO_CONFIG_IRQ_IDX,
+  vector, n);
+assert(ret >= 0);
+}
+}
+}
+
 static int virtio_pci_one_vector_unmask(VirtIOPCIProxy *proxy,
unsigned int queue_no,
unsigned int vector,
@@ -983,7 +1072,6 @@ static int virtio_pci_one_vector_unmask(VirtIOPCIProxy 
*proxy,
EventNotifier *n)
 {
 VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
-VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
 VirtIOIRQFD *irqfd;
 int ret = 0;
 
@@ -1002,19 +1090,10 @@ static int virtio_pci_one_vector_unmask(VirtIOPCIProxy 

[PATCH v1 1/3] virtio-pci: submit msi route changes in batch

2023-02-28 Thread Longpeng(Mike)
From: Longpeng 

kvm_irqchip_commit_routes() is a time-intensive operation: on each invocation
it needs to scan and update all irqfds that are already assigned, so more
vectors means more time to process them. For virtio-pci, we can just submit
once when enabling the vectors of a virtio-pci device.

This can reduce the downtime when migrating a VM with vhost-vdpa devices.

Signed-off-by: Longpeng 
---
 hw/virtio/virtio-pci.c | 24 +---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 247325c193..22e76e3902 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -49,6 +49,19 @@
  * configuration space */
 #define VIRTIO_PCI_CONFIG_SIZE(dev) 
VIRTIO_PCI_CONFIG_OFF(msix_enabled(dev))
 
+/* Protected by the BQL */
+static KVMRouteChange virtio_pci_route_change;
+
+static inline void virtio_pci_begin_route_changes(void)
+{
+virtio_pci_route_change = kvm_irqchip_begin_route_changes(kvm_state);
+}
+
+static inline void virtio_pci_commit_route_changes(void)
+{
+kvm_irqchip_commit_route_changes(&virtio_pci_route_change);
+}
+
 static void virtio_pci_bus_new(VirtioBusState *bus, size_t bus_size,
VirtIOPCIProxy *dev);
 static void virtio_pci_reset(DeviceState *qdev);
@@ -790,12 +803,11 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy 
*proxy,
 int ret;
 
 if (irqfd->users == 0) {
-KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state);
-ret = kvm_irqchip_add_msi_route(&c, vector, &proxy->pci_dev);
+ret = kvm_irqchip_add_msi_route(&virtio_pci_route_change, vector,
+&proxy->pci_dev);
 if (ret < 0) {
 return ret;
 }
-kvm_irqchip_commit_route_changes(&c);
 irqfd->virq = ret;
 }
 irqfd->users++;
@@ -903,12 +915,18 @@ static int kvm_virtio_pci_vector_vq_use(VirtIOPCIProxy 
*proxy, int nvqs)
 int ret = 0;
 VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
 
+virtio_pci_begin_route_changes();
+
 for (queue_no = 0; queue_no < nvqs; queue_no++) {
 if (!virtio_queue_get_num(vdev, queue_no)) {
+virtio_pci_commit_route_changes();
 return -1;
 }
 ret = kvm_virtio_pci_vector_use_one(proxy, queue_no);
 }
+
+virtio_pci_commit_route_changes();
+
 return ret;
 }
 
-- 
2.23.0




[PATCH v1 0/3] virtio-pci: optimize set_guest_notifier

2023-02-28 Thread Longpeng(Mike)
From: Longpeng 

This patchset optimizes the time-consuming operations in
virtio_pci_set_guest_notifiers(); especially for vhost-vdpa migration, the
time spent in set_guest_notifiers can be reduced by 87% in some cases.

Longpeng (Mike) (3):
  virtio-pci: submit msi route changes in batch
  kvm-irqchip: use KVMRouteChange API to update msi route
  virtio-pci: defer to commit kvm irq routing when enable msi/msix

 accel/kvm/kvm-all.c|  10 +--
 accel/stubs/kvm-stub.c |   2 +-
 hw/intc/ioapic.c   |   5 +-
 hw/misc/ivshmem.c  |   6 +-
 hw/vfio/pci.c  |   5 +-
 hw/virtio/virtio-pci.c | 140 -
 include/hw/virtio/virtio.h |   1 +
 include/sysemu/kvm.h   |   2 +-
 target/i386/kvm/kvm.c  |   6 +-
 9 files changed, 145 insertions(+), 32 deletions(-)

-- 
2.23.0




[PATCH v1 2/3] kvm-irqchip: use KVMRouteChange API to update msi route

2023-02-28 Thread Longpeng(Mike)
From: Longpeng 

The KVMRouteChange API was added by commit 9568690868e ("kvm-irqchip:
introduce new API to support route change"). We can also apply it to
kvm_irqchip_update_msi_route(); there are no functional changes, and
we can optimize the virtio-pci core based on this change in the next
patch.

Signed-off-by: Longpeng 
---
 accel/kvm/kvm-all.c| 10 ++
 accel/stubs/kvm-stub.c |  2 +-
 hw/intc/ioapic.c   |  5 +++--
 hw/misc/ivshmem.c  |  6 --
 hw/vfio/pci.c  |  5 +++--
 hw/virtio/virtio-pci.c |  7 +--
 include/sysemu/kvm.h   |  2 +-
 target/i386/kvm/kvm.c  |  6 --
 8 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 9b26582655..1ed0dc4c9d 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1820,10 +1820,11 @@ static void kvm_add_routing_entry(KVMState *s,
 set_gsi(s, entry->gsi);
 }
 
-static int kvm_update_routing_entry(KVMState *s,
+static int kvm_update_routing_entry(KVMRouteChange *c,
 struct kvm_irq_routing_entry *new_entry)
 {
 struct kvm_irq_routing_entry *entry;
+KVMState *s = c->s;
 int n;
 
 for (n = 0; n < s->irq_routes->nr; n++) {
@@ -1837,6 +1838,7 @@ static int kvm_update_routing_entry(KVMState *s,
 }
 
 *entry = *new_entry;
+c->changes++;
 
 return 0;
 }
@@ -2046,7 +2048,7 @@ int kvm_irqchip_add_msi_route(KVMRouteChange *c, int 
vector, PCIDevice *dev)
 return virq;
 }
 
-int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
+int kvm_irqchip_update_msi_route(KVMRouteChange *c, int virq, MSIMessage msg,
  PCIDevice *dev)
 {
 struct kvm_irq_routing_entry kroute = {};
@@ -2075,7 +2077,7 @@ int kvm_irqchip_update_msi_route(KVMState *s, int virq, 
MSIMessage msg,
 
 trace_kvm_irqchip_update_msi_route(virq);
 
-return kvm_update_routing_entry(s, &kroute);
+return kvm_update_routing_entry(c, &kroute);
 }
 
 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
@@ -2221,7 +2223,7 @@ static int kvm_irqchip_assign_irqfd(KVMState *s, 
EventNotifier *event,
 abort();
 }
 
-int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
+int kvm_irqchip_update_msi_route(KVMRouteChange *c, int virq, MSIMessage msg)
 {
 return -ENOSYS;
 }
diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
index 5d2dd8f351..5bcf98b9ab 100644
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -69,7 +69,7 @@ void kvm_irqchip_release_virq(KVMState *s, int virq)
 {
 }
 
-int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
+int kvm_irqchip_update_msi_route(KVMRouteChange *c, int virq, MSIMessage msg,
  PCIDevice *dev)
 {
 return -ENOSYS;
diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
index 264262959d..07b9cf7705 100644
--- a/hw/intc/ioapic.c
+++ b/hw/intc/ioapic.c
@@ -195,6 +195,7 @@ static void ioapic_update_kvm_routes(IOAPICCommonState *s)
 int i;
 
 if (kvm_irqchip_is_split()) {
+KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state);
 for (i = 0; i < IOAPIC_NUM_PINS; i++) {
 MSIMessage msg;
 struct ioapic_entry_info info;
@@ -202,10 +203,10 @@ static void ioapic_update_kvm_routes(IOAPICCommonState *s)
 if (!info.masked) {
 msg.address = info.addr;
 msg.data = info.data;
-kvm_irqchip_update_msi_route(kvm_state, i, msg, NULL);
+kvm_irqchip_update_msi_route(&c, i, msg, NULL);
 }
 }
-kvm_irqchip_commit_routes(kvm_state);
+kvm_irqchip_commit_route_changes(&c);
 }
 #endif
 }
diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c
index d66d912172..0e9427be42 100644
--- a/hw/misc/ivshmem.c
+++ b/hw/misc/ivshmem.c
@@ -278,6 +278,7 @@ static int ivshmem_vector_unmask(PCIDevice *dev, unsigned 
vector,
 IVShmemState *s = IVSHMEM_COMMON(dev);
 EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
 MSIVector *v = &s->msi_vectors[vector];
+KVMRouteChange c;
 int ret;
 
 IVSHMEM_DPRINTF("vector unmask %p %d\n", dev, vector);
@@ -287,11 +288,12 @@ static int ivshmem_vector_unmask(PCIDevice *dev, unsigned 
vector,
 }
 assert(!v->unmasked);
 
-ret = kvm_irqchip_update_msi_route(kvm_state, v->virq, msg, dev);
+c = kvm_irqchip_begin_route_changes(kvm_state);
+ret = kvm_irqchip_update_msi_route(&c, v->virq, msg, dev);
 if (ret < 0) {
 return ret;
 }
-kvm_irqchip_commit_routes(kvm_state);
+kvm_irqchip_commit_route_changes(&c);
 
 ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, v->virq);
 if (ret < 0) {
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 939dcc3d4a..fb69cc9965 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -460,8 +460,9 @@ static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
 static void 

Re: Lost partition tables on ide-hd + ahci drive

2023-02-17 Thread Mike Maslenkin
I think it's guest memory again. IMHO it's part of a memory pool and
not real IO data (unless this was pagefile data).
The first 16 bytes look like a POOL_HEADER structure.
The first dump contained a signature from FilterManager and the latest
contains two structures from Ntfs.
It's not clear to me what exact data follows the header structure, but in
the case of Ntfs it looks like a doubly linked list element
with Flink/Blink pointers: 60 a5 a6 d4 0c a8 ff ff, which is
0xffffa80cd4a6a560, and 30 15 d9 e6 0c a8 ff ff = 0xffffa80ce6d91530.
The first Ntfs entry looks like a final element of something, while the
second is a middle part of something else.
That is why I think it is not real IO (i.e. disk data sent by the guest
NTFS driver), IMHO.
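
As a quick illustration (a standalone sketch, not QEMU or Windows code), the
dumped bytes decode as little-endian 64-bit pointers like this:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    /* the first eight bytes after the second POOL_HEADER in the dump */
    const uint8_t flink[8] = { 0x60, 0xa5, 0xa6, 0xd4, 0x0c, 0xa8, 0xff, 0xff };
    uint64_t ptr = 0;

    for (int i = 7; i >= 0; i--) {
        ptr = (ptr << 8) | flink[i];
    }
    printf("0x%016" PRIx64 "\n", ptr);   /* prints 0xffffa80cd4a6a560 */
    return 0;
}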

I can not tell anything about dma-reentrancy issues, but yes, I would
start by looking at the check_cmd() function call sequence.
The most interesting thing is why Sector Count = 1. I thought about a race
with IDE reset where the registers are initialized with the
value SATA_SIGNATURE_DISK = 0x0101, but this means LBA=1 as well...
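
Roughly (a sketch only; the struct and function here are illustrative, not
the actual QEMU IDE/AHCI structures), the disk signature maps onto the
task-file registers like this, which is why Sector Count and LBA low both
end up as 1:

#include <stdint.h>

struct taskfile {
    uint8_t nsector;   /* sector count */
    uint8_t lbal;      /* LBA low */
    uint8_t lbam;      /* LBA mid */
    uint8_t lbah;      /* LBA high */
};

static void apply_disk_signature(struct taskfile *tf)
{
    /* SATA_SIGNATURE_DISK = 0x00000101 */
    tf->nsector = 0x01;   /* Sector Count = 1 */
    tf->lbal    = 0x01;   /* ...and LBA low = 1, i.e. LBA = 1, not 0 */
    tf->lbam    = 0x00;
    tf->lbah    = 0x00;
}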

Regards,
Mike

On Fri, Feb 17, 2023 at 4:40 PM Fiona Ebner  wrote:
>
> Am 16.02.23 um 15:17 schrieb Mike Maslenkin:
> > Does additional comparison make a sense here: check for LBA == 0 and
> > then check MBR signature bytes.
> > Additionally it’s easy to check buffer_is_zero() result or even print
> > FIS contents under these conditions.
> > Data looks like a part of guest memory of 64bit Windows.
>
> Just today we got a new dump [0], and it's very similar. Again only 512
> bytes and again guest memory?
>
> > febner@enia ~/Downloads % hexdump -C dump.raw
> >   00 03 22 00 4e 74 46 73  da 4c a3 1c 3b f5 7d 19  
> > |..".NtFs.L..;.}.|
> > 0010  60 a5 a6 d4 0c a8 ff ff  30 15 d9 e6 0c a8 ff ff  
> > |`...0...|
> > 0020  5c 00 53 00 6f 00 66 00  74 00 77 00 61 00 72 00  
> > |\.S.o.f.t.w.a.r.|
> > 0030  65 00 44 00 69 00 73 00  74 00 72 00 69 00 62 00  
> > |e.D.i.s.t.r.i.b.|
> > 0040  75 00 74 00 69 00 6f 00  6e 00 5c 00 44 00 6f 00  
> > |u.t.i.o.n.\.D.o.|
> > 0050  77 00 6e 00 6c 00 6f 00  61 00 64 00 5c 00 37 00  
> > |w.n.l.o.a.d.\.7.|
> > 0060  33 00 63 00 36 00 33 00  65 00 32 00 64 00 37 00  
> > |3.c.6.3.e.2.d.7.|
> > 0070  66 00 66 00 38 00 66 00  36 00 35 00 31 00 31 00  
> > |f.f.8.f.6.5.1.1.|
> > 0080  39 00 36 00 63 00 65 00  61 00 31 00 65 00 30 00  
> > |9.6.c.e.a.1.e.0.|
> > 0090  39 00 66 00 66 00 36 00  32 00 30 00 65 00 5c 00  
> > |9.f.f.6.2.0.e.\.|
> > 00a0  69 00 6e 00 73 00 74 00  5c 00 70 00 61 00 63 00  
> > |i.n.s.t.\.p.a.c.|
> > 00b0  6b 00 61 00 67 00 65 00  5f 00 39 00 31 00 37 00  
> > |k.a.g.e._.9.1.7.|
> > 00c0  31 00 5f 00 66 00 6f 00  72 00 5f 00 6b 00 62 00  
> > |1._.f.o.r._.k.b.|
> > 00d0  35 00 30 00 32 00 32 00  38 00 33 00 38 00 7e 00  
> > |5.0.2.2.8.3.8.~.|
> > 00e0  33 00 31 00 62 00 66 00  33 00 38 00 35 00 36 00  
> > |3.1.b.f.3.8.5.6.|
> > 00f0  61 00 64 00 33 00 36 00  34 00 65 00 33 00 35 00  
> > |a.d.3.6.4.e.3.5.|
> > 0100  7e 00 61 00 6d 00 64 00  36 00 34 00 7e 00 7e 00  
> > |~.a.m.d.6.4.~.~.|
> > 0110  31 00 30 00 2e 00 30 00  2e 00 31 00 2e 00 31 00  
> > |1.0...0...1...1.|
> > 0120  33 00 2e 00 63 00 61 00  74 00 1d 08 0d a8 ff ff  
> > |3...c.a.t...|
> > 0130  13 03 0f 00 4e 74 46 73  ea 4d a3 1c 3b f5 7d 19  
> > |NtFs.M..;.}.|
> > 0140  90 05 4d 0f 0d a8 ff ff  a0 0c 55 0d 0d a8 ff ff  
> > |..M...U.|
> > 0150  43 52 4f 53 4f 46 54 2d  57 49 4e 44 4f 57 53 2d  
> > |CROSOFT-WINDOWS-|
> > 0160  44 2e 2e 2d 57 49 4e 50  52 4f 56 49 44 45 52 53  
> > |D..-WINPROVIDERS|
> > 0170  2d 41 53 53 4f 43 5f 33  31 42 46 33 38 35 36 41  
> > |-ASSOC_31BF3856A|
> > 0180  0c 03 67 00 70 00 73 00  63 00 72 00 69 00 70 00  
> > |..g.p.s.c.r.i.p.|
> > 0190  74 00 2e 00 65 00 78 00  65 00 37 00 36 00 34 00  
> > |t...e.x.e.7.6.4.|
> > 01a0  37 00 62 00 33 00 36 00  30 00 30 00 63 00 64 00  
> > |7.b.3.6.0.0.c.d.|
> > 01b0  65 00 30 00 34 00 31 00  35 00 39 00 35 00 32 00  
> > |e.0.4.1.5.9.5.2.|
> > 01c0  31 00 2e 00 74 00 6d 00  70 00 47 00 50 00 53 00  
> > |1...t.m.p.G.P.S.|
> > 01d0  43 00 52 00 49 00 50 00  54 00 2e 00 45 00 58 00  
> > |C.R.I.P.T...E.X.|
> > 01e0  45 00 37 00 36 00 34 00  37 00 42 00 33 00 36 00  
> > |E.7.6.4.7.B.3.6.|
> > 01f0  30 00 30 00 43 00 44 00  45 00 30 00 34 00 31 00  
> > |0.0.C.D.E.0.4.1.|
> > 0200  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  
> > ||
> > *
> > 0010
>
> [0]:
> https://forum.proxmox.com/threads/not-a-bootable-disk-vm-ms-server-2016.122849/post-534473
>



Re: Lost partition tables on ide-hd + ahci drive

2023-02-16 Thread Mike Maslenkin
That makes sense for disks without a partition table.
But wouldn't Linux or any other OS write at least 4K bytes in that case?
Who would want to write 512 bytes for any purpose except a boot
sector nowadays...
In the dump mentioned before only 512 bytes were not zeroed, so I guess it
was caused by IO from the guest OS.
In other cases it can be caused by a misconfigured IDE register state
or a broken FIS memory area.


On Thu, Feb 16, 2023 at 6:25 PM Fiona Ebner  wrote:
>
> Am 16.02.23 um 15:17 schrieb Mike Maslenkin:
> > Does additional comparison make a sense here: check for LBA == 0 and
> > then check MBR signature bytes.
> > Additionally it’s easy to check buffer_is_zero() result or even print
> > FIS contents under these conditions.
> > Data looks like a part of guest memory of 64bit Windows.
>
> Thank you for the suggestion! I'll think about adding such a check and
> dumping of FIS contents in a custom build for affected users. But in
> general it would be too much noise for non-MBR cases: e.g. on a disk
> formatted with ext4 (without any partitions), Linux will write to sector
> 0 on every startup and shutdown.
>
> Best Regards,
> Fiona
>



Re: Lost partition tables on ide-hd + ahci drive

2023-02-16 Thread Mike Maslenkin
Does an additional comparison make sense here: check for LBA == 0 and
then check the MBR signature bytes.
Additionally it's easy to check the buffer_is_zero() result or even print
the FIS contents under these conditions.
The data looks like a part of guest memory of 64-bit Windows.
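
Something along these lines (a rough sketch only; the helper names and the
hook point are illustrative, not the actual AHCI/IDE code):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Warn when a write to LBA 0 carries data without the 0x55AA MBR
 * signature; a real version could also dump the FIS contents here. */
static bool looks_like_mbr(const uint8_t *buf, size_t len)
{
    return len >= 512 && buf[510] == 0x55 && buf[511] == 0xaa;
}

static void check_sector0_write(uint64_t lba, const uint8_t *buf, size_t len)
{
    if (lba == 0 && !looks_like_mbr(buf, len)) {
        fprintf(stderr, "debug: LBA 0 written without an MBR signature\n");
    }
}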

On Wed, Feb 15, 2023 at 1:53 PM Fiona Ebner  wrote:
>
> Am 14.02.23 um 19:21 schrieb John Snow:
> > On Thu, Feb 2, 2023 at 7:08 AM Fiona Ebner  wrote:
> >>
> >> Hi,
> >> over the years we've got 1-2 dozen reports[0] about suddenly
> >> missing/corrupted MBR/partition tables. The issue seems to be very rare
> >> and there was no success in trying to reproduce it yet. I'm asking here
> >> in the hope that somebody has seen something similar.
> >>
> >> The only commonality seems to be the use of an ide-hd drive with ahci bus.
> >>
> >> It does seem to happen with both Linux and Windows guests (one of the
> >> reports even mentions FreeBSD) and backing storages for the VMs include
> >> ZFS, RBD, LVM-Thin as well as file-based storages.
> >>
> >> Relevant part of an example configuration:
> >>
> >>>   -device 'ahci,id=ahci0,multifunction=on,bus=pci.0,addr=0x7' \
> >>>   -drive 
> >>> 'file=/dev/zvol/myzpool/vm-168-disk-0,if=none,id=drive-sata0,format=raw,cache=none,aio=io_uring,detect-zeroes=on'
> >>>  \
> >>>   -device 'ide-hd,bus=ahci0.0,drive=drive-sata0,id=sata0' \
> >>
> >> The first reports are from before io_uring was used and there are also
> >> reports with writeback cache mode and discard=on,detect-zeroes=unmap.
> >>
> >> Some reports say that the issue occurred under high IO load.
> >>
> >> Many reports suspect backups causing the issue. Our backup mechanism
> >> uses backup_job_create() for each drive and runs the jobs sequentially.
> >> It uses a custom block driver as the backup target which just forwards
> >> the writes to the actual target which can be a file or our backup server.
> >> (If you really want to see the details, apply the patches in [1] and see
> >> pve-backup.c and block/backup-dump.c).
> >>
> >> Of course, the backup job will read sector 0 of the source disk, but I
> >> really can't see where a stray write would happen, why the issue would
> >> trigger so rarely or why seemingly only ide-hd+ahci would be affected.
> >>
> >> So again, just asking if somebody has seen something similar or has a
> >> hunch of what the cause might be.
> >>
> >
> > Hi Fiona;
> >
> > I'm sorry to say that I haven't worked on the block devices (or
> > backup) for a little while now, so I am not immediately sure what
> > might be causing this problem. In general, I advise against using AHCI
> > in production as better performance (and dev support) can be achieved
> > through virtio.
>
> Yes, we also recommend using virtio-{scsi,blk}-pci to our users and most
> do. Still, some use AHCI, I'd guess mostly for Windows, but not only.
>
> > Still, I am not sure why the combination of AHCI with
> > backup_job_create() would be corrupting the early sectors of the disk.
>
> It's not clear that backup itself is causing the issue. Some of the
> reports do correlate it with backup, but there are no precise timestamps
> when the corruption happened. It might be that the additional IO during
> backup is somehow triggering the issue.
>
> > Do you have any analysis on how much data gets corrupted? Is it the
> > first sector only, the first few? Has anyone taken a peek at the
> > backing storage to see if there are any interesting patterns that can
> > be observed? (Zeroes, garbage, old data?)
>
> It does seem to be the first sector only, but it's not entirely clear.
> Many of the affected users said that after fixing the partition table
> with TestDisk, the VMs booted/worked normally again. We only have dumps
> for the first MiB of three images. In this case, all Windows with Ceph
> RBD images.
>
> See below[0] for the dumps. One was a valid MBR and matched the latest
> good backup, so that VM didn't boot for some other reason, not sure if
> even related to this bug. I did not include this one. One was completely
> empty and one contained other data in the first 512 Bytes, then again
> zeroes, but those zeroes are nothing special AFAIK.
>
> > Have any errors or warnings been observed in either the guest or the
> > host that might offer some clues?
>
> There is a single user who seemed to have hardware issues, and I'd be
> inclined to blame those in that case. But none of the other users
> reported any errors or warnings, though I can't say if any checked
> inside the guests.
>
> > Is there any commonality in the storage format being used? Is it
> > qcow2? Is it network-backed?
>
> There are reports with local ZFS volumes, local LVM-Thin volumes, RBD
> images, qcow2 on NFS. So no pattern to be seen.
>
> > Apologies for the "tier 1" questions.
>
> Thank you for your time!
>
> Best Regards,
> Fiona
>
> @Aaron (had access to the broken images): please correct me/add anything
> relevant I missed. Are the broken VMs/backups still present? If yes, can
> we ask the 

Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM

2023-02-15 Thread Mike Rapoport
Hi,

On Fri, Dec 02, 2022 at 02:13:38PM +0800, Chao Peng wrote:
> This patch series implements KVM guest private memory for confidential
> computing scenarios like Intel TDX[1]. If a TDX host accesses
> TDX-protected guest memory, machine check can happen which can further
> crash the running host system, this is terrible for multi-tenant
> configurations. The host accesses include those from KVM userspace like
> QEMU. This series addresses KVM userspace induced crash by introducing
> new mm and KVM interfaces so KVM userspace can still manage guest memory
> via a fd-based approach, but it can never access the guest memory
> content.

Sorry for jumping late.

Unless I'm missing something, hibernation will also cause a machine check
when there is TDX-protected memory in the system. When hibernation
creates a memory snapshot it essentially walks all physical pages and saves
their contents, so for TDX memory this will trigger a machine check, right?
 
>  Documentation/virt/kvm/api.rst | 125 ++-
>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>  arch/x86/include/asm/kvm_host.h|   9 +
>  arch/x86/kvm/Kconfig   |   3 +
>  arch/x86/kvm/mmu/mmu.c | 205 ++-
>  arch/x86/kvm/mmu/mmu_internal.h|  14 +-
>  arch/x86/kvm/mmu/mmutrace.h|   1 +
>  arch/x86/kvm/mmu/tdp_mmu.c |   2 +-
>  arch/x86/kvm/x86.c |  17 +-
>  include/linux/kvm_host.h   | 103 +-
>  include/linux/restrictedmem.h  |  71 
>  include/linux/syscalls.h   |   1 +
>  include/uapi/asm-generic/unistd.h  |   5 +-
>  include/uapi/linux/kvm.h   |  53 +++
>  include/uapi/linux/magic.h |   1 +
>  kernel/sys_ni.c|   3 +
>  mm/Kconfig |   4 +
>  mm/Makefile|   1 +
>  mm/memory-failure.c|   3 +
>  mm/restrictedmem.c | 318 +
>  virt/kvm/Kconfig   |   6 +
>  virt/kvm/kvm_main.c| 469 +
>  23 files changed, 1323 insertions(+), 93 deletions(-)
>  create mode 100644 include/linux/restrictedmem.h
>  create mode 100644 mm/restrictedmem.c

-- 
Sincerely yours,
Mike.



[PATCH] linux-user: move target_flat.h to target subdirs

2023-01-28 Thread Mike Frysinger
This makes target_flat.h behave like every other target_xxx.h header.
It also makes it actually work -- while the current header says adding
a header to the target subdir overrides the common one, it doesn't.
This is for two reasons:
* meson.build adds -Ilinux-user before -Ilinux-user/$arch
* the compiler search path for "target_flat.h" looks in the same dir
  as the source file before searching -I paths.

This can be seen with the xtensa port -- the subdir settings aren't
used which breaks stack setup.

Move it to the generic/ subdir and add include stubs like every
other target_xxx.h header is handled.

Signed-off-by: Mike Frysinger 
---
 linux-user/aarch64/target_flat.h   | 1 +
 linux-user/arm/target_flat.h   | 1 +
 linux-user/{ => generic}/target_flat.h | 0
 linux-user/m68k/target_flat.h  | 1 +
 linux-user/microblaze/target_flat.h| 1 +
 linux-user/sh4/target_flat.h   | 1 +
 6 files changed, 5 insertions(+)
 create mode 100644 linux-user/aarch64/target_flat.h
 create mode 100644 linux-user/arm/target_flat.h
 rename linux-user/{ => generic}/target_flat.h (100%)
 create mode 100644 linux-user/m68k/target_flat.h
 create mode 100644 linux-user/microblaze/target_flat.h
 create mode 100644 linux-user/sh4/target_flat.h

diff --git a/linux-user/aarch64/target_flat.h b/linux-user/aarch64/target_flat.h
new file mode 100644
index ..bc83224cea12
--- /dev/null
+++ b/linux-user/aarch64/target_flat.h
@@ -0,0 +1 @@
+#include "../generic/target_flat.h"
diff --git a/linux-user/arm/target_flat.h b/linux-user/arm/target_flat.h
new file mode 100644
index ..bc83224cea12
--- /dev/null
+++ b/linux-user/arm/target_flat.h
@@ -0,0 +1 @@
+#include "../generic/target_flat.h"
diff --git a/linux-user/target_flat.h b/linux-user/generic/target_flat.h
similarity index 100%
rename from linux-user/target_flat.h
rename to linux-user/generic/target_flat.h
diff --git a/linux-user/m68k/target_flat.h b/linux-user/m68k/target_flat.h
new file mode 100644
index ..bc83224cea12
--- /dev/null
+++ b/linux-user/m68k/target_flat.h
@@ -0,0 +1 @@
+#include "../generic/target_flat.h"
diff --git a/linux-user/microblaze/target_flat.h 
b/linux-user/microblaze/target_flat.h
new file mode 100644
index ..bc83224cea12
--- /dev/null
+++ b/linux-user/microblaze/target_flat.h
@@ -0,0 +1 @@
+#include "../generic/target_flat.h"
diff --git a/linux-user/sh4/target_flat.h b/linux-user/sh4/target_flat.h
new file mode 100644
index ..bc83224cea12
--- /dev/null
+++ b/linux-user/sh4/target_flat.h
@@ -0,0 +1 @@
+#include "../generic/target_flat.h"
-- 
2.39.0




[PATCH v2] linux-user: fix strace build w/out munlockall

2023-01-18 Thread Mike Frysinger
Signed-off-by: Mike Frysinger 
Reviewed-by: Philippe Mathieu-Daudé 
---
 linux-user/strace.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/linux-user/strace.c b/linux-user/strace.c
index 9ae5a812cd71..11a7c3df9498 100644
--- a/linux-user/strace.c
+++ b/linux-user/strace.c
@@ -1380,7 +1380,8 @@ UNUSED static struct flags termios_lflags[] = {
 FLAG_END,
 };
 
-UNUSED static struct flags mlockall_flags[] = {
+#ifdef TARGET_NR_mlockall
+static struct flags mlockall_flags[] = {
 FLAG_TARGET(MCL_CURRENT),
 FLAG_TARGET(MCL_FUTURE),
 #ifdef MCL_ONFAULT
@@ -1388,6 +1389,7 @@ UNUSED static struct flags mlockall_flags[] = {
 #endif
 FLAG_END,
 };
+#endif
 
 /* IDs of the various system clocks */
 #define TARGET_CLOCK_REALTIME  0
-- 
2.39.0




[PATCH] configure: do not quote $PKG_CONFIG

2023-01-17 Thread Mike Frysinger
We should not quote the PKG_CONFIG setting as this deviates from the
canonical upstream behavior that gets integrated with all other build
systems, and deviates from how we treat all other toolchain variables
that we get from the environment.

Ultimately, the point is that it breaks passing custom flags directly
to pkg-config via the env var where this normally works elsewhere,
and it used to work in the past.

Signed-off-by: Mike Frysinger 
---
 configure | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index 9e407ce2e3a9..b5a19d1319a5 100755
--- a/configure
+++ b/configure
@@ -369,7 +369,7 @@ windres="${WINDRES-${cross_prefix}windres}"
 windmc="${WINDMC-${cross_prefix}windmc}"
 pkg_config_exe="${PKG_CONFIG-${cross_prefix}pkg-config}"
 query_pkg_config() {
-"${pkg_config_exe}" ${QEMU_PKG_CONFIG_FLAGS} "$@"
+${pkg_config_exe} ${QEMU_PKG_CONFIG_FLAGS} "$@"
 }
 pkg_config=query_pkg_config
 sdl2_config="${SDL2_CONFIG-${cross_prefix}sdl2-config}"
@@ -1430,7 +1430,7 @@ fi
 ##
 # pkg-config probe
 
-if ! has "$pkg_config_exe"; then
+if ! has $pkg_config_exe; then
   error_exit "pkg-config binary '$pkg_config_exe' not found"
 fi
 
-- 
2.39.0




[PATCH] linux-user: fix strace build w/out munlockall

2023-01-17 Thread Mike Frysinger
Signed-off-by: Mike Frysinger 
---
 linux-user/strace.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/linux-user/strace.c b/linux-user/strace.c
index 9ae5a812cd71..f7912ad67f2b 100644
--- a/linux-user/strace.c
+++ b/linux-user/strace.c
@@ -1380,6 +1380,7 @@ UNUSED static struct flags termios_lflags[] = {
 FLAG_END,
 };
 
+#ifdef TARGET_NR_mlockall
 UNUSED static struct flags mlockall_flags[] = {
 FLAG_TARGET(MCL_CURRENT),
 FLAG_TARGET(MCL_FUTURE),
@@ -1388,6 +1389,7 @@ UNUSED static struct flags mlockall_flags[] = {
 #endif
 FLAG_END,
 };
+#endif
 
 /* IDs of the various system clocks */
 #define TARGET_CLOCK_REALTIME  0
-- 
2.39.0




Re: [PATCH 7/7] hw/mem/cxl_type3: Add CXL RAS Error Injection Support.

2023-01-15 Thread Mike Maslenkin
On Fri, Jan 13, 2023 at 7:43 PM Jonathan Cameron via
 wrote:
>
> CXL uses PCI AER Internal errors to signal to the host that an error has
> occurred. The host can then read more detailed status from the CXL RAS
> capability.
>
> For uncorrectable errors: support multiple injection in one operation
> as this is needed to reliably test multiple header logging support in an
> OS. The equivalent feature doesn't exist for correctable errors, so only
> one error need be injected at a time.
>
> Note:
>  - Header content needs to be manually specified in a fashion that
>matches the specification for what can be in the header for each
>error type.
>
> Injection via QMP:
> { "execute": "qmp_capabilities" }
> ...
> { "execute": "cxl-inject-uncorrectable-errors",
>   "arguments": {
> "path": "/machine/peripheral/cxl-pmem0",
> "errors": [
> {
> "type": "cache-address-parity",
> "header": [ 3, 4]
> },
> {
> "type": "cache-data-parity",
> "header": 
> [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
> },
> {
> "type": "internal",
> "header": [ 1, 2, 4]
> }
> ]
>   }}
> ...
> { "execute": "cxl-inject-correctable-error",
> "arguments": {
> "path": "/machine/peripheral/cxl-pmem0",
> "type": "physical",
> "header": [ 3, 4]
> } }
>
> Signed-off-by: Jonathan Cameron 
> ---
>  hw/cxl/cxl-component-utils.c   |   4 +-
>  hw/mem/cxl_type3.c | 290 +
>  hw/mem/cxl_type3_stubs.c   |  10 ++
>  hw/mem/meson.build |   2 +
>  include/hw/cxl/cxl_component.h |  26 +++
>  include/hw/cxl/cxl_device.h|  11 ++
>  qapi/cxl.json  | 113 +
>  qapi/meson.build   |   1 +
>  qapi/qapi-schema.json  |   1 +
>  9 files changed, 457 insertions(+), 1 deletion(-)
>
> diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c
> index 3edd303a33..02fb6c17b9 100644
> --- a/hw/cxl/cxl-component-utils.c
> +++ b/hw/cxl/cxl-component-utils.c
> @@ -142,16 +142,18 @@ static void ras_init_common(uint32_t *reg_state, 
> uint32_t *write_msk)
>   * be handled as RO.
>   */
>  reg_state[R_CXL_RAS_UNC_ERR_STATUS] = 0;
> +write_msk[R_CXL_RAS_UNC_ERR_STATUS] = 0x1cfff;
>  /* Bits 12-13 and 17-31 reserved in CXL 2.0 */
>  reg_state[R_CXL_RAS_UNC_ERR_MASK] = 0x1cfff;
>  write_msk[R_CXL_RAS_UNC_ERR_MASK] = 0x1cfff;
>  reg_state[R_CXL_RAS_UNC_ERR_SEVERITY] = 0x1cfff;
>  write_msk[R_CXL_RAS_UNC_ERR_SEVERITY] = 0x1cfff;
>  reg_state[R_CXL_RAS_COR_ERR_STATUS] = 0;
> +write_msk[R_CXL_RAS_COR_ERR_STATUS] = 0x7f;
>  reg_state[R_CXL_RAS_COR_ERR_MASK] = 0x7f;
>  write_msk[R_CXL_RAS_COR_ERR_MASK] = 0x7f;
>  /* CXL switches and devices must set */
> -reg_state[R_CXL_RAS_ERR_CAP_CTRL] = 0x00;
> +reg_state[R_CXL_RAS_ERR_CAP_CTRL] = 0x200;
>  }
>
>  static void hdm_init_common(uint32_t *reg_state, uint32_t *write_msk,
> diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
> index 6cdd988d1d..ae8fd09e87 100644
> --- a/hw/mem/cxl_type3.c
> +++ b/hw/mem/cxl_type3.c
> @@ -1,6 +1,7 @@
>  #include "qemu/osdep.h"
>  #include "qemu/units.h"
>  #include "qemu/error-report.h"
> +#include "qapi/qapi-commands-cxl.h"
>  #include "hw/mem/memory-device.h"
>  #include "hw/mem/pc-dimm.h"
>  #include "hw/pci/pci.h"
> @@ -323,6 +324,66 @@ static void hdm_decoder_commit(CXLType3Dev *ct3d, int 
> which)
>  ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, COMMITTED, 1);
>  }
>
> +static int ct3d_qmp_uncor_err_to_cxl(CxlUncorErrorType qmp_err)
> +{
> +switch (qmp_err) {
> +case CXL_UNCOR_ERROR_TYPE_CACHE_DATA_PARITY:
> +return CXL_RAS_UNC_ERR_CACHE_DATA_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_CACHE_ADDRESS_PARITY:
> +return CXL_RAS_UNC_ERR_CACHE_ADDRESS_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_CACHE_BE_PARITY:
> +return CXL_RAS_UNC_ERR_CACHE_BE_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_CACHE_DATA_ECC:
> +return CXL_RAS_UNC_ERR_CACHE_DATA_ECC;
> +case CXL_UNCOR_ERROR_TYPE_MEM_DATA_PARITY:
> +return CXL_RAS_UNC_ERR_MEM_DATA_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_MEM_ADDRESS_PARITY:
> +return CXL_RAS_UNC_ERR_MEM_ADDRESS_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_MEM_BE_PARITY:
> +return CXL_RAS_UNC_ERR_MEM_BE_PARITY;
> +case CXL_UNCOR_ERROR_TYPE_MEM_DATA_ECC:
> +return CXL_RAS_UNC_ERR_MEM_DATA_ECC;
> +case CXL_UNCOR_ERROR_TYPE_REINIT_THRESHOLD:
> +return CXL_RAS_UNC_ERR_REINIT_THRESHOLD;
> +case CXL_UNCOR_ERROR_TYPE_RSVD_ENCODING:
> +return CXL_RAS_UNC_ERR_RSVD_ENCODING;
> +case CXL_UNCOR_ERROR_TYPE_POISON_RECEIVED:
> +return CXL_RAS_UNC_ERR_POISON_RECEIVED;
> +case CXL_UNCOR_ERROR_TYPE_RECEIVER_OVERFLOW:
> +return CXL_RAS_UNC_ERR_RECEIVER_OVERFLOW;
> +case 

[PATCH v3 3/3] vdpa: commit all host notifier MRs in a single MR transaction

2022-12-26 Thread Longpeng(Mike)
From: Longpeng 

This allows the vhost-vdpa device to batch the setup of all its MRs of
host notifiers.

This significantly reduces the device starting time, e.g. the time spent
setting up the host notifier MRs drops from 423ms to 32ms for a VM with
64 vCPUs and 3 vhost-vDPA generic devices (vdpa_sim_blk, 64vq per device).

Signed-off-by: Longpeng 
---
 hw/virtio/vhost-vdpa.c | 25 +++--
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index fd0c33b0e1..870265188a 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -512,9 +512,18 @@ static void vhost_vdpa_host_notifiers_uninit(struct 
vhost_dev *dev, int n)
 {
 int i;
 
+/*
+ * Pack all the changes to the memory regions in a single
+ * transaction to avoid repeated updates of the address space
+ * topology.
+ */
+memory_region_transaction_begin();
+
 for (i = dev->vq_index; i < dev->vq_index + n; i++) {
 vhost_vdpa_host_notifier_uninit(dev, i);
 }
+
+memory_region_transaction_commit();
 }
 
 static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
@@ -527,17 +536,21 @@ static void vhost_vdpa_host_notifiers_init(struct 
vhost_dev *dev)
 return;
 }
 
+/*
+ * Pack all the changes to the memory regions in a single
+ * transaction to avoid repeated updates of the address space
+ * topology.
+ */
+memory_region_transaction_begin();
+
 for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
 if (vhost_vdpa_host_notifier_init(dev, i)) {
-goto err;
+vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
+break;
 }
 }
 
-return;
-
-err:
-vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
-return;
+memory_region_transaction_commit();
 }
 
 static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
-- 
2.23.0




[PATCH v3 2/3] vhost: configure all host notifiers in a single MR transaction

2022-12-26 Thread Longpeng(Mike)
From: Longpeng 

This allows the vhost device to batch the setup of all its host notifiers.
This significantly reduces the device starting time, e.g. the time spent
enabling notifiers drops from 376ms to 9.1ms for a VM with 64 vCPUs
and 3 vhost-vDPA generic devices (vdpa_sim_blk, 64vq per device).

Signed-off-by: Longpeng 
---
 hw/virtio/vhost.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 5994559da8..064d4abe5c 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1562,16 +1562,25 @@ int vhost_dev_enable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
 return r;
 }
 
+/*
+ * Batch all the host notifiers in a single transaction to avoid
+ * quadratic time complexity in address_space_update_ioeventfds().
+ */
+memory_region_transaction_begin();
+
 for (i = 0; i < hdev->nvqs; ++i) {
 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
  true);
 if (r < 0) {
 error_report("vhost VQ %d notifier binding failed: %d", i, -r);
+memory_region_transaction_commit();
 vhost_dev_disable_notifiers(hdev, vdev);
 return r;
 }
 }
 
+memory_region_transaction_commit();
+
 return 0;
 }
 
@@ -1585,6 +1594,12 @@ void vhost_dev_disable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
 int i, r;
 
+/*
+ * Batch all the host notifiers in a single transaction to avoid
+ * quadratic time complexity in address_space_update_ioeventfds().
+ */
+memory_region_transaction_begin();
+
 for (i = 0; i < hdev->nvqs; ++i) {
 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
  false);
@@ -1592,6 +1607,15 @@ void vhost_dev_disable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
 error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
 }
 assert (r >= 0);
+}
+
+/*
+ * The transaction expects the ioeventfds to be open when it
+ * commits. Do it now, before the cleanup loop.
+ */
+memory_region_transaction_commit();
+
+for (i = 0; i < hdev->nvqs; ++i) {
 virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
 }
 virtio_device_release_ioeventfd(vdev);
-- 
2.23.0




[PATCH v3 1/3] vhost: simplify vhost_dev_enable_notifiers

2022-12-26 Thread Longpeng(Mike)
From: Longpeng 

Simplify the error path in vhost_dev_enable_notifiers by using
vhost_dev_disable_notifiers directly.

Signed-off-by: Longpeng 
---
 hw/virtio/vhost.c | 20 
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index fdcd1a8fdf..5994559da8 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1551,7 +1551,7 @@ void vhost_dev_cleanup(struct vhost_dev *hdev)
 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
 {
 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
-int i, r, e;
+int i, r;
 
 /* We will pass the notifiers to the kernel, make sure that QEMU
  * doesn't interfere.
@@ -1559,7 +1559,7 @@ int vhost_dev_enable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
 r = virtio_device_grab_ioeventfd(vdev);
 if (r < 0) {
 error_report("binding does not support host notifiers");
-goto fail;
+return r;
 }
 
 for (i = 0; i < hdev->nvqs; ++i) {
@@ -1567,24 +1567,12 @@ int vhost_dev_enable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
  true);
 if (r < 0) {
 error_report("vhost VQ %d notifier binding failed: %d", i, -r);
-goto fail_vq;
+vhost_dev_disable_notifiers(hdev, vdev);
+return r;
 }
 }
 
 return 0;
-fail_vq:
-while (--i >= 0) {
-e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
- false);
-if (e < 0) {
-error_report("vhost VQ %d notifier cleanup error: %d", i, -r);
-}
-assert (e >= 0);
-virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
-}
-virtio_device_release_ioeventfd(vdev);
-fail:
-return r;
 }
 
 /* Stop processing guest IO notifications in vhost.
-- 
2.23.0




[PATCH v3 0/3] two optimizations to speed up the start time

2022-12-26 Thread Longpeng(Mike)
From: Longpeng 

Changes v3->v2:
 - cleanup the code [Philippe]

Changes v2->v1:
 Patch-1:
  - remove vq_init_count [Jason]
 Patch-2:
  - new added. [Jason]

v1: https://www.mail-archive.com/qemu-devel@nongnu.org/msg922499.html

Longpeng (Mike) (3):
  vhost: simplify vhost_dev_enable_notifiers
  vhost: configure all host notifiers in a single MR transaction
  vdpa: commit all host notifier MRs in a single MR transaction

 hw/virtio/vhost-vdpa.c | 25 ++--
 hw/virtio/vhost.c  | 44 +++---
 2 files changed, 47 insertions(+), 22 deletions(-)

-- 
2.23.0




[PATCH 1/2] vdpa-dev: get iova range explicitly

2022-12-24 Thread Longpeng(Mike)
From: Longpeng 

In commit a585fad26b ("vdpa: request iova_range only once") we removed
GET_IOVA_RANGE from vhost_vdpa_init, so the generic vdpa device will start
without iova_range populated and the device won't work. Let's call the
GET_IOVA_RANGE ioctl explicitly.

Fixes: a585fad26b2e6ccc ("vdpa: request iova_range only once")
Signed-off-by: Longpeng 
---
 hw/virtio/vdpa-dev.c   | 9 +
 hw/virtio/vhost-vdpa.c | 7 +++
 include/hw/virtio/vhost-vdpa.h | 2 ++
 net/vhost-vdpa.c   | 8 
 4 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
index db6ba61152..01b41eb0f1 100644
--- a/hw/virtio/vdpa-dev.c
+++ b/hw/virtio/vdpa-dev.c
@@ -53,6 +53,7 @@ static void vhost_vdpa_device_realize(DeviceState *dev, Error 
**errp)
 {
 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
 VhostVdpaDevice *v = VHOST_VDPA_DEVICE(vdev);
+struct vhost_vdpa_iova_range iova_range;
 uint16_t max_queue_size;
 struct vhost_virtqueue *vqs;
 int i, ret;
@@ -108,6 +109,14 @@ static void vhost_vdpa_device_realize(DeviceState *dev, 
Error **errp)
 v->dev.backend_features = 0;
 v->started = false;
 
+ret = vhost_vdpa_get_iova_range(v->vhostfd, &iova_range);
+if (ret < 0) {
+error_setg(errp, "vhost-vdpa-device: get iova range failed: %s",
+   strerror(-ret));
+goto free_vqs;
+}
+v->vdpa.iova_range = iova_range;
+
 ret = vhost_dev_init(&v->dev, &v->vdpa, VHOST_BACKEND_TYPE_VDPA, 0, NULL);
 if (ret < 0) {
 error_setg(errp, "vhost-vdpa-device: vhost initialization failed: %s",
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 870265188a..109a2ee3bf 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -378,6 +378,13 @@ static int vhost_vdpa_add_status(struct vhost_dev *dev, 
uint8_t status)
 return 0;
 }
 
+int vhost_vdpa_get_iova_range(int fd, struct vhost_vdpa_iova_range *iova_range)
+{
+int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range);
+
+return ret < 0 ? -errno : 0;
+}
+
 /*
  * The use of this function is for requests that only need to be
  * applied once. Typically such request occurs at the beginning
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index 45b969a311..7997f09a8d 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -51,6 +51,8 @@ typedef struct vhost_vdpa {
 VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
 } VhostVDPA;
 
+int vhost_vdpa_get_iova_range(int fd, struct vhost_vdpa_iova_range 
*iova_range);
+
 int vhost_vdpa_dma_map(struct vhost_vdpa *v, uint32_t asid, hwaddr iova,
hwaddr size, void *vaddr, bool readonly);
 int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, uint32_t asid, hwaddr iova,
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index d36664f33a..ffdc435d19 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -702,14 +702,6 @@ static NetClientState *net_vhost_vdpa_init(NetClientState 
*peer,
 return nc;
 }
 
-static int vhost_vdpa_get_iova_range(int fd,
- struct vhost_vdpa_iova_range *iova_range)
-{
-int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range);
-
-return ret < 0 ? -errno : 0;
-}
-
 static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
 {
 int ret = ioctl(fd, VHOST_GET_FEATURES, features);
-- 
2.23.0




[PATCH 2/2] vdpa: harden the error path if get_iova_range failed

2022-12-24 Thread Longpeng(Mike)
From: Longpeng 

We should stop if the GET_IOVA_RANGE ioctl failed.

Signed-off-by: Longpeng 
---
 net/vhost-vdpa.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index ffdc435d19..e65023d013 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -797,7 +797,13 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char 
*name,
 return queue_pairs;
 }
 
-vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
+r = vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
+if (unlikely(r < 0)) {
+error_setg(errp, "vhost-vdpa: get iova range failed: %s",
+   strerror(-r));
+goto err;
+}
+
 if (opts->x_svq) {
 if (!vhost_vdpa_net_valid_svq_features(features, errp)) {
 goto err_svq;
-- 
2.23.0




[PATCH 0/2] Fix the init path of generic vhost-vdpa device

2022-12-24 Thread Longpeng(Mike)
From: Longpeng 

The generic vhost-vdpa device and commit a585fad26b ("vdpa: request
iova_range only once") were merged in the same pull request, and the latter
would cause the generic vhost-vdpa device to work improperly.

Patch 1 fixes the problem and patch 2 hardens the error path of vdpa/net.

Longpeng (Mike) (2):
  vdpa-dev: get iova range explicitly
  vdpa: harden the error path if get_iova_range failed

 hw/virtio/vdpa-dev.c   |  9 +
 hw/virtio/vhost-vdpa.c |  7 +++
 include/hw/virtio/vhost-vdpa.h |  2 ++
 net/vhost-vdpa.c   | 16 +++-
 4 files changed, 25 insertions(+), 9 deletions(-)

-- 
2.23.0




[PATCH v11 3/5] vdpa: add vdpa-dev-pci support

2022-12-15 Thread Longpeng(Mike)
From: Longpeng 

This supports vdpa-dev-pci; we can use the device as follows:

-device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-X

Reviewed-by: Stefano Garzarella 
Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/meson.build|   1 +
 hw/virtio/vdpa-dev-pci.c | 102 +++
 2 files changed, 103 insertions(+)
 create mode 100644 hw/virtio/vdpa-dev-pci.c

diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index 54d6d29af7..559b80cb28 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -57,6 +57,7 @@ virtio_pci_ss.add(when: 'CONFIG_VIRTIO_SERIAL', if_true: 
files('virtio-serial-pc
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_PMEM', if_true: 
files('virtio-pmem-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: 
files('virtio-iommu-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: 
files('virtio-mem-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: 
files('vdpa-dev-pci.c'))
 
 virtio_ss.add_all(when: 'CONFIG_VIRTIO_PCI', if_true: virtio_pci_ss)
 
diff --git a/hw/virtio/vdpa-dev-pci.c b/hw/virtio/vdpa-dev-pci.c
new file mode 100644
index 00..5446e6b393
--- /dev/null
+++ b/hw/virtio/vdpa-dev-pci.c
@@ -0,0 +1,102 @@
+/*
+ * Vhost Vdpa Device PCI Bindings
+ *
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022. All Rights Reserved.
+ *
+ * Authors:
+ *   Longpeng 
+ *
+ * Largely based on the "vhost-user-blk-pci.c" and "vhost-user-blk.c"
+ * implemented by:
+ *   Changpeng Liu 
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include <sys/ioctl.h>
+#include <linux/vhost.h>
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/vdpa-dev.h"
+#include "hw/pci/pci.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qom/object.h"
+
+
+typedef struct VhostVdpaDevicePCI VhostVdpaDevicePCI;
+
+#define TYPE_VHOST_VDPA_DEVICE_PCI "vhost-vdpa-device-pci-base"
+DECLARE_INSTANCE_CHECKER(VhostVdpaDevicePCI, VHOST_VDPA_DEVICE_PCI,
+ TYPE_VHOST_VDPA_DEVICE_PCI)
+
+struct VhostVdpaDevicePCI {
+VirtIOPCIProxy parent_obj;
+VhostVdpaDevice vdev;
+};
+
+static void vhost_vdpa_device_pci_instance_init(Object *obj)
+{
+VhostVdpaDevicePCI *dev = VHOST_VDPA_DEVICE_PCI(obj);
+
+virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+TYPE_VHOST_VDPA_DEVICE);
+object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+  "bootindex");
+}
+
+static Property vhost_vdpa_device_pci_properties[] = {
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static int vhost_vdpa_device_pci_post_init(VhostVdpaDevice *v, Error **errp)
+{
+VhostVdpaDevicePCI *dev = container_of(v, VhostVdpaDevicePCI, vdev);
+VirtIOPCIProxy *vpci_dev = &dev->parent_obj;
+
+vpci_dev->class_code = virtio_pci_get_class_id(v->vdev_id);
+vpci_dev->trans_devid = virtio_pci_get_trans_devid(v->vdev_id);
+/* one for config vector */
+vpci_dev->nvectors = v->num_queues + 1;
+
+return 0;
+}
+
+static void
+vhost_vdpa_device_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+VhostVdpaDevicePCI *dev = VHOST_VDPA_DEVICE_PCI(vpci_dev);
+
+dev->vdev.post_init = vhost_vdpa_device_pci_post_init;
+qdev_realize(DEVICE(&dev->vdev), BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_vdpa_device_pci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+
+set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+device_class_set_props(dc, vhost_vdpa_device_pci_properties);
+k->realize = vhost_vdpa_device_pci_realize;
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_vdpa_device_pci_info = {
+.base_name   = TYPE_VHOST_VDPA_DEVICE_PCI,
+.generic_name= "vhost-vdpa-device-pci",
+.transitional_name   = "vhost-vdpa-device-pci-transitional",
+.non_transitional_name   = "vhost-vdpa-device-pci-non-transitional",
+.instance_size  = sizeof(VhostVdpaDevicePCI),
+.instance_init  = vhost_vdpa_device_pci_instance_init,
+.class_init = vhost_vdpa_device_pci_class_init,
+};
+
+static void vhost_vdpa_device_pci_register(void)
+{
+virtio_pci_types_register(&vhost_vdpa_device_pci_info);
+}
+
+type_init(vhost_vdpa_device_pci_register);
-- 
2.23.0




[PATCH v11 1/5] virtio: get class_id and pci device id by the virtio id

2022-12-15 Thread Longpeng(Mike)
From: Longpeng 

Add helpers to get the "Transitional PCI Device ID" and "class_id"
of the device specified by the "Virtio Device ID".

These helpers will be used to build the generic vDPA device later.

Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/virtio-pci.c | 88 ++
 include/hw/virtio/virtio-pci.h |  5 ++
 2 files changed, 93 insertions(+)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index a1c9dfa7bb..a602f670ca 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -19,6 +19,7 @@
 
 #include "exec/memop.h"
 #include "standard-headers/linux/virtio_pci.h"
+#include "standard-headers/linux/virtio_ids.h"
 #include "hw/boards.h"
 #include "hw/virtio/virtio.h"
 #include "migration/qemu-file-types.h"
@@ -224,6 +225,90 @@ static int virtio_pci_load_queue(DeviceState *d, int n, 
QEMUFile *f)
 return 0;
 }
 
+typedef struct VirtIOPCIIDInfo {
+/* virtio id */
+uint16_t vdev_id;
+/* pci device id for the transitional device */
+uint16_t trans_devid;
+uint16_t class_id;
+} VirtIOPCIIDInfo;
+
+static const VirtIOPCIIDInfo virtio_pci_id_info[] = {
+{
+.vdev_id = VIRTIO_ID_CRYPTO,
+.class_id = PCI_CLASS_OTHERS,
+}, {
+.vdev_id = VIRTIO_ID_FS,
+.class_id = PCI_CLASS_STORAGE_OTHER,
+}, {
+.vdev_id = VIRTIO_ID_NET,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_NET,
+.class_id = PCI_CLASS_NETWORK_ETHERNET,
+}, {
+.vdev_id = VIRTIO_ID_BLOCK,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_BLOCK,
+.class_id = PCI_CLASS_STORAGE_SCSI,
+}, {
+.vdev_id = VIRTIO_ID_CONSOLE,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_CONSOLE,
+.class_id = PCI_CLASS_COMMUNICATION_OTHER,
+}, {
+.vdev_id = VIRTIO_ID_SCSI,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_SCSI,
+.class_id = PCI_CLASS_STORAGE_SCSI
+}, {
+.vdev_id = VIRTIO_ID_9P,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_9P,
+.class_id = PCI_BASE_CLASS_NETWORK,
+}, {
+.vdev_id = VIRTIO_ID_BALLOON,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_BALLOON,
+.class_id = PCI_CLASS_OTHERS,
+}, {
+.vdev_id = VIRTIO_ID_RNG,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_RNG,
+.class_id = PCI_CLASS_OTHERS,
+},
+};
+
+static const VirtIOPCIIDInfo *virtio_pci_get_id_info(uint16_t vdev_id)
+{
+const VirtIOPCIIDInfo *info = NULL;
+int i;
+
+for (i = 0; i < ARRAY_SIZE(virtio_pci_id_info); i++) {
+if (virtio_pci_id_info[i].vdev_id == vdev_id) {
+info = &virtio_pci_id_info[i];
+break;
+}
+}
+
+if (!info) {
+/* The device id is invalid or not added to the id_info yet. */
+error_report("Invalid virtio device(id %u)", vdev_id);
+abort();
+}
+
+return info;
+}
+
+/*
+ * Get the Transitional Device ID for the specific device, return
+ * zero if the device is non-transitional.
+ */
+uint16_t virtio_pci_get_trans_devid(uint16_t device_id)
+{
+return virtio_pci_get_id_info(device_id)->trans_devid;
+}
+
+/*
+ * Get the Class ID for the specific device.
+ */
+uint16_t virtio_pci_get_class_id(uint16_t device_id)
+{
+return virtio_pci_get_id_info(device_id)->class_id;
+}
+
 static bool virtio_pci_ioeventfd_enabled(DeviceState *d)
 {
 VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
@@ -1729,6 +1814,9 @@ static void virtio_pci_device_plugged(DeviceState *d, 
Error **errp)
  * is set to PCI_SUBVENDOR_ID_REDHAT_QUMRANET by default.
  */
 pci_set_word(config + PCI_SUBSYSTEM_ID, virtio_bus_get_vdev_id(bus));
+if (proxy->trans_devid) {
+pci_config_set_device_id(config, proxy->trans_devid);
+}
 } else {
 /* pure virtio-1.0 */
 pci_set_word(config + PCI_VENDOR_ID,
diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h
index 938799e8f6..24fba1604b 100644
--- a/include/hw/virtio/virtio-pci.h
+++ b/include/hw/virtio/virtio-pci.h
@@ -151,6 +151,8 @@ struct VirtIOPCIProxy {
 bool disable_modern;
 bool ignore_backend_features;
 OnOffAuto disable_legacy;
+/* Transitional device id */
+uint16_t trans_devid;
 uint32_t class_code;
 uint32_t nvectors;
 uint32_t dfselect;
@@ -184,6 +186,9 @@ static inline void virtio_pci_disable_modern(VirtIOPCIProxy 
*proxy)
 proxy->disable_modern = true;
 }
 
+uint16_t virtio_pci_get_trans_devid(uint16_t device_id);
+uint16_t virtio_pci_get_class_id(uint16_t device_id);
+
 /*
  * virtio-input-pci: This extends VirtioPCIProxy.
  */
-- 
2.23.0




[PATCH v11 4/5] vdpa-dev: mark the device as unmigratable

2022-12-15 Thread Longpeng(Mike)
From: Longpeng 

The generic vDPA device doesn't support migration currently, so
mark it as unmigratable temporarily.

Reviewed-by: Stefano Garzarella 
Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/vdpa-dev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
index dbc4f8001d..db6ba61152 100644
--- a/hw/virtio/vdpa-dev.c
+++ b/hw/virtio/vdpa-dev.c
@@ -327,6 +327,7 @@ static Property vhost_vdpa_device_properties[] = {
 
 static const VMStateDescription vmstate_vhost_vdpa_device = {
 .name = "vhost-vdpa-device",
+.unmigratable = 1,
 .minimum_version_id = 1,
 .version_id = 1,
 .fields = (VMStateField[]) {
-- 
2.23.0




[PATCH v11 0/5] add generic vDPA device support

2022-12-15 Thread Longpeng(Mike)
From: Longpeng 

Hi guys,

With the generic vDPA device, QEMU won't need to touch the device
types any more, just like vfio.

We can use the generic vDPA device as follow:
  -device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-X
  Or
  -M microvm -m 512m -smp 2 -kernel ... -initrd ... -device \
  vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-x

Changes v11->v10:
- add requirements, optimize the preparation [Stefano]

Changes v10 -> v9:
- optimize the doc [Jason]

Changes v9 -> v8:
- rename vhost-vdpa-device.rst to vhost-vdpa-generic-device.rst [Jason, 
Stefano]
- emphasize the vhost-vDPA generic device in doc [Jason]

Changes v8 -> v7:
- add migration blocker. [Michael]

Changes v6 -> v7:
(v6: https://mail.gnu.org/archive/html/qemu-devel/2022-05/msg02821.html)
- rebase. [Jason]
- add documentation . [Stefan]

Changes v5 -> v6:
  Patch 2:
- Turn to the original approach in the RFC to initialize the
  virtio_pci_id_info array. [Michael]
  https://lore.kernel.org/all/20220105005900.860-2-longpe...@huawei.com/
  Patch 3:
- Fix logical error of exception handler around the post_init.
  [Stefano]
- Fix some coding style warnings. [Stefano]
  Patch 4:
- Fix some coding style warnings. [Stefano]

Changes v4 -> v5:
  Patch 3:
- remove vhostfd [Jason]
- support virtio-mmio [Jason]

Changes v3 -> v4:
  v3: https://www.mail-archive.com/qemu-devel@nongnu.org/msg877015.html
  - reorganize the series [Stefano]
  - fix some typos [Stefano]
  - fix logical error in vhost_vdpa_device_realize [Stefano]

Changes v2 -> v3
  Patch 4 & 5:
- only call vdpa ioctls in vdpa-dev.c [Stefano, Longpeng]
- s/VQS_NUM/VQS_COUNT  [Stefano]
- check both vdpa_dev_fd and vdpa_dev [Stefano]
  Patch 6:
- move all steps into vhost_vdpa_device_unrealize. [Stefano]

Changes RFC -> v2
  Patch 1:
- rename 'pdev_id' to 'trans_devid'  [Michael]
- only use transitional device id for the devices
  listed in the spec  [Michael]
- use macros to make the id_info table clearer  [Longpeng]
- add some modern devices in the id_info table  [Longpeng]
  Patch 2:
- remove the GET_VECTORS_NUM command  [Jason]
  Patch 4:
- expose vdpa_dev_fd as a QOM preperty  [Stefan]
- introduce vhost_vdpa_device_get_u32 as a common
  function to make the code clearer  [Stefan]
- fix the misleading description of 'dc->desc'  [Stefano]
  Patch 5:
- check returned number of virtqueues  [Stefan]
  Patch 6:
- init s->num_queues  [Stefano]
- free s->dev.vqs  [Stefano]

Longpeng (Mike) (5):
  virtio: get class_id and pci device id by the virtio id
  vdpa: add vdpa-dev support
  vdpa: add vdpa-dev-pci support
  vdpa-dev: mark the device as unmigratable
  docs: Add generic vhost-vdpa device documentation

 .../devices/vhost-vdpa-generic-device.rst |  68 
 hw/virtio/Kconfig |   5 +
 hw/virtio/meson.build |   2 +
 hw/virtio/vdpa-dev-pci.c  | 102 +
 hw/virtio/vdpa-dev.c  | 377 ++
 hw/virtio/virtio-pci.c|  88 
 include/hw/virtio/vdpa-dev.h  |  43 ++
 include/hw/virtio/virtio-pci.h|   5 +
 8 files changed, 690 insertions(+)
 create mode 100644 docs/system/devices/vhost-vdpa-generic-device.rst
 create mode 100644 hw/virtio/vdpa-dev-pci.c
 create mode 100644 hw/virtio/vdpa-dev.c
 create mode 100644 include/hw/virtio/vdpa-dev.h

-- 
2.23.0




[PATCH v11 5/5] docs: Add generic vhost-vdpa device documentation

2022-12-15 Thread Longpeng(Mike)
From: Longpeng 

Signed-off-by: Longpeng 
---
 .../devices/vhost-vdpa-generic-device.rst | 68 +++
 1 file changed, 68 insertions(+)
 create mode 100644 docs/system/devices/vhost-vdpa-generic-device.rst

diff --git a/docs/system/devices/vhost-vdpa-generic-device.rst 
b/docs/system/devices/vhost-vdpa-generic-device.rst
new file mode 100644
index 00..24c825ef1a
--- /dev/null
+++ b/docs/system/devices/vhost-vdpa-generic-device.rst
@@ -0,0 +1,68 @@
+
+=
+vhost-vDPA generic device
+=
+
+This document explains the usage of the vhost-vDPA generic device.
+
+
+Description
+---
+
+A vDPA (virtio data path acceleration) device is a device that uses a
+datapath which complies with the virtio specification, combined with a
+vendor-specific control path.
+
+QEMU provides two types of vhost-vDPA devices to enable the vDPA device, one
+is type sensitive which means QEMU needs to know the actual device type
+(e.g. net, blk, scsi) and another is called "vhost-vDPA generic device" which
+is type insensitive.
+
+The vhost-vDPA generic device builds on the vhost-vdpa subsystem and virtio
+subsystem. It is quite small, but it can support any type of virtio device.
+
+
+Requirements
+
+Linux 5.18+
+iproute2/vdpa 5.12.0+
+
+
+Examples
+
+
+1. Prepare the vhost-vDPA backends, here is an example using vdpa_sim_blk
+   device:
+
+::
+  host# modprobe vhost_vdpa
+  host# modprobe vdpa_sim_blk
+  host# vdpa dev add mgmtdev vdpasim_blk name blk0
+  (...you can see the vhost-vDPA device under /dev directory now...)
+  host# ls -l /dev/vhost-vdpa-*
+  crw------- 1 root root 236, 0 Nov  2 00:49 /dev/vhost-vdpa-0
+
+Note:
+If you're using real HW devices, some vendor-specific steps are needed to
+provision the vDPA device, such as loading the vendor-specific vDPA driver
+and binding the device to the driver.
+
+
+2. Start the virtual machine:
+
+Start QEMU with virtio-mmio bus:
+
+::
+  host# qemu-system  \
+  -M microvm -m 512 -smp 2 -kernel ... -initrd ...   \
+  -device vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-0   \
+  ...
+
+
+Start QEMU with virtio-pci bus:
+
+::
+  host# qemu-system  \
+  -M pc -m 512 -smp 2\
+  -device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-0   \
+  ...
-- 
2.23.0




[PATCH v11 2/5] vdpa: add vdpa-dev support

2022-12-15 Thread Longpeng(Mike)
From: Longpeng 

Supports vdpa-dev, so we can use the device directly:

-M microvm -m 512m -smp 2 -kernel ... -initrd ... -device \
vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-x

Reviewed-by: Stefano Garzarella 
Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/Kconfig|   5 +
 hw/virtio/meson.build|   1 +
 hw/virtio/vdpa-dev.c | 376 +++
 include/hw/virtio/vdpa-dev.h |  43 
 4 files changed, 425 insertions(+)
 create mode 100644 hw/virtio/vdpa-dev.c
 create mode 100644 include/hw/virtio/vdpa-dev.h

diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig
index cbfd8c7173..89e9e426d8 100644
--- a/hw/virtio/Kconfig
+++ b/hw/virtio/Kconfig
@@ -85,3 +85,8 @@ config VHOST_USER_GPIO
 bool
 default y
 depends on VIRTIO && VHOST_USER
+
+config VHOST_VDPA_DEV
+bool
+default y
+depends on VIRTIO && VHOST_VDPA && LINUX
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index dfed1e7af5..54d6d29af7 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -31,6 +31,7 @@ virtio_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: 
files('vhost-user-i2c.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: 
files('vhost-user-rng.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER_GPIO', if_true: 
files('vhost-user-gpio.c'))
 virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_GPIO'], if_true: 
files('vhost-user-gpio-pci.c'))
+virtio_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: files('vdpa-dev.c'))
 
 virtio_pci_ss = ss.source_set()
 virtio_pci_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: 
files('vhost-vsock-pci.c'))
diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
new file mode 100644
index 00..dbc4f8001d
--- /dev/null
+++ b/hw/virtio/vdpa-dev.c
@@ -0,0 +1,376 @@
+/*
+ * Vhost Vdpa Device
+ *
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022. All Rights Reserved.
+ *
+ * Authors:
+ *   Longpeng 
+ *
+ * Largely based on the "vhost-user-blk-pci.c" and "vhost-user-blk.c"
+ * implemented by:
+ *   Changpeng Liu 
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include 
+#include 
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/cutils.h"
+#include "hw/qdev-core.h"
+#include "hw/qdev-properties.h"
+#include "hw/qdev-properties-system.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "hw/virtio/vdpa-dev.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/runstate.h"
+
+static void
+vhost_vdpa_device_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+/* Nothing to do */
+}
+
+static uint32_t
+vhost_vdpa_device_get_u32(int fd, unsigned long int cmd, Error **errp)
+{
+uint32_t val = (uint32_t)-1;
+
+if (ioctl(fd, cmd, &val) < 0) {
+error_setg(errp, "vhost-vdpa-device: cmd 0x%lx failed: %s",
+   cmd, strerror(errno));
+}
+
+return val;
+}
+
+static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+VhostVdpaDevice *v = VHOST_VDPA_DEVICE(vdev);
+uint16_t max_queue_size;
+struct vhost_virtqueue *vqs;
+int i, ret;
+
+if (!v->vhostdev) {
+error_setg(errp, "vhost-vdpa-device: vhostdev are missing");
+return;
+}
+
+v->vhostfd = qemu_open(v->vhostdev, O_RDWR, errp);
+if (*errp) {
+return;
+}
+v->vdpa.device_fd = v->vhostfd;
+
+v->vdev_id = vhost_vdpa_device_get_u32(v->vhostfd,
+   VHOST_VDPA_GET_DEVICE_ID, errp);
+if (*errp) {
+goto out;
+}
+
+max_queue_size = vhost_vdpa_device_get_u32(v->vhostfd,
+   VHOST_VDPA_GET_VRING_NUM, errp);
+if (*errp) {
+goto out;
+}
+
+if (v->queue_size > max_queue_size) {
+error_setg(errp, "vhost-vdpa-device: invalid queue_size: %u (max:%u)",
+   v->queue_size, max_queue_size);
+goto out;
+} else if (!v->queue_size) {
+v->queue_size = max_queue_size;
+}
+
+v->num_queues = vhost_vdpa_device_get_u32(v->vhostfd,
+  VHOST_VDPA_GET_VQS_COUNT, errp);
+if (*errp) {
+goto out;
+}
+
+if (!v->num_queues || v->num_queues > VIRTIO_QUEUE_MAX) {
+error_setg(errp, "invalid number of virtqueues: %u (max:%u)",
+   v->num_queues, VIRTIO_QUEUE_MAX);
+goto out;
+}
+
+v->dev.nvqs = v->num_queues;
+vqs = g_new0(struct vhost_virtqueue, v->dev.nvqs);
+v->dev.vqs = vqs;
+v->dev.vq_index = 0;
+v->dev.vq_index_end = v->dev.nvqs;
+v->dev.backend_features = 0;
+v->started = false;
+
+ret = vhost_dev_init(&v->dev, &v->vdpa, VHOST_BACKEND_TYPE_VDPA, 0, NULL);
+if (ret < 0) {
+error_setg(errp, 
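
The realize path above probes the vhost-vdpa character device with a handful
of VHOST_VDPA_GET_* ioctls before any virtqueue is wired up. For readers who
want to poke at a backend by hand, here is a minimal standalone sketch of
that probing step; it assumes a /dev/vhost-vdpa-0 node and the ioctl
definitions from <linux/vhost.h> (Linux 5.18+ for VHOST_VDPA_GET_VQS_COUNT,
per the requirements listed in patch 5/5), and it is illustrative only:

    /* probe_vdpa.c - query a vhost-vdpa char device the same way
     * vhost_vdpa_device_realize() does. Sketch, error handling trimmed. */
    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/vhost.h>

    int main(void)
    {
        uint32_t dev_id = 0, vqs = 0;
        uint16_t vring_num = 0;
        int fd = open("/dev/vhost-vdpa-0", O_RDWR);

        if (fd < 0) {
            perror("open /dev/vhost-vdpa-0");
            return 1;
        }
        /* virtio device type, e.g. 1 = net, 2 = block */
        ioctl(fd, VHOST_VDPA_GET_DEVICE_ID, &dev_id);
        /* maximum size of a single virtqueue */
        ioctl(fd, VHOST_VDPA_GET_VRING_NUM, &vring_num);
        /* number of virtqueues the backend exposes */
        ioctl(fd, VHOST_VDPA_GET_VQS_COUNT, &vqs);

        printf("virtio id %u, max vq size %u, %u queues\n",
               dev_id, (unsigned)vring_num, vqs);
        close(fd);
        return 0;
    }

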

[PATCH v2 1/2] vhost: configure all host notifiers in a single MR transaction

2022-12-06 Thread Longpeng(Mike)
From: Longpeng 

This allows the vhost device to batch the setup of all its host notifiers.
This significantly reduces the device starting time; e.g. the time spent
on enabling notifiers is reduced from 376ms to 9.1ms for a VM with 64 vCPUs
and 3 vhost-vDPA generic devices[1] (64vq per device).

[1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg921541.html

Signed-off-by: Longpeng 
---
 hw/virtio/vhost.c | 40 ++--
 1 file changed, 38 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 7fb008bc9e..16f8391d86 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1507,7 +1507,7 @@ void vhost_dev_cleanup(struct vhost_dev *hdev)
 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
 {
 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
-int i, r, e;
+int i, n, r, e;
 
 /* We will pass the notifiers to the kernel, make sure that QEMU
  * doesn't interfere.
@@ -1518,6 +1518,12 @@ int vhost_dev_enable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
 goto fail;
 }
 
+/*
+ * Batch all the host notifiers in a single transaction to avoid
+ * quadratic time complexity in address_space_update_ioeventfds().
+ */
+memory_region_transaction_begin();
+
 for (i = 0; i < hdev->nvqs; ++i) {
 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
  true);
@@ -1527,8 +1533,12 @@ int vhost_dev_enable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
 }
 }
 
+memory_region_transaction_commit();
+
 return 0;
 fail_vq:
+/* save i for a second iteration after transaction is committed. */
+n = i;
 while (--i >= 0) {
 e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
  false);
@@ -1536,8 +1546,18 @@ fail_vq:
 error_report("vhost VQ %d notifier cleanup error: %d", i, -r);
 }
 assert (e >= 0);
-virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
 }
+
+/*
+ * The transaction expects the ioeventfds to be open when it
+ * commits. Do it now, before the cleanup loop.
+ */
+memory_region_transaction_commit();
+
+while (--n >= 0) {
+virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + n);
+}
+
 virtio_device_release_ioeventfd(vdev);
 fail:
 return r;
@@ -1553,6 +1573,12 @@ void vhost_dev_disable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
 int i, r;
 
+/*
+ * Batch all the host notifiers in a single transaction to avoid
+ * quadratic time complexity in address_space_update_ioeventfds().
+ */
+memory_region_transaction_begin();
+
 for (i = 0; i < hdev->nvqs; ++i) {
 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
  false);
@@ -1560,8 +1586,18 @@ void vhost_dev_disable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
 error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
 }
 assert (r >= 0);
+}
+
+/*
+ * The transaction expects the ioeventfds to be open when it
+ * commits. Do it now, before the cleanup loop.
+ */
+memory_region_transaction_commit();
+
+for (i = 0; i < hdev->nvqs; ++i) {
 virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
 }
+
 virtio_device_release_ioeventfd(vdev);
 }
 
-- 
2.23.0
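
For readers skimming the diff above: outside a transaction, every
virtio_bus_set_host_notifier() call ends up in
address_space_update_ioeventfds(), so enabling N notifiers one by one costs
O(N^2) total work, while a single begin/commit pair defers the rebuild to one
pass. A distilled sketch of the pattern (the helper name is illustrative and
the QEMU-internal headers "exec/memory.h" and "hw/virtio/virtio-bus.h" are
assumed):

    /* Sketch: register N host notifiers with one ioeventfd rebuild
     * instead of N, mirroring vhost_dev_enable_notifiers() above. */
    static int setup_notifiers_batched(VirtioBusState *bus, int vq_index, int nvqs)
    {
        int i, r = 0;

        memory_region_transaction_begin();
        for (i = 0; i < nvqs; i++) {
            r = virtio_bus_set_host_notifier(bus, vq_index + i, true);
            if (r < 0) {
                break;              /* caller unwinds after the commit */
            }
        }
        /* One commit: the address space topology is rebuilt once. */
        memory_region_transaction_commit();
        return r;
    }

Note the ordering constraint the patch documents in its comments: the cleanup
helper expects the ioeventfds to have been committed, so on failure the
transaction must be committed before virtio_bus_cleanup_host_notifier() runs.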




[PATCH v2 2/2] vdpa: commit all host notifier MRs in a single MR transaction

2022-12-06 Thread Longpeng(Mike)
From: Longpeng 

This allows the vhost-vdpa device to batch the setup of all its host
notifier MRs.

This significantly reduces the device starting time; e.g. the time spent
on setting up the host notifier MRs is reduced from 423ms to 32ms for a VM
with 64 vCPUs and 3 vhost-vDPA generic devices[1] (64vq per device).

[1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg921541.html

Signed-off-by: Longpeng 
---
 hw/virtio/vhost-vdpa.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 7468e44b87..eb233cf08a 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -547,9 +547,18 @@ static void vhost_vdpa_host_notifiers_uninit(struct 
vhost_dev *dev, int n)
 {
 int i;
 
+/*
+ * Pack all the changes to the memory regions in a single
+ * transaction to avoid multiple updates of the address space
+ * topology.
+ */
+memory_region_transaction_begin();
+
 for (i = dev->vq_index; i < dev->vq_index + n; i++) {
 vhost_vdpa_host_notifier_uninit(dev, i);
 }
+
+memory_region_transaction_commit();
 }
 
 static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
@@ -562,16 +571,25 @@ static void vhost_vdpa_host_notifiers_init(struct 
vhost_dev *dev)
 return;
 }
 
+/*
+ * Pack all the changes to the memory regions in a single
+ * transaction to avoid multiple updates of the address space
+ * topology.
+ */
+memory_region_transaction_begin();
+
 for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
 if (vhost_vdpa_host_notifier_init(dev, i)) {
 goto err;
 }
 }
 
+memory_region_transaction_commit();
 return;
 
 err:
 vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
+memory_region_transaction_commit();
 return;
 }
 
-- 
2.23.0




[PATCH v2 0/2] two optimizations to speed up the start time

2022-12-06 Thread Longpeng(Mike)
From: Longpeng 

Changes v2->v1:
 Patch-1:
  - remove vq_init_count [Jason]
 Patch-2:
  - new added. [Jason]

v1: https://www.mail-archive.com/qemu-devel@nongnu.org/msg922499.html

Longpeng (Mike) (2):
  vhost: configure all host notifiers in a single MR transaction
  vdpa: commit all host notifier MRs in a single MR transaction

 hw/virtio/vhost-vdpa.c | 18 ++
 hw/virtio/vhost.c  | 40 ++--
 2 files changed, 56 insertions(+), 2 deletions(-)

-- 
2.23.0




[PATCH v10 3/5] vdpa: add vdpa-dev-pci support

2022-12-05 Thread Longpeng(Mike)
From: Longpeng 

Supports vdpa-dev-pci, so we can use the device as follows:

-device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-X

Reviewed-by: Stefano Garzarella 
Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/meson.build|   1 +
 hw/virtio/vdpa-dev-pci.c | 102 +++
 2 files changed, 103 insertions(+)
 create mode 100644 hw/virtio/vdpa-dev-pci.c

diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index 54d6d29af7..559b80cb28 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -57,6 +57,7 @@ virtio_pci_ss.add(when: 'CONFIG_VIRTIO_SERIAL', if_true: 
files('virtio-serial-pc
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_PMEM', if_true: 
files('virtio-pmem-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: 
files('virtio-iommu-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: 
files('virtio-mem-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: 
files('vdpa-dev-pci.c'))
 
 virtio_ss.add_all(when: 'CONFIG_VIRTIO_PCI', if_true: virtio_pci_ss)
 
diff --git a/hw/virtio/vdpa-dev-pci.c b/hw/virtio/vdpa-dev-pci.c
new file mode 100644
index 00..5446e6b393
--- /dev/null
+++ b/hw/virtio/vdpa-dev-pci.c
@@ -0,0 +1,102 @@
+/*
+ * Vhost Vdpa Device PCI Bindings
+ *
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022. All Rights Reserved.
+ *
+ * Authors:
+ *   Longpeng 
+ *
+ * Largely based on the "vhost-user-blk-pci.c" and "vhost-user-blk.c"
+ * implemented by:
+ *   Changpeng Liu 
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include 
+#include 
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/vdpa-dev.h"
+#include "hw/pci/pci.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qom/object.h"
+
+
+typedef struct VhostVdpaDevicePCI VhostVdpaDevicePCI;
+
+#define TYPE_VHOST_VDPA_DEVICE_PCI "vhost-vdpa-device-pci-base"
+DECLARE_INSTANCE_CHECKER(VhostVdpaDevicePCI, VHOST_VDPA_DEVICE_PCI,
+ TYPE_VHOST_VDPA_DEVICE_PCI)
+
+struct VhostVdpaDevicePCI {
+VirtIOPCIProxy parent_obj;
+VhostVdpaDevice vdev;
+};
+
+static void vhost_vdpa_device_pci_instance_init(Object *obj)
+{
+VhostVdpaDevicePCI *dev = VHOST_VDPA_DEVICE_PCI(obj);
+
+virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+TYPE_VHOST_VDPA_DEVICE);
+object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+  "bootindex");
+}
+
+static Property vhost_vdpa_device_pci_properties[] = {
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static int vhost_vdpa_device_pci_post_init(VhostVdpaDevice *v, Error **errp)
+{
+VhostVdpaDevicePCI *dev = container_of(v, VhostVdpaDevicePCI, vdev);
+VirtIOPCIProxy *vpci_dev = &dev->parent_obj;
+
+vpci_dev->class_code = virtio_pci_get_class_id(v->vdev_id);
+vpci_dev->trans_devid = virtio_pci_get_trans_devid(v->vdev_id);
+/* one for config vector */
+vpci_dev->nvectors = v->num_queues + 1;
+
+return 0;
+}
+
+static void
+vhost_vdpa_device_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+VhostVdpaDevicePCI *dev = VHOST_VDPA_DEVICE_PCI(vpci_dev);
+
+dev->vdev.post_init = vhost_vdpa_device_pci_post_init;
+qdev_realize(DEVICE(&dev->vdev), BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_vdpa_device_pci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+
+set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+device_class_set_props(dc, vhost_vdpa_device_pci_properties);
+k->realize = vhost_vdpa_device_pci_realize;
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_vdpa_device_pci_info = {
+.base_name   = TYPE_VHOST_VDPA_DEVICE_PCI,
+.generic_name= "vhost-vdpa-device-pci",
+.transitional_name   = "vhost-vdpa-device-pci-transitional",
+.non_transitional_name   = "vhost-vdpa-device-pci-non-transitional",
+.instance_size  = sizeof(VhostVdpaDevicePCI),
+.instance_init  = vhost_vdpa_device_pci_instance_init,
+.class_init = vhost_vdpa_device_pci_class_init,
+};
+
+static void vhost_vdpa_device_pci_register(void)
+{
+virtio_pci_types_register(&vhost_vdpa_device_pci_info);
+}
+
+type_init(vhost_vdpa_device_pci_register);
-- 
2.23.0




[PATCH v10 4/5] vdpa-dev: mark the device as unmigratable

2022-12-05 Thread Longpeng(Mike)
From: Longpeng 

The generic vDPA device doesn't support migration currently, so
mark it as unmigratable temporarily.

Reviewed-by: Stefano Garzarella 
Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/vdpa-dev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
index dbc4f8001d..db6ba61152 100644
--- a/hw/virtio/vdpa-dev.c
+++ b/hw/virtio/vdpa-dev.c
@@ -327,6 +327,7 @@ static Property vhost_vdpa_device_properties[] = {
 
 static const VMStateDescription vmstate_vhost_vdpa_device = {
 .name = "vhost-vdpa-device",
+.unmigratable = 1,
 .minimum_version_id = 1,
 .version_id = 1,
 .fields = (VMStateField[]) {
-- 
2.23.0




[PATCH v10 5/5] docs: Add generic vhost-vdpa device documentation

2022-12-05 Thread Longpeng(Mike)
From: Longpeng 

Signed-off-by: Longpeng 
---
 .../devices/vhost-vdpa-generic-device.rst | 66 +++
 1 file changed, 66 insertions(+)
 create mode 100644 docs/system/devices/vhost-vdpa-generic-device.rst

diff --git a/docs/system/devices/vhost-vdpa-generic-device.rst 
b/docs/system/devices/vhost-vdpa-generic-device.rst
new file mode 100644
index 00..7d13359ea1
--- /dev/null
+++ b/docs/system/devices/vhost-vdpa-generic-device.rst
@@ -0,0 +1,66 @@
+
+=
+vhost-vDPA generic device
+=
+
+This document explains the usage of the vhost-vDPA generic device.
+
+Description
+---
+
+A vDPA (virtio data path acceleration) device is a device that uses a
+datapath which complies with the virtio specification, combined with a
+vendor-specific control path.
+
+QEMU provides two types of vhost-vDPA devices to enable the vDPA device, one
+is type sensitive which means QEMU needs to know the actual device type
+(e.g. net, blk, scsi) and another is called "vhost-vDPA generic device" which
+is type insensitive.
+
+The vhost-vDPA generic device builds on the vhost-vdpa subsystem and virtio
+subsystem. It is quite small, but it can support any type of virtio device.
+
+Examples
+
+
+1. Please make sure the modules listed below are installed:
+vhost.ko
+vhost_iotlb.ko
+vdpa.ko
+vhost_vdpa.ko
+
+
+2. Prepare the vhost-vDPA backends, here is an example using vdpa_sim_blk
+   device:
+
+::
+  host# modprobe vdpa_sim_blk
+  host# vdpa dev add mgmtdev vdpasim_blk name blk0
+  (...you can see the vhost-vDPA device under /dev directory now...)
+  host# ls -l /dev/vhost-vdpa-*
+  crw------- 1 root root 236, 0 Nov  2 00:49 /dev/vhost-vdpa-0
+
+Note:
+If you're using real HW devices, some vendor-specific steps are needed to
+provision the vDPA device, such as installing the vendor-specific vDPA
+driver and binding the device to the driver.
+
+
+3. Start the virtual machine:
+
+Start QEMU with virtio-mmio bus:
+
+::
+  host# qemu-system  \
+  -M microvm -m 512 -smp 2 -kernel ... -initrd ...   \
+  -device vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-0   \
+  ...
+
+
+Start QEMU with virtio-pci bus:
+
+::
+  host# qemu-system  \
+  -M pc -m 512 -smp 2\
+  -device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-0   \
+  ...
-- 
2.23.0




[PATCH v10 2/5] vdpa: add vdpa-dev support

2022-12-05 Thread Longpeng(Mike)
From: Longpeng 

Supports vdpa-dev, so we can use the device directly:

-M microvm -m 512m -smp 2 -kernel ... -initrd ... -device \
vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-x

Reviewed-by: Stefano Garzarella 
Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/Kconfig|   5 +
 hw/virtio/meson.build|   1 +
 hw/virtio/vdpa-dev.c | 376 +++
 include/hw/virtio/vdpa-dev.h |  43 
 4 files changed, 425 insertions(+)
 create mode 100644 hw/virtio/vdpa-dev.c
 create mode 100644 include/hw/virtio/vdpa-dev.h

diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig
index cbfd8c7173..89e9e426d8 100644
--- a/hw/virtio/Kconfig
+++ b/hw/virtio/Kconfig
@@ -85,3 +85,8 @@ config VHOST_USER_GPIO
 bool
 default y
 depends on VIRTIO && VHOST_USER
+
+config VHOST_VDPA_DEV
+bool
+default y
+depends on VIRTIO && VHOST_VDPA && LINUX
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index dfed1e7af5..54d6d29af7 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -31,6 +31,7 @@ virtio_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: 
files('vhost-user-i2c.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: 
files('vhost-user-rng.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER_GPIO', if_true: 
files('vhost-user-gpio.c'))
 virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_GPIO'], if_true: 
files('vhost-user-gpio-pci.c'))
+virtio_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: files('vdpa-dev.c'))
 
 virtio_pci_ss = ss.source_set()
 virtio_pci_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: 
files('vhost-vsock-pci.c'))
diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
new file mode 100644
index 00..dbc4f8001d
--- /dev/null
+++ b/hw/virtio/vdpa-dev.c
@@ -0,0 +1,376 @@
+/*
+ * Vhost Vdpa Device
+ *
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022. All Rights Reserved.
+ *
+ * Authors:
+ *   Longpeng 
+ *
+ * Largely based on the "vhost-user-blk-pci.c" and "vhost-user-blk.c"
+ * implemented by:
+ *   Changpeng Liu 
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include 
+#include 
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/cutils.h"
+#include "hw/qdev-core.h"
+#include "hw/qdev-properties.h"
+#include "hw/qdev-properties-system.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "hw/virtio/vdpa-dev.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/runstate.h"
+
+static void
+vhost_vdpa_device_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+/* Nothing to do */
+}
+
+static uint32_t
+vhost_vdpa_device_get_u32(int fd, unsigned long int cmd, Error **errp)
+{
+uint32_t val = (uint32_t)-1;
+
+if (ioctl(fd, cmd, &val) < 0) {
+error_setg(errp, "vhost-vdpa-device: cmd 0x%lx failed: %s",
+   cmd, strerror(errno));
+}
+
+return val;
+}
+
+static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+VhostVdpaDevice *v = VHOST_VDPA_DEVICE(vdev);
+uint16_t max_queue_size;
+struct vhost_virtqueue *vqs;
+int i, ret;
+
+if (!v->vhostdev) {
+error_setg(errp, "vhost-vdpa-device: vhostdev are missing");
+return;
+}
+
+v->vhostfd = qemu_open(v->vhostdev, O_RDWR, errp);
+if (*errp) {
+return;
+}
+v->vdpa.device_fd = v->vhostfd;
+
+v->vdev_id = vhost_vdpa_device_get_u32(v->vhostfd,
+   VHOST_VDPA_GET_DEVICE_ID, errp);
+if (*errp) {
+goto out;
+}
+
+max_queue_size = vhost_vdpa_device_get_u32(v->vhostfd,
+   VHOST_VDPA_GET_VRING_NUM, errp);
+if (*errp) {
+goto out;
+}
+
+if (v->queue_size > max_queue_size) {
+error_setg(errp, "vhost-vdpa-device: invalid queue_size: %u (max:%u)",
+   v->queue_size, max_queue_size);
+goto out;
+} else if (!v->queue_size) {
+v->queue_size = max_queue_size;
+}
+
+v->num_queues = vhost_vdpa_device_get_u32(v->vhostfd,
+  VHOST_VDPA_GET_VQS_COUNT, errp);
+if (*errp) {
+goto out;
+}
+
+if (!v->num_queues || v->num_queues > VIRTIO_QUEUE_MAX) {
+error_setg(errp, "invalid number of virtqueues: %u (max:%u)",
+   v->num_queues, VIRTIO_QUEUE_MAX);
+goto out;
+}
+
+v->dev.nvqs = v->num_queues;
+vqs = g_new0(struct vhost_virtqueue, v->dev.nvqs);
+v->dev.vqs = vqs;
+v->dev.vq_index = 0;
+v->dev.vq_index_end = v->dev.nvqs;
+v->dev.backend_features = 0;
+v->started = false;
+
+ret = vhost_dev_init(&v->dev, &v->vdpa, VHOST_BACKEND_TYPE_VDPA, 0, NULL);
+if (ret < 0) {
+error_setg(errp, 

[PATCH v10 1/5] virtio: get class_id and pci device id by the virtio id

2022-12-05 Thread Longpeng(Mike)
From: Longpeng 

Add helpers to get the "Transitional PCI Device ID" and "class_id"
of the device specified by the "Virtio Device ID".

These helpers will be used to build the generic vDPA device later.

Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/virtio-pci.c | 88 ++
 include/hw/virtio/virtio-pci.h |  5 ++
 2 files changed, 93 insertions(+)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index a1c9dfa7bb..a602f670ca 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -19,6 +19,7 @@
 
 #include "exec/memop.h"
 #include "standard-headers/linux/virtio_pci.h"
+#include "standard-headers/linux/virtio_ids.h"
 #include "hw/boards.h"
 #include "hw/virtio/virtio.h"
 #include "migration/qemu-file-types.h"
@@ -224,6 +225,90 @@ static int virtio_pci_load_queue(DeviceState *d, int n, 
QEMUFile *f)
 return 0;
 }
 
+typedef struct VirtIOPCIIDInfo {
+/* virtio id */
+uint16_t vdev_id;
+/* pci device id for the transitional device */
+uint16_t trans_devid;
+uint16_t class_id;
+} VirtIOPCIIDInfo;
+
+static const VirtIOPCIIDInfo virtio_pci_id_info[] = {
+{
+.vdev_id = VIRTIO_ID_CRYPTO,
+.class_id = PCI_CLASS_OTHERS,
+}, {
+.vdev_id = VIRTIO_ID_FS,
+.class_id = PCI_CLASS_STORAGE_OTHER,
+}, {
+.vdev_id = VIRTIO_ID_NET,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_NET,
+.class_id = PCI_CLASS_NETWORK_ETHERNET,
+}, {
+.vdev_id = VIRTIO_ID_BLOCK,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_BLOCK,
+.class_id = PCI_CLASS_STORAGE_SCSI,
+}, {
+.vdev_id = VIRTIO_ID_CONSOLE,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_CONSOLE,
+.class_id = PCI_CLASS_COMMUNICATION_OTHER,
+}, {
+.vdev_id = VIRTIO_ID_SCSI,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_SCSI,
+.class_id = PCI_CLASS_STORAGE_SCSI
+}, {
+.vdev_id = VIRTIO_ID_9P,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_9P,
+.class_id = PCI_BASE_CLASS_NETWORK,
+}, {
+.vdev_id = VIRTIO_ID_BALLOON,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_BALLOON,
+.class_id = PCI_CLASS_OTHERS,
+}, {
+.vdev_id = VIRTIO_ID_RNG,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_RNG,
+.class_id = PCI_CLASS_OTHERS,
+},
+};
+
+static const VirtIOPCIIDInfo *virtio_pci_get_id_info(uint16_t vdev_id)
+{
+const VirtIOPCIIDInfo *info = NULL;
+int i;
+
+for (i = 0; i < ARRAY_SIZE(virtio_pci_id_info); i++) {
+if (virtio_pci_id_info[i].vdev_id == vdev_id) {
+info = &virtio_pci_id_info[i];
+break;
+}
+}
+
+if (!info) {
+/* The device id is invalid or not added to the id_info yet. */
+error_report("Invalid virtio device(id %u)", vdev_id);
+abort();
+}
+
+return info;
+}
+
+/*
+ * Get the Transitional Device ID for the specific device, return
+ * zero if the device is non-transitional.
+ */
+uint16_t virtio_pci_get_trans_devid(uint16_t device_id)
+{
+return virtio_pci_get_id_info(device_id)->trans_devid;
+}
+
+/*
+ * Get the Class ID for the specific device.
+ */
+uint16_t virtio_pci_get_class_id(uint16_t device_id)
+{
+return virtio_pci_get_id_info(device_id)->class_id;
+}
+
 static bool virtio_pci_ioeventfd_enabled(DeviceState *d)
 {
 VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
@@ -1729,6 +1814,9 @@ static void virtio_pci_device_plugged(DeviceState *d, 
Error **errp)
  * is set to PCI_SUBVENDOR_ID_REDHAT_QUMRANET by default.
  */
 pci_set_word(config + PCI_SUBSYSTEM_ID, virtio_bus_get_vdev_id(bus));
+if (proxy->trans_devid) {
+pci_config_set_device_id(config, proxy->trans_devid);
+}
 } else {
 /* pure virtio-1.0 */
 pci_set_word(config + PCI_VENDOR_ID,
diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h
index 938799e8f6..24fba1604b 100644
--- a/include/hw/virtio/virtio-pci.h
+++ b/include/hw/virtio/virtio-pci.h
@@ -151,6 +151,8 @@ struct VirtIOPCIProxy {
 bool disable_modern;
 bool ignore_backend_features;
 OnOffAuto disable_legacy;
+/* Transitional device id */
+uint16_t trans_devid;
 uint32_t class_code;
 uint32_t nvectors;
 uint32_t dfselect;
@@ -184,6 +186,9 @@ static inline void virtio_pci_disable_modern(VirtIOPCIProxy 
*proxy)
 proxy->disable_modern = true;
 }
 
+uint16_t virtio_pci_get_trans_devid(uint16_t device_id);
+uint16_t virtio_pci_get_class_id(uint16_t device_id);
+
 /*
  * virtio-input-pci: This extends VirtioPCIProxy.
  */
-- 
2.23.0
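
As a pointer for reviewers, the intended consumer of these two helpers is the
generic PCI proxy added in patch 3/5: once the backend's virtio device ID has
been probed, the proxy copies the derived PCI identifiers into itself. A
minimal sketch of that usage (the wrapper function name is illustrative):

    /* Sketch: map a probed virtio device ID onto the PCI proxy fields.
     * Mirrors vhost_vdpa_device_pci_post_init() from patch 3/5. */
    static void apply_virtio_pci_ids(VirtIOPCIProxy *vpci_dev, uint16_t vdev_id)
    {
        /* PCI class code the guest will see (network, storage, ...) */
        vpci_dev->class_code = virtio_pci_get_class_id(vdev_id);
        /* Non-zero only for device types that have a transitional PCI ID;
         * virtio_pci_device_plugged() then overrides the config space with it. */
        vpci_dev->trans_devid = virtio_pci_get_trans_devid(vdev_id);
    }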




[PATCH v10 0/5] add generic vDPA device support

2022-12-05 Thread Longpeng(Mike)
From: Longpeng 

Hi guys,

With the generic vDPA device, QEMU won't need to touch the device
types any more, just like vfio.

We can use the generic vDPA device as follow:
  -device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-X
  Or
  -M microvm -m 512m -smp 2 -kernel ... -initrd ... -device \
  vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-x

Changes v10 -> v9:
- optimize the doc [Jason]

Changes v9 -> v8:
- rename vhost-vdpa-device.rst to vhost-vdpa-generic-device.rst [Jason, 
Stefano]
- emphasize the vhost-vDPA generic device in doc [Jason]

Changes v8 -> v7:
- add migration blocker. [Michael]

Changes v6 -> v7:
(v6: https://mail.gnu.org/archive/html/qemu-devel/2022-05/msg02821.html)
- rebase. [Jason]
- add documentation . [Stefan]

Changes v5 -> v6:
  Patch 2:
- Turn to the original approach in the RFC to initialize the
  virtio_pci_id_info array. [Michael]
  https://lore.kernel.org/all/20220105005900.860-2-longpe...@huawei.com/
  Patch 3:
- Fix logical error of exception handler around the post_init.
  [Stefano]
- Fix some coding style warnings. [Stefano]
  Patch 4:
- Fix some coding style warnings. [Stefano]

Changes v4 -> v5:
  Patch 3:
- remove vhostfd [Jason]
- support virtio-mmio [Jason]

Changes v3 -> v4:
  v3: https://www.mail-archive.com/qemu-devel@nongnu.org/msg877015.html
  - reorganize the series [Stefano]
  - fix some typos [Stefano]
  - fix logical error in vhost_vdpa_device_realize [Stefano]

Changes v2 -> v3
  Patch 4 & 5:
- only call vdpa ioctls in vdpa-dev.c [Stefano, Longpeng]
- s/VQS_NUM/VQS_COUNT  [Stefano]
- check both vdpa_dev_fd and vdpa_dev [Stefano]
  Patch 6:
- move all steps into vhost_vdpa_device_unrealize. [Stefano]

Changes RFC -> v2
  Patch 1:
- rename 'pdev_id' to 'trans_devid'  [Michael]
- only use transitional device id for the devices
  listed in the spec  [Michael]
- use macros to make the id_info table clearer  [Longpeng]
- add some modern devices in the id_info table  [Longpeng]
  Patch 2:
- remove the GET_VECTORS_NUM command  [Jason]
  Patch 4:
- expose vdpa_dev_fd as a QOM property  [Stefan]
- introduce vhost_vdpa_device_get_u32 as a common
  function to make the code clearer  [Stefan]
- fix the misleading description of 'dc->desc'  [Stefano]
  Patch 5:
- check returned number of virtqueues  [Stefan]
  Patch 6:
- init s->num_queues  [Stefano]
- free s->dev.vqs  [Stefano]

Longpeng (Mike) (5):
  virtio: get class_id and pci device id by the virtio id
  vdpa: add vdpa-dev support
  vdpa: add vdpa-dev-pci support
  vdpa-dev: mark the device as unmigratable
  docs: Add generic vhost-vdpa device documentation

 .../devices/vhost-vdpa-generic-device.rst |  66 +++
 hw/virtio/Kconfig |   5 +
 hw/virtio/meson.build |   2 +
 hw/virtio/vdpa-dev-pci.c  | 102 +
 hw/virtio/vdpa-dev.c  | 377 ++
 hw/virtio/virtio-pci.c|  88 
 include/hw/virtio/vdpa-dev.h  |  43 ++
 include/hw/virtio/virtio-pci.h|   5 +
 8 files changed, 688 insertions(+)
 create mode 100644 docs/system/devices/vhost-vdpa-generic-device.rst
 create mode 100644 hw/virtio/vdpa-dev-pci.c
 create mode 100644 hw/virtio/vdpa-dev.c
 create mode 100644 include/hw/virtio/vdpa-dev.h

-- 
2.23.0




[PATCH] vhost: configure all host notifiers in a single MR transaction

2022-11-18 Thread Longpeng(Mike)
From: Longpeng 

This allows the vhost device to batch the setup of all its host notifiers.
This significantly reduces the device starting time; e.g. the vhost-vDPA
generic device [1] start time is reduced from 376ms to 9.1ms for a VM with
64 vCPUs and 3 vDPA devices (64vq per device).

[1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg921541.html

Signed-off-by: Longpeng 
---
 hw/virtio/vhost.c | 39 ++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index d1c4c20b8c..bf82d9b176 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1507,6 +1507,7 @@ void vhost_dev_cleanup(struct vhost_dev *hdev)
 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
 {
 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+int vq_init_count = 0;
 int i, r, e;
 
 /* We will pass the notifiers to the kernel, make sure that QEMU
@@ -1518,6 +1519,12 @@ int vhost_dev_enable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
 goto fail;
 }
 
+/*
+ * Batch all the host notifiers in a single transaction to avoid
+ * quadratic time complexity in address_space_update_ioeventfds().
+ */
+memory_region_transaction_begin();
+
 for (i = 0; i < hdev->nvqs; ++i) {
 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
  true);
@@ -1525,19 +1532,33 @@ int vhost_dev_enable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
 error_report("vhost VQ %d notifier binding failed: %d", i, -r);
 goto fail_vq;
 }
+
+vq_init_count++;
 }
 
+memory_region_transaction_commit();
+
 return 0;
 fail_vq:
-while (--i >= 0) {
+for (i = 0; i < vq_init_count; i++) {
 e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
  false);
 if (e < 0) {
 error_report("vhost VQ %d notifier cleanup error: %d", i, -r);
 }
 assert (e >= 0);
+}
+
+/*
+ * The transaction expects the ioeventfds to be open when it
+ * commits. Do it now, before the cleanup loop.
+ */
+memory_region_transaction_commit();
+
+for (i = 0; i < vq_init_count; i++) {
 virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
 }
+
 virtio_device_release_ioeventfd(vdev);
 fail:
 return r;
@@ -1553,6 +1574,12 @@ void vhost_dev_disable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
 int i, r;
 
+/*
+ * Batch all the host notifiers in a single transaction to avoid
+ * quadratic time complexity in address_space_update_ioeventfds().
+ */
+memory_region_transaction_begin();
+
 for (i = 0; i < hdev->nvqs; ++i) {
 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
  false);
@@ -1560,8 +1587,18 @@ void vhost_dev_disable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
 error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
 }
 assert (r >= 0);
+}
+
+/*
+ * The transaction expects the ioeventfds to be open when it
+ * commits. Do it now, before the cleanup loop.
+ */
+memory_region_transaction_commit();
+
+for (i = 0; i < hdev->nvqs; ++i) {
 virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
 }
+
 virtio_device_release_ioeventfd(vdev);
 }
 
-- 
2.23.0




[PATCH v9 4/5] vdpa-dev: mark the device as unmigratable

2022-11-12 Thread Longpeng(Mike)
From: Longpeng 

The generic vDPA device doesn't support migration currently, so
mark it as unmigratable temporarily.

Reviewed-by: Stefano Garzarella 
Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/vdpa-dev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
index 2885d06cbe..62d83d3423 100644
--- a/hw/virtio/vdpa-dev.c
+++ b/hw/virtio/vdpa-dev.c
@@ -327,6 +327,7 @@ static Property vhost_vdpa_device_properties[] = {
 
 static const VMStateDescription vmstate_vhost_vdpa_device = {
 .name = "vhost-vdpa-device",
+.unmigratable = 1,
 .minimum_version_id = 1,
 .version_id = 1,
 .fields = (VMStateField[]) {
-- 
2.23.0




[PATCH v9 0/5] add generic vDPA device support

2022-11-12 Thread Longpeng(Mike)
From: Longpeng 

Hi guys,

With the generic vDPA device, QEMU won't need to touch the device
types any more, just like vfio.

We can use the generic vDPA device as follow:
  -device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-X
  Or
  -M microvm -m 512m -smp 2 -kernel ... -initrd ... -device \
  vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-x

Changes v9 -> v8:
- rename vhost-vdpa-device.rst to vhost-vdpa-generic-device.rst [Jason, 
Stefano]
- emphasize the vhost-vDPA generic device in doc [Jason]

Changes v8 -> v7:
- add migration blocker. [Michael]

Changes v6 -> v7:
(v6: https://mail.gnu.org/archive/html/qemu-devel/2022-05/msg02821.html)
- rebase. [Jason]
- add documentation . [Stefan]

Changes v5 -> v6:
  Patch 2:
- Turn to the original approach in the RFC to initialize the
  virtio_pci_id_info array. [Michael]
  https://lore.kernel.org/all/20220105005900.860-2-longpe...@huawei.com/
  Patch 3:
- Fix logical error of exception handler around the post_init.
  [Stefano]
- Fix some coding style warnings. [Stefano]
  Patch 4:
- Fix some coding style warnings. [Stefano]

Changes v4 -> v5:
  Patch 3:
- remove vhostfd [Jason]
- support virtio-mmio [Jason]

Changes v3 -> v4:
  v3: https://www.mail-archive.com/qemu-devel@nongnu.org/msg877015.html
  - reorganize the series [Stefano]
  - fix some typos [Stefano]
  - fix logical error in vhost_vdpa_device_realize [Stefano]

Changes v2 -> v3
  Patch 4 & 5:
- only call vdpa ioctls in vdpa-dev.c [Stefano, Longpeng]
- s/VQS_NUM/VQS_COUNT  [Stefano]
- check both vdpa_dev_fd and vdpa_dev [Stefano]
  Patch 6:
- move all steps into vhost_vdpa_device_unrealize. [Stefano]

Changes RFC -> v2
  Patch 1:
- rename 'pdev_id' to 'trans_devid'  [Michael]
- only use transitional device id for the devices
  listed in the spec  [Michael]
- use macros to make the id_info table clearer  [Longpeng]
- add some modern devices in the id_info table  [Longpeng]
  Patch 2:
- remove the GET_VECTORS_NUM command  [Jason]
  Patch 4:
- expose vdpa_dev_fd as a QOM property  [Stefan]
- introduce vhost_vdpa_device_get_u32 as a common
  function to make the code clearer  [Stefan]
- fix the misleading description of 'dc->desc'  [Stefano]
  Patch 5:
- check returned number of virtqueues  [Stefan]
  Patch 6:
- init s->num_queues  [Stefano]
- free s->dev.vqs  [Stefano]

Longpeng (Mike) (5):
  virtio: get class_id and pci device id by the virtio id
  vdpa: add vdpa-dev support
  vdpa: add vdpa-dev-pci support
  vdpa-dev: mark the device as unmigratable
  docs: Add generic vhost-vdpa device documentation

 .../devices/vhost-vdpa-generic-device.rst |  46 +++
 hw/virtio/Kconfig |   5 +
 hw/virtio/meson.build |   2 +
 hw/virtio/vdpa-dev-pci.c  | 102 +
 hw/virtio/vdpa-dev.c  | 377 ++
 hw/virtio/virtio-pci.c|  88 
 include/hw/virtio/vdpa-dev.h  |  43 ++
 include/hw/virtio/virtio-pci.h|   5 +
 8 files changed, 668 insertions(+)
 create mode 100644 docs/system/devices/vhost-vdpa-generic-device.rst
 create mode 100644 hw/virtio/vdpa-dev-pci.c
 create mode 100644 hw/virtio/vdpa-dev.c
 create mode 100644 include/hw/virtio/vdpa-dev.h

-- 
2.23.0




[PATCH v9 1/5] virtio: get class_id and pci device id by the virtio id

2022-11-12 Thread Longpeng(Mike)
From: Longpeng 

Add helpers to get the "Transitional PCI Device ID" and "class_id"
of the device specified by the "Virtio Device ID".

These helpers will be used to build the generic vDPA device later.

Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/virtio-pci.c | 88 ++
 include/hw/virtio/virtio-pci.h |  5 ++
 2 files changed, 93 insertions(+)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index a1c9dfa7bb..a602f670ca 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -19,6 +19,7 @@
 
 #include "exec/memop.h"
 #include "standard-headers/linux/virtio_pci.h"
+#include "standard-headers/linux/virtio_ids.h"
 #include "hw/boards.h"
 #include "hw/virtio/virtio.h"
 #include "migration/qemu-file-types.h"
@@ -224,6 +225,90 @@ static int virtio_pci_load_queue(DeviceState *d, int n, 
QEMUFile *f)
 return 0;
 }
 
+typedef struct VirtIOPCIIDInfo {
+/* virtio id */
+uint16_t vdev_id;
+/* pci device id for the transitional device */
+uint16_t trans_devid;
+uint16_t class_id;
+} VirtIOPCIIDInfo;
+
+static const VirtIOPCIIDInfo virtio_pci_id_info[] = {
+{
+.vdev_id = VIRTIO_ID_CRYPTO,
+.class_id = PCI_CLASS_OTHERS,
+}, {
+.vdev_id = VIRTIO_ID_FS,
+.class_id = PCI_CLASS_STORAGE_OTHER,
+}, {
+.vdev_id = VIRTIO_ID_NET,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_NET,
+.class_id = PCI_CLASS_NETWORK_ETHERNET,
+}, {
+.vdev_id = VIRTIO_ID_BLOCK,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_BLOCK,
+.class_id = PCI_CLASS_STORAGE_SCSI,
+}, {
+.vdev_id = VIRTIO_ID_CONSOLE,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_CONSOLE,
+.class_id = PCI_CLASS_COMMUNICATION_OTHER,
+}, {
+.vdev_id = VIRTIO_ID_SCSI,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_SCSI,
+.class_id = PCI_CLASS_STORAGE_SCSI
+}, {
+.vdev_id = VIRTIO_ID_9P,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_9P,
+.class_id = PCI_BASE_CLASS_NETWORK,
+}, {
+.vdev_id = VIRTIO_ID_BALLOON,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_BALLOON,
+.class_id = PCI_CLASS_OTHERS,
+}, {
+.vdev_id = VIRTIO_ID_RNG,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_RNG,
+.class_id = PCI_CLASS_OTHERS,
+},
+};
+
+static const VirtIOPCIIDInfo *virtio_pci_get_id_info(uint16_t vdev_id)
+{
+const VirtIOPCIIDInfo *info = NULL;
+int i;
+
+for (i = 0; i < ARRAY_SIZE(virtio_pci_id_info); i++) {
+if (virtio_pci_id_info[i].vdev_id == vdev_id) {
+info = &virtio_pci_id_info[i];
+break;
+}
+}
+
+if (!info) {
+/* The device id is invalid or not added to the id_info yet. */
+error_report("Invalid virtio device(id %u)", vdev_id);
+abort();
+}
+
+return info;
+}
+
+/*
+ * Get the Transitional Device ID for the specific device, return
+ * zero if the device is non-transitional.
+ */
+uint16_t virtio_pci_get_trans_devid(uint16_t device_id)
+{
+return virtio_pci_get_id_info(device_id)->trans_devid;
+}
+
+/*
+ * Get the Class ID for the specific device.
+ */
+uint16_t virtio_pci_get_class_id(uint16_t device_id)
+{
+return virtio_pci_get_id_info(device_id)->class_id;
+}
+
 static bool virtio_pci_ioeventfd_enabled(DeviceState *d)
 {
 VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
@@ -1729,6 +1814,9 @@ static void virtio_pci_device_plugged(DeviceState *d, 
Error **errp)
  * is set to PCI_SUBVENDOR_ID_REDHAT_QUMRANET by default.
  */
 pci_set_word(config + PCI_SUBSYSTEM_ID, virtio_bus_get_vdev_id(bus));
+if (proxy->trans_devid) {
+pci_config_set_device_id(config, proxy->trans_devid);
+}
 } else {
 /* pure virtio-1.0 */
 pci_set_word(config + PCI_VENDOR_ID,
diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h
index 938799e8f6..24fba1604b 100644
--- a/include/hw/virtio/virtio-pci.h
+++ b/include/hw/virtio/virtio-pci.h
@@ -151,6 +151,8 @@ struct VirtIOPCIProxy {
 bool disable_modern;
 bool ignore_backend_features;
 OnOffAuto disable_legacy;
+/* Transitional device id */
+uint16_t trans_devid;
 uint32_t class_code;
 uint32_t nvectors;
 uint32_t dfselect;
@@ -184,6 +186,9 @@ static inline void virtio_pci_disable_modern(VirtIOPCIProxy 
*proxy)
 proxy->disable_modern = true;
 }
 
+uint16_t virtio_pci_get_trans_devid(uint16_t device_id);
+uint16_t virtio_pci_get_class_id(uint16_t device_id);
+
 /*
  * virtio-input-pci: This extends VirtioPCIProxy.
  */
-- 
2.23.0




[PATCH v9 3/5] vdpa: add vdpa-dev-pci support

2022-11-12 Thread Longpeng(Mike)
From: Longpeng 

Supports vdpa-dev-pci, so we can use the device as follows:

-device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-X

Reviewed-by: Stefano Garzarella 
Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/meson.build|   1 +
 hw/virtio/vdpa-dev-pci.c | 102 +++
 2 files changed, 103 insertions(+)
 create mode 100644 hw/virtio/vdpa-dev-pci.c

diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index 54d6d29af7..559b80cb28 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -57,6 +57,7 @@ virtio_pci_ss.add(when: 'CONFIG_VIRTIO_SERIAL', if_true: 
files('virtio-serial-pc
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_PMEM', if_true: 
files('virtio-pmem-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: 
files('virtio-iommu-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: 
files('virtio-mem-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: 
files('vdpa-dev-pci.c'))
 
 virtio_ss.add_all(when: 'CONFIG_VIRTIO_PCI', if_true: virtio_pci_ss)
 
diff --git a/hw/virtio/vdpa-dev-pci.c b/hw/virtio/vdpa-dev-pci.c
new file mode 100644
index 00..5446e6b393
--- /dev/null
+++ b/hw/virtio/vdpa-dev-pci.c
@@ -0,0 +1,102 @@
+/*
+ * Vhost Vdpa Device PCI Bindings
+ *
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022. All Rights Reserved.
+ *
+ * Authors:
+ *   Longpeng 
+ *
+ * Largely based on the "vhost-user-blk-pci.c" and "vhost-user-blk.c"
+ * implemented by:
+ *   Changpeng Liu 
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include 
+#include 
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/vdpa-dev.h"
+#include "hw/pci/pci.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qom/object.h"
+
+
+typedef struct VhostVdpaDevicePCI VhostVdpaDevicePCI;
+
+#define TYPE_VHOST_VDPA_DEVICE_PCI "vhost-vdpa-device-pci-base"
+DECLARE_INSTANCE_CHECKER(VhostVdpaDevicePCI, VHOST_VDPA_DEVICE_PCI,
+ TYPE_VHOST_VDPA_DEVICE_PCI)
+
+struct VhostVdpaDevicePCI {
+VirtIOPCIProxy parent_obj;
+VhostVdpaDevice vdev;
+};
+
+static void vhost_vdpa_device_pci_instance_init(Object *obj)
+{
+VhostVdpaDevicePCI *dev = VHOST_VDPA_DEVICE_PCI(obj);
+
+virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+TYPE_VHOST_VDPA_DEVICE);
+object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+  "bootindex");
+}
+
+static Property vhost_vdpa_device_pci_properties[] = {
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static int vhost_vdpa_device_pci_post_init(VhostVdpaDevice *v, Error **errp)
+{
+VhostVdpaDevicePCI *dev = container_of(v, VhostVdpaDevicePCI, vdev);
+VirtIOPCIProxy *vpci_dev = &dev->parent_obj;
+
+vpci_dev->class_code = virtio_pci_get_class_id(v->vdev_id);
+vpci_dev->trans_devid = virtio_pci_get_trans_devid(v->vdev_id);
+/* one for config vector */
+vpci_dev->nvectors = v->num_queues + 1;
+
+return 0;
+}
+
+static void
+vhost_vdpa_device_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+VhostVdpaDevicePCI *dev = VHOST_VDPA_DEVICE_PCI(vpci_dev);
+
+dev->vdev.post_init = vhost_vdpa_device_pci_post_init;
+qdev_realize(DEVICE(&dev->vdev), BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_vdpa_device_pci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+
+set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+device_class_set_props(dc, vhost_vdpa_device_pci_properties);
+k->realize = vhost_vdpa_device_pci_realize;
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_vdpa_device_pci_info = {
+.base_name   = TYPE_VHOST_VDPA_DEVICE_PCI,
+.generic_name= "vhost-vdpa-device-pci",
+.transitional_name   = "vhost-vdpa-device-pci-transitional",
+.non_transitional_name   = "vhost-vdpa-device-pci-non-transitional",
+.instance_size  = sizeof(VhostVdpaDevicePCI),
+.instance_init  = vhost_vdpa_device_pci_instance_init,
+.class_init = vhost_vdpa_device_pci_class_init,
+};
+
+static void vhost_vdpa_device_pci_register(void)
+{
+virtio_pci_types_register(&vhost_vdpa_device_pci_info);
+}
+
+type_init(vhost_vdpa_device_pci_register);
-- 
2.23.0




[PATCH v9 5/5] docs: Add generic vhost-vdpa device documentation

2022-11-12 Thread Longpeng(Mike)
From: Longpeng 

Signed-off-by: Longpeng 
---
 .../devices/vhost-vdpa-generic-device.rst | 46 +++
 1 file changed, 46 insertions(+)
 create mode 100644 docs/system/devices/vhost-vdpa-generic-device.rst

diff --git a/docs/system/devices/vhost-vdpa-generic-device.rst 
b/docs/system/devices/vhost-vdpa-generic-device.rst
new file mode 100644
index 00..d6db9af755
--- /dev/null
+++ b/docs/system/devices/vhost-vdpa-generic-device.rst
@@ -0,0 +1,46 @@
+
+=
+vhost-vDPA generic device
+=
+
+This document explains the usage of the vhost-vDPA generic device.
+
+Description
+---
+
+A vDPA (virtio data path acceleration) device is a device that uses a
+datapath which complies with the virtio specification, combined with a
+vendor-specific control path.
+
+QEMU provides two types of vhost-vDPA devices to enable the vDPA device, one
+is type sensitive which means QEMU needs to know the actual device type
+(e.g. net, blk, scsi) and another is called "vhost-vDPA generic device" which
+is type insensitive.
+
+The vhost-vDPA generic device builds on the vhost-vdpa subsystem and virtio
+subsystem. It is quite small, but it can support any type of virtio device.
+
+Examples
+
+
+Prepare the vhost-vDPA backends first:
+
+::
+  host# ls -l /dev/vhost-vdpa-*
+  crw------- 1 root root 236, 0 Nov  2 00:49 /dev/vhost-vdpa-0
+
+Start QEMU with virtio-mmio bus:
+
+::
+  host# qemu-system  \
+  -M microvm -m 512 -smp 2 -kernel ... -initrd ...   \
+  -device vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-0   \
+  ...
+
+Start QEMU with virtio-pci bus:
+
+::
+  host# qemu-system  \
+  -M pc -m 512 -smp 2\
+  -device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-0   \
+  ...
-- 
2.23.0




[PATCH v9 2/5] vdpa: add vdpa-dev support

2022-11-12 Thread Longpeng(Mike)
From: Longpeng 

Supports vdpa-dev, so we can use the device directly:

-M microvm -m 512m -smp 2 -kernel ... -initrd ... -device \
vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-x

Reviewed-by: Stefano Garzarella 
Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/Kconfig|   5 +
 hw/virtio/meson.build|   1 +
 hw/virtio/vdpa-dev.c | 376 +++
 include/hw/virtio/vdpa-dev.h |  43 
 4 files changed, 425 insertions(+)
 create mode 100644 hw/virtio/vdpa-dev.c
 create mode 100644 include/hw/virtio/vdpa-dev.h

diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig
index cbfd8c7173..89e9e426d8 100644
--- a/hw/virtio/Kconfig
+++ b/hw/virtio/Kconfig
@@ -85,3 +85,8 @@ config VHOST_USER_GPIO
 bool
 default y
 depends on VIRTIO && VHOST_USER
+
+config VHOST_VDPA_DEV
+bool
+default y
+depends on VIRTIO && VHOST_VDPA && LINUX
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index dfed1e7af5..54d6d29af7 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -31,6 +31,7 @@ virtio_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: 
files('vhost-user-i2c.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: 
files('vhost-user-rng.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER_GPIO', if_true: 
files('vhost-user-gpio.c'))
 virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_GPIO'], if_true: 
files('vhost-user-gpio-pci.c'))
+virtio_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: files('vdpa-dev.c'))
 
 virtio_pci_ss = ss.source_set()
 virtio_pci_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: 
files('vhost-vsock-pci.c'))
diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
new file mode 100644
index 00..2885d06cbe
--- /dev/null
+++ b/hw/virtio/vdpa-dev.c
@@ -0,0 +1,376 @@
+/*
+ * Vhost Vdpa Device
+ *
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022. All Rights Reserved.
+ *
+ * Authors:
+ *   Longpeng 
+ *
+ * Largely based on the "vhost-user-blk-pci.c" and "vhost-user-blk.c"
+ * implemented by:
+ *   Changpeng Liu 
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include 
+#include 
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/cutils.h"
+#include "hw/qdev-core.h"
+#include "hw/qdev-properties.h"
+#include "hw/qdev-properties-system.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "hw/virtio/vdpa-dev.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/runstate.h"
+
+static void
+vhost_vdpa_device_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+/* Nothing to do */
+}
+
+static uint32_t
+vhost_vdpa_device_get_u32(int fd, unsigned long int cmd, Error **errp)
+{
+uint32_t val = (uint32_t)-1;
+
+if (ioctl(fd, cmd, &val) < 0) {
+error_setg(errp, "vhost-vdpa-device: cmd 0x%lx failed: %s",
+   cmd, strerror(errno));
+}
+
+return val;
+}
+
+static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+VhostVdpaDevice *v = VHOST_VDPA_DEVICE(vdev);
+uint16_t max_queue_size;
+struct vhost_virtqueue *vqs;
+int i, ret;
+
+if (!v->vhostdev) {
+error_setg(errp, "vhost-vdpa-device: vhostdev are missing");
+return;
+}
+
+v->vhostfd = qemu_open(v->vhostdev, O_RDWR, errp);
+if (*errp) {
+return;
+}
+v->vdpa.device_fd = v->vhostfd;
+
+v->vdev_id = vhost_vdpa_device_get_u32(v->vhostfd,
+   VHOST_VDPA_GET_DEVICE_ID, errp);
+if (*errp) {
+goto out;
+}
+
+max_queue_size = vhost_vdpa_device_get_u32(v->vhostfd,
+   VHOST_VDPA_GET_VRING_NUM, errp);
+if (*errp) {
+goto out;
+}
+
+if (v->queue_size > max_queue_size) {
+error_setg(errp, "vhost-vdpa-device: invalid queue_size: %u (max:%u)",
+   v->queue_size, max_queue_size);
+goto out;
+} else if (!v->queue_size) {
+v->queue_size = max_queue_size;
+}
+
+v->num_queues = vhost_vdpa_device_get_u32(v->vhostfd,
+  VHOST_VDPA_GET_VQS_COUNT, errp);
+if (*errp) {
+goto out;
+}
+
+if (!v->num_queues || v->num_queues > VIRTIO_QUEUE_MAX) {
+error_setg(errp, "invalid number of virtqueues: %u (max:%u)",
+   v->num_queues, VIRTIO_QUEUE_MAX);
+goto out;
+}
+
+v->dev.nvqs = v->num_queues;
+vqs = g_new0(struct vhost_virtqueue, v->dev.nvqs);
+v->dev.vqs = vqs;
+v->dev.vq_index = 0;
+v->dev.vq_index_end = v->dev.nvqs;
+v->dev.backend_features = 0;
+v->started = false;
+
+ret = vhost_dev_init(&v->dev, &v->vdpa, VHOST_BACKEND_TYPE_VDPA, 0, NULL);
+if (ret < 0) {
+error_setg(errp, 

[PATCH v8 1/5] virtio: get class_id and pci device id by the virtio id

2022-11-07 Thread Longpeng(Mike)
From: Longpeng 

Add helpers to get the "Transitional PCI Device ID" and "class_id"
of the device specified by the "Virtio Device ID".

These helpers will be used to build the generic vDPA device later.

Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/virtio-pci.c | 88 ++
 include/hw/virtio/virtio-pci.h |  5 ++
 2 files changed, 93 insertions(+)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 34db51e241..3469b88d43 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -19,6 +19,7 @@
 
 #include "exec/memop.h"
 #include "standard-headers/linux/virtio_pci.h"
+#include "standard-headers/linux/virtio_ids.h"
 #include "hw/boards.h"
 #include "hw/virtio/virtio.h"
 #include "migration/qemu-file-types.h"
@@ -213,6 +214,90 @@ static int virtio_pci_load_queue(DeviceState *d, int n, 
QEMUFile *f)
 return 0;
 }
 
+typedef struct VirtIOPCIIDInfo {
+/* virtio id */
+uint16_t vdev_id;
+/* pci device id for the transitional device */
+uint16_t trans_devid;
+uint16_t class_id;
+} VirtIOPCIIDInfo;
+
+static const VirtIOPCIIDInfo virtio_pci_id_info[] = {
+{
+.vdev_id = VIRTIO_ID_CRYPTO,
+.class_id = PCI_CLASS_OTHERS,
+}, {
+.vdev_id = VIRTIO_ID_FS,
+.class_id = PCI_CLASS_STORAGE_OTHER,
+}, {
+.vdev_id = VIRTIO_ID_NET,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_NET,
+.class_id = PCI_CLASS_NETWORK_ETHERNET,
+}, {
+.vdev_id = VIRTIO_ID_BLOCK,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_BLOCK,
+.class_id = PCI_CLASS_STORAGE_SCSI,
+}, {
+.vdev_id = VIRTIO_ID_CONSOLE,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_CONSOLE,
+.class_id = PCI_CLASS_COMMUNICATION_OTHER,
+}, {
+.vdev_id = VIRTIO_ID_SCSI,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_SCSI,
+.class_id = PCI_CLASS_STORAGE_SCSI
+}, {
+.vdev_id = VIRTIO_ID_9P,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_9P,
+.class_id = PCI_BASE_CLASS_NETWORK,
+}, {
+.vdev_id = VIRTIO_ID_BALLOON,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_BALLOON,
+.class_id = PCI_CLASS_OTHERS,
+}, {
+.vdev_id = VIRTIO_ID_RNG,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_RNG,
+.class_id = PCI_CLASS_OTHERS,
+},
+};
+
+static const VirtIOPCIIDInfo *virtio_pci_get_id_info(uint16_t vdev_id)
+{
+const VirtIOPCIIDInfo *info = NULL;
+int i;
+
+for (i = 0; i < ARRAY_SIZE(virtio_pci_id_info); i++) {
+if (virtio_pci_id_info[i].vdev_id == vdev_id) {
+info = &virtio_pci_id_info[i];
+break;
+}
+}
+
+if (!info) {
+/* The device id is invalid or not added to the id_info yet. */
+error_report("Invalid virtio device(id %u)", vdev_id);
+abort();
+}
+
+return info;
+}
+
+/*
+ * Get the Transitional Device ID for the specific device, return
+ * zero if the device is non-transitional.
+ */
+uint16_t virtio_pci_get_trans_devid(uint16_t device_id)
+{
+return virtio_pci_get_id_info(device_id)->trans_devid;
+}
+
+/*
+ * Get the Class ID for the specific device.
+ */
+uint16_t virtio_pci_get_class_id(uint16_t device_id)
+{
+return virtio_pci_get_id_info(device_id)->class_id;
+}
+
 static bool virtio_pci_ioeventfd_enabled(DeviceState *d)
 {
 VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
@@ -1683,6 +1768,9 @@ static void virtio_pci_device_plugged(DeviceState *d, 
Error **errp)
  * is set to PCI_SUBVENDOR_ID_REDHAT_QUMRANET by default.
  */
 pci_set_word(config + PCI_SUBSYSTEM_ID, virtio_bus_get_vdev_id(bus));
+if (proxy->trans_devid) {
+pci_config_set_device_id(config, proxy->trans_devid);
+}
 } else {
 /* pure virtio-1.0 */
 pci_set_word(config + PCI_VENDOR_ID,
diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h
index 2446dcd9ae..d95b1a13a5 100644
--- a/include/hw/virtio/virtio-pci.h
+++ b/include/hw/virtio/virtio-pci.h
@@ -146,6 +146,8 @@ struct VirtIOPCIProxy {
 bool disable_modern;
 bool ignore_backend_features;
 OnOffAuto disable_legacy;
+/* Transitional device id */
+uint16_t trans_devid;
 uint32_t class_code;
 uint32_t nvectors;
 uint32_t dfselect;
@@ -179,6 +181,9 @@ static inline void virtio_pci_disable_modern(VirtIOPCIProxy 
*proxy)
 proxy->disable_modern = true;
 }
 
+uint16_t virtio_pci_get_trans_devid(uint16_t device_id);
+uint16_t virtio_pci_get_class_id(uint16_t device_id);
+
 /*
  * virtio-input-pci: This extends VirtioPCIProxy.
  */
-- 
2.23.0




[PATCH v8 4/5] vdpa-dev: mark the device as unmigratable

2022-11-07 Thread Longpeng(Mike)
From: Longpeng 

The generic vDPA device doesn't support migration currently, so
mark it as unmigratable temporarily.

Signed-off-by: Longpeng 
---
 hw/virtio/vdpa-dev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
index 2885d06cbe..62d83d3423 100644
--- a/hw/virtio/vdpa-dev.c
+++ b/hw/virtio/vdpa-dev.c
@@ -327,6 +327,7 @@ static Property vhost_vdpa_device_properties[] = {
 
 static const VMStateDescription vmstate_vhost_vdpa_device = {
 .name = "vhost-vdpa-device",
+.unmigratable = 1,
 .minimum_version_id = 1,
 .version_id = 1,
 .fields = (VMStateField[]) {
-- 
2.23.0




[PATCH v8 5/5] docs: Add generic vhost-vdpa device documentation

2022-11-07 Thread Longpeng(Mike)
From: Longpeng 

Signed-off-by: Longpeng 
---
 docs/system/devices/vhost-vdpa-device.rst | 43 +++
 1 file changed, 43 insertions(+)
 create mode 100644 docs/system/devices/vhost-vdpa-device.rst

diff --git a/docs/system/devices/vhost-vdpa-device.rst 
b/docs/system/devices/vhost-vdpa-device.rst
new file mode 100644
index 00..b758c4fce6
--- /dev/null
+++ b/docs/system/devices/vhost-vdpa-device.rst
@@ -0,0 +1,43 @@
+
+=
+generic vhost-vdpa device
+=
+
+This document explains the usage of the generic vhost vdpa device.
+
+Description
+---
+
+vDPA (virtio data path acceleration) device is a device that uses a datapath
+which complies with the virtio specifications with vendor specific control
+path.
+
+QEMU provides two types of vhost-vdpa devices to enable the vDPA device, one
+is type sensitive which means QEMU needs to know the actual device type
+(e.g. net, blk, scsi) and another is called "generic vdpa device" which is
+type insensitive (like vfio-pci).
+
+Examples
+
+
+Prepare the vhost-vdpa backends first:
+
+::
+  host# ls -l /dev/vhost-vdpa-*
+  crw--- 1 root root 236, 0 Nov  2 00:49 /dev/vhost-vdpa-0
+
+Start QEMU with virtio-mmio bus:
+
+::
+  host# qemu-system  \
+  -M microvm -m 512 -smp 2 -kernel ... -initrd ...   \
+  -device vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-0   \
+  ...
+
+Start QEMU with virtio-pci bus:
+
+::
+  host# qemu-system  \
+  -M pc -m 512 -smp 2\
+  -device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-0   \
+  ...
-- 
2.23.0




[PATCH v8 3/5] vdpa: add vdpa-dev-pci support

2022-11-07 Thread Longpeng(Mike)
From: Longpeng 

Supports vdpa-dev-pci, we can use the device as follows:

-device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-X

Reviewed-by: Stefano Garzarella 
Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/meson.build|   1 +
 hw/virtio/vdpa-dev-pci.c | 102 +++
 2 files changed, 103 insertions(+)
 create mode 100644 hw/virtio/vdpa-dev-pci.c

diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index 54d6d29af7..559b80cb28 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -57,6 +57,7 @@ virtio_pci_ss.add(when: 'CONFIG_VIRTIO_SERIAL', if_true: 
files('virtio-serial-pc
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_PMEM', if_true: 
files('virtio-pmem-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: 
files('virtio-iommu-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: 
files('virtio-mem-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: 
files('vdpa-dev-pci.c'))
 
 virtio_ss.add_all(when: 'CONFIG_VIRTIO_PCI', if_true: virtio_pci_ss)
 
diff --git a/hw/virtio/vdpa-dev-pci.c b/hw/virtio/vdpa-dev-pci.c
new file mode 100644
index 00..5446e6b393
--- /dev/null
+++ b/hw/virtio/vdpa-dev-pci.c
@@ -0,0 +1,102 @@
+/*
+ * Vhost Vdpa Device PCI Bindings
+ *
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022. All Rights Reserved.
+ *
+ * Authors:
+ *   Longpeng 
+ *
+ * Largely based on the "vhost-user-blk-pci.c" and "vhost-user-blk.c"
+ * implemented by:
+ *   Changpeng Liu 
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include 
+#include 
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/vdpa-dev.h"
+#include "hw/pci/pci.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qom/object.h"
+
+
+typedef struct VhostVdpaDevicePCI VhostVdpaDevicePCI;
+
+#define TYPE_VHOST_VDPA_DEVICE_PCI "vhost-vdpa-device-pci-base"
+DECLARE_INSTANCE_CHECKER(VhostVdpaDevicePCI, VHOST_VDPA_DEVICE_PCI,
+ TYPE_VHOST_VDPA_DEVICE_PCI)
+
+struct VhostVdpaDevicePCI {
+VirtIOPCIProxy parent_obj;
+VhostVdpaDevice vdev;
+};
+
+static void vhost_vdpa_device_pci_instance_init(Object *obj)
+{
+VhostVdpaDevicePCI *dev = VHOST_VDPA_DEVICE_PCI(obj);
+
+virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+TYPE_VHOST_VDPA_DEVICE);
+object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+  "bootindex");
+}
+
+static Property vhost_vdpa_device_pci_properties[] = {
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static int vhost_vdpa_device_pci_post_init(VhostVdpaDevice *v, Error **errp)
+{
+VhostVdpaDevicePCI *dev = container_of(v, VhostVdpaDevicePCI, vdev);
+VirtIOPCIProxy *vpci_dev = &dev->parent_obj;
+
+vpci_dev->class_code = virtio_pci_get_class_id(v->vdev_id);
+vpci_dev->trans_devid = virtio_pci_get_trans_devid(v->vdev_id);
+/* one for config vector */
+vpci_dev->nvectors = v->num_queues + 1;
+
+return 0;
+}
+
+static void
+vhost_vdpa_device_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+VhostVdpaDevicePCI *dev = VHOST_VDPA_DEVICE_PCI(vpci_dev);
+
+dev->vdev.post_init = vhost_vdpa_device_pci_post_init;
+qdev_realize(DEVICE(&dev->vdev), BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_vdpa_device_pci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+
+set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+device_class_set_props(dc, vhost_vdpa_device_pci_properties);
+k->realize = vhost_vdpa_device_pci_realize;
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_vdpa_device_pci_info = {
+.base_name   = TYPE_VHOST_VDPA_DEVICE_PCI,
+.generic_name= "vhost-vdpa-device-pci",
+.transitional_name   = "vhost-vdpa-device-pci-transitional",
+.non_transitional_name   = "vhost-vdpa-device-pci-non-transitional",
+.instance_size  = sizeof(VhostVdpaDevicePCI),
+.instance_init  = vhost_vdpa_device_pci_instance_init,
+.class_init = vhost_vdpa_device_pci_class_init,
+};
+
+static void vhost_vdpa_device_pci_register(void)
+{
+virtio_pci_types_register(&vhost_vdpa_device_pci_info);
+}
+
+type_init(vhost_vdpa_device_pci_register);
-- 
2.23.0




[PATCH v8 0/5] add generic vDPA device support

2022-11-07 Thread Longpeng(Mike)
From: Longpeng 

Hi guys,

With the generic vDPA device, QEMU won't need to touch the device
types any more, much like vfio.

We can use the generic vDPA device as follows:
  -device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-X
  Or
  -M microvm -m 512m -smp 2 -kernel ... -initrd ... -device \
  vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-x

Changes v7 -> v8:
- add migration blocker. [Michael]

Changes v6 -> v7:
(v6: https://mail.gnu.org/archive/html/qemu-devel/2022-05/msg02821.html)
- rebase. [Jason]
- add documentation. [Stefan]

Changes v5 -> v6:
  Patch 2:
- Turn to the original approach in the RFC to initialize the
  virtio_pci_id_info array. [Michael]
  https://lore.kernel.org/all/20220105005900.860-2-longpe...@huawei.com/
  Patch 3:
- Fix logical error of exception handler around the post_init.
  [Stefano]
- Fix some coding style warnings. [Stefano]
  Patch 4:
- Fix some coding style warnings. [Stefano]

Changes v4 -> v5:
  Patch 3:
- remove vhostfd [Jason]
- support virtio-mmio [Jason]

Changes v3 -> v4:
  v3: https://www.mail-archive.com/qemu-devel@nongnu.org/msg877015.html
  - reorganize the series [Stefano]
  - fix some typos [Stefano]
  - fix logical error in vhost_vdpa_device_realize [Stefano]

Changes v2 -> v3
  Patch 4 & 5:
- only call vdpa ioctls in vdpa-dev.c [Stefano, Longpeng]
- s/VQS_NUM/VQS_COUNT  [Stefano]
- check both vdpa_dev_fd and vdpa_dev [Stefano]
  Patch 6:
- move all steps into vhost_vdpa_device_unrealize. [Stefano]

Changes RFC -> v2
  Patch 1:
- rename 'pdev_id' to 'trans_devid'  [Michael]
- only use transitional device id for the devices
  listed in the spec  [Michael]
- use macros to make the id_info table clearer  [Longpeng]
- add some modern devices in the id_info table  [Longpeng]
  Patch 2:
- remove the GET_VECTORS_NUM command  [Jason]
  Patch 4:
- expose vdpa_dev_fd as a QOM property  [Stefan]
- introduce vhost_vdpa_device_get_u32 as a common
  function to make the code clearer  [Stefan]
- fix the misleading description of 'dc->desc'  [Stefano]
  Patch 5:
- check returned number of virtqueues  [Stefan]
  Patch 6:
- init s->num_queues  [Stefano]
- free s->dev.vqs  [Stefano]

Longpeng (Mike) (5):
  virtio: get class_id and pci device id by the virtio id
  vdpa: add vdpa-dev support
  vdpa: add vdpa-dev-pci support
  vdpa-dev: mark the device as unmigratable
  docs: Add generic vhost-vdpa device documentation

 docs/system/devices/vhost-vdpa-device.rst |  43 +++
 hw/virtio/Kconfig |   5 +
 hw/virtio/meson.build |   2 +
 hw/virtio/vdpa-dev-pci.c  | 102 ++
 hw/virtio/vdpa-dev.c  | 377 ++
 hw/virtio/virtio-pci.c|  88 +
 include/hw/virtio/vdpa-dev.h  |  43 +++
 include/hw/virtio/virtio-pci.h|   5 +
 8 files changed, 665 insertions(+)
 create mode 100644 docs/system/devices/vhost-vdpa-device.rst
 create mode 100644 hw/virtio/vdpa-dev-pci.c
 create mode 100644 hw/virtio/vdpa-dev.c
 create mode 100644 include/hw/virtio/vdpa-dev.h

-- 
2.23.0




[PATCH v8 2/5] vdpa: add vdpa-dev support

2022-11-07 Thread Longpeng(Mike)
From: Longpeng 

Supports vdpa-dev, we can use the device directly:

-M microvm -m 512m -smp 2 -kernel ... -initrd ... -device \
vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-x

Reviewed-by: Stefano Garzarella 
Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/Kconfig|   5 +
 hw/virtio/meson.build|   1 +
 hw/virtio/vdpa-dev.c | 376 +++
 include/hw/virtio/vdpa-dev.h |  43 
 4 files changed, 425 insertions(+)
 create mode 100644 hw/virtio/vdpa-dev.c
 create mode 100644 include/hw/virtio/vdpa-dev.h

diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig
index cbfd8c7173..89e9e426d8 100644
--- a/hw/virtio/Kconfig
+++ b/hw/virtio/Kconfig
@@ -85,3 +85,8 @@ config VHOST_USER_GPIO
 bool
 default y
 depends on VIRTIO && VHOST_USER
+
+config VHOST_VDPA_DEV
+bool
+default y
+depends on VIRTIO && VHOST_VDPA && LINUX
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index dfed1e7af5..54d6d29af7 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -31,6 +31,7 @@ virtio_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: 
files('vhost-user-i2c.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: 
files('vhost-user-rng.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER_GPIO', if_true: 
files('vhost-user-gpio.c'))
 virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_GPIO'], if_true: 
files('vhost-user-gpio-pci.c'))
+virtio_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: files('vdpa-dev.c'))
 
 virtio_pci_ss = ss.source_set()
 virtio_pci_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: 
files('vhost-vsock-pci.c'))
diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
new file mode 100644
index 00..2885d06cbe
--- /dev/null
+++ b/hw/virtio/vdpa-dev.c
@@ -0,0 +1,376 @@
+/*
+ * Vhost Vdpa Device
+ *
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022. All Rights Reserved.
+ *
+ * Authors:
+ *   Longpeng 
+ *
+ * Largely based on the "vhost-user-blk-pci.c" and "vhost-user-blk.c"
+ * implemented by:
+ *   Changpeng Liu 
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include 
+#include 
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/cutils.h"
+#include "hw/qdev-core.h"
+#include "hw/qdev-properties.h"
+#include "hw/qdev-properties-system.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "hw/virtio/vdpa-dev.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/runstate.h"
+
+static void
+vhost_vdpa_device_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+/* Nothing to do */
+}
+
+static uint32_t
+vhost_vdpa_device_get_u32(int fd, unsigned long int cmd, Error **errp)
+{
+uint32_t val = (uint32_t)-1;
+
+if (ioctl(fd, cmd, &val) < 0) {
+error_setg(errp, "vhost-vdpa-device: cmd 0x%lx failed: %s",
+   cmd, strerror(errno));
+}
+
+return val;
+}
+
+static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+VhostVdpaDevice *v = VHOST_VDPA_DEVICE(vdev);
+uint16_t max_queue_size;
+struct vhost_virtqueue *vqs;
+int i, ret;
+
+if (!v->vhostdev) {
+error_setg(errp, "vhost-vdpa-device: vhostdev are missing");
+return;
+}
+
+v->vhostfd = qemu_open(v->vhostdev, O_RDWR, errp);
+if (*errp) {
+return;
+}
+v->vdpa.device_fd = v->vhostfd;
+
+v->vdev_id = vhost_vdpa_device_get_u32(v->vhostfd,
+   VHOST_VDPA_GET_DEVICE_ID, errp);
+if (*errp) {
+goto out;
+}
+
+max_queue_size = vhost_vdpa_device_get_u32(v->vhostfd,
+   VHOST_VDPA_GET_VRING_NUM, errp);
+if (*errp) {
+goto out;
+}
+
+if (v->queue_size > max_queue_size) {
+error_setg(errp, "vhost-vdpa-device: invalid queue_size: %u (max:%u)",
+   v->queue_size, max_queue_size);
+goto out;
+} else if (!v->queue_size) {
+v->queue_size = max_queue_size;
+}
+
+v->num_queues = vhost_vdpa_device_get_u32(v->vhostfd,
+  VHOST_VDPA_GET_VQS_COUNT, errp);
+if (*errp) {
+goto out;
+}
+
+if (!v->num_queues || v->num_queues > VIRTIO_QUEUE_MAX) {
+error_setg(errp, "invalid number of virtqueues: %u (max:%u)",
+   v->num_queues, VIRTIO_QUEUE_MAX);
+goto out;
+}
+
+v->dev.nvqs = v->num_queues;
+vqs = g_new0(struct vhost_virtqueue, v->dev.nvqs);
+v->dev.vqs = vqs;
+v->dev.vq_index = 0;
+v->dev.vq_index_end = v->dev.nvqs;
+v->dev.backend_features = 0;
+v->started = false;
+
+ret = vhost_dev_init(&v->dev, &v->vdpa, VHOST_BACKEND_TYPE_VDPA, 0, NULL);
+if (ret < 0) {
+error_setg(errp, 

Re: [PATCH] hw/riscv: virt: Remove size restriction for pflash

2022-11-06 Thread Mike Maslenkin
Hello Sunil!

What about the virt_machine_done() function?
The kernel_entry variable still points to the second flash, which starts at
virt_memmap[VIRT_FLASH].size / 2.
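
A rough sketch of the concern, assuming virt_machine_done() still computes
the entry point roughly like this (illustrative only, the exact code in
hw/riscv/virt.c may differ):

    /* the entry point for firmware kept in the second flash is still derived
     * from the fixed half-size split, not from the real flash0 size that
     * this patch now queries from the block backend */
    kernel_entry = virt_memmap[VIRT_FLASH].base +
                   virt_memmap[VIRT_FLASH].size / 2;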

On Sun, Nov 6, 2022 at 5:41 PM Sunil V L  wrote:
>
> The pflash implementation currently assumes fixed size of the
> backend storage. Due to this, the backend storage file needs to be
> exactly of size 32M. Otherwise, there will be an error like below.
>
> "device requires 33554432 bytes, block backend provides 3145728 bytes"
>
> Fix this issue by using the actual size of the backing store.
>
> Signed-off-by: Sunil V L 
> ---
>  hw/riscv/virt.c | 33 +
>  1 file changed, 25 insertions(+), 8 deletions(-)
>
> diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
> index a5bc7353b4..aad175fa31 100644
> --- a/hw/riscv/virt.c
> +++ b/hw/riscv/virt.c
> @@ -49,6 +49,7 @@
>  #include "hw/pci/pci.h"
>  #include "hw/pci-host/gpex.h"
>  #include "hw/display/ramfb.h"
> +#include "sysemu/block-backend.h"
>
>  /*
>   * The virt machine physical address space used by some of the devices
> @@ -144,10 +145,17 @@ static void virt_flash_map1(PFlashCFI01 *flash,
>  MemoryRegion *sysmem)
>  {
>  DeviceState *dev = DEVICE(flash);
> +BlockBackend *blk;
> +hwaddr real_size;
>
> -assert(QEMU_IS_ALIGNED(size, VIRT_FLASH_SECTOR_SIZE));
> -assert(size / VIRT_FLASH_SECTOR_SIZE <= UINT32_MAX);
> -qdev_prop_set_uint32(dev, "num-blocks", size / VIRT_FLASH_SECTOR_SIZE);
> +blk = pflash_cfi01_get_blk(flash);
> +
> +real_size = blk ? blk_getlength(blk): size;
> +
> +assert(real_size);
> +assert(QEMU_IS_ALIGNED(real_size, VIRT_FLASH_SECTOR_SIZE));
> +assert(real_size / VIRT_FLASH_SECTOR_SIZE <= UINT32_MAX);
> +qdev_prop_set_uint32(dev, "num-blocks", real_size / 
> VIRT_FLASH_SECTOR_SIZE);
>  sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
>
>  memory_region_add_subregion(sysmem, base,
> @@ -971,15 +979,24 @@ static void create_fdt_flash(RISCVVirtState *s, const 
> MemMapEntry *memmap)
>  {
>  char *name;
>  MachineState *mc = MACHINE(s);
> -hwaddr flashsize = virt_memmap[VIRT_FLASH].size / 2;
> -hwaddr flashbase = virt_memmap[VIRT_FLASH].base;
> +MemoryRegion *flash_mem;
> +hwaddr flashsize[2];
> +hwaddr flashbase[2];
> +
> +flash_mem = pflash_cfi01_get_memory(s->flash[0]);
> +flashbase[0] = flash_mem->addr;
> +flashsize[0] = flash_mem->size;
> +
> +flash_mem = pflash_cfi01_get_memory(s->flash[1]);
> +flashbase[1] = flash_mem->addr;
> +flashsize[1] = flash_mem->size;
>
> -name = g_strdup_printf("/flash@%" PRIx64, flashbase);
> +name = g_strdup_printf("/flash@%" PRIx64, flashbase[0]);
>  qemu_fdt_add_subnode(mc->fdt, name);
>  qemu_fdt_setprop_string(mc->fdt, name, "compatible", "cfi-flash");
>  qemu_fdt_setprop_sized_cells(mc->fdt, name, "reg",
> - 2, flashbase, 2, flashsize,
> - 2, flashbase + flashsize, 2, flashsize);
> + 2, flashbase[0], 2, flashsize[0],
> + 2, flashbase[1], 2, flashsize[1]);
>  qemu_fdt_setprop_cell(mc->fdt, name, "bank-width", 4);
>  g_free(name);
>  }
> --
> 2.38.0
>
>



[PATCH v7 resend 3/4] vdpa: add vdpa-dev-pci support

2022-11-05 Thread Longpeng(Mike)
From: Longpeng 

Supports vdpa-dev-pci, we can use the device as follows:

-device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-X

Reviewed-by: Stefano Garzarella 
Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/meson.build|   1 +
 hw/virtio/vdpa-dev-pci.c | 102 +++
 2 files changed, 103 insertions(+)
 create mode 100644 hw/virtio/vdpa-dev-pci.c

diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index 54d6d29af7..559b80cb28 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -57,6 +57,7 @@ virtio_pci_ss.add(when: 'CONFIG_VIRTIO_SERIAL', if_true: 
files('virtio-serial-pc
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_PMEM', if_true: 
files('virtio-pmem-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: 
files('virtio-iommu-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: 
files('virtio-mem-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: 
files('vdpa-dev-pci.c'))
 
 virtio_ss.add_all(when: 'CONFIG_VIRTIO_PCI', if_true: virtio_pci_ss)
 
diff --git a/hw/virtio/vdpa-dev-pci.c b/hw/virtio/vdpa-dev-pci.c
new file mode 100644
index 00..5446e6b393
--- /dev/null
+++ b/hw/virtio/vdpa-dev-pci.c
@@ -0,0 +1,102 @@
+/*
+ * Vhost Vdpa Device PCI Bindings
+ *
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022. All Rights Reserved.
+ *
+ * Authors:
+ *   Longpeng 
+ *
+ * Largely based on the "vhost-user-blk-pci.c" and "vhost-user-blk.c"
+ * implemented by:
+ *   Changpeng Liu 
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include 
+#include 
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/vdpa-dev.h"
+#include "hw/pci/pci.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qom/object.h"
+
+
+typedef struct VhostVdpaDevicePCI VhostVdpaDevicePCI;
+
+#define TYPE_VHOST_VDPA_DEVICE_PCI "vhost-vdpa-device-pci-base"
+DECLARE_INSTANCE_CHECKER(VhostVdpaDevicePCI, VHOST_VDPA_DEVICE_PCI,
+ TYPE_VHOST_VDPA_DEVICE_PCI)
+
+struct VhostVdpaDevicePCI {
+VirtIOPCIProxy parent_obj;
+VhostVdpaDevice vdev;
+};
+
+static void vhost_vdpa_device_pci_instance_init(Object *obj)
+{
+VhostVdpaDevicePCI *dev = VHOST_VDPA_DEVICE_PCI(obj);
+
+virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+TYPE_VHOST_VDPA_DEVICE);
+object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+  "bootindex");
+}
+
+static Property vhost_vdpa_device_pci_properties[] = {
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static int vhost_vdpa_device_pci_post_init(VhostVdpaDevice *v, Error **errp)
+{
+VhostVdpaDevicePCI *dev = container_of(v, VhostVdpaDevicePCI, vdev);
+VirtIOPCIProxy *vpci_dev = &dev->parent_obj;
+
+vpci_dev->class_code = virtio_pci_get_class_id(v->vdev_id);
+vpci_dev->trans_devid = virtio_pci_get_trans_devid(v->vdev_id);
+/* one for config vector */
+vpci_dev->nvectors = v->num_queues + 1;
+
+return 0;
+}
+
+static void
+vhost_vdpa_device_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+VhostVdpaDevicePCI *dev = VHOST_VDPA_DEVICE_PCI(vpci_dev);
+
+dev->vdev.post_init = vhost_vdpa_device_pci_post_init;
+qdev_realize(DEVICE(&dev->vdev), BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_vdpa_device_pci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+
+set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+device_class_set_props(dc, vhost_vdpa_device_pci_properties);
+k->realize = vhost_vdpa_device_pci_realize;
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_vdpa_device_pci_info = {
+.base_name   = TYPE_VHOST_VDPA_DEVICE_PCI,
+.generic_name= "vhost-vdpa-device-pci",
+.transitional_name   = "vhost-vdpa-device-pci-transitional",
+.non_transitional_name   = "vhost-vdpa-device-pci-non-transitional",
+.instance_size  = sizeof(VhostVdpaDevicePCI),
+.instance_init  = vhost_vdpa_device_pci_instance_init,
+.class_init = vhost_vdpa_device_pci_class_init,
+};
+
+static void vhost_vdpa_device_pci_register(void)
+{
+virtio_pci_types_register(&vhost_vdpa_device_pci_info);
+}
+
+type_init(vhost_vdpa_device_pci_register);
-- 
2.23.0




[PATCH v7 resend 2/4] vdpa: add vdpa-dev support

2022-11-05 Thread Longpeng(Mike)
From: Longpeng 

Supports vdpa-dev, we can use the device directly:

-M microvm -m 512m -smp 2 -kernel ... -initrd ... -device \
vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-x

Reviewed-by: Stefano Garzarella 
Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/Kconfig|   5 +
 hw/virtio/meson.build|   1 +
 hw/virtio/vdpa-dev.c | 377 +++
 include/hw/virtio/vdpa-dev.h |  43 
 4 files changed, 426 insertions(+)
 create mode 100644 hw/virtio/vdpa-dev.c
 create mode 100644 include/hw/virtio/vdpa-dev.h

diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig
index cbfd8c7173..89e9e426d8 100644
--- a/hw/virtio/Kconfig
+++ b/hw/virtio/Kconfig
@@ -85,3 +85,8 @@ config VHOST_USER_GPIO
 bool
 default y
 depends on VIRTIO && VHOST_USER
+
+config VHOST_VDPA_DEV
+bool
+default y
+depends on VIRTIO && VHOST_VDPA && LINUX
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index dfed1e7af5..54d6d29af7 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -31,6 +31,7 @@ virtio_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: 
files('vhost-user-i2c.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: 
files('vhost-user-rng.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER_GPIO', if_true: 
files('vhost-user-gpio.c'))
 virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_GPIO'], if_true: 
files('vhost-user-gpio-pci.c'))
+virtio_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: files('vdpa-dev.c'))
 
 virtio_pci_ss = ss.source_set()
 virtio_pci_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: 
files('vhost-vsock-pci.c'))
diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
new file mode 100644
index 00..62d83d3423
--- /dev/null
+++ b/hw/virtio/vdpa-dev.c
@@ -0,0 +1,377 @@
+/*
+ * Vhost Vdpa Device
+ *
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022. All Rights Reserved.
+ *
+ * Authors:
+ *   Longpeng 
+ *
+ * Largely based on the "vhost-user-blk-pci.c" and "vhost-user-blk.c"
+ * implemented by:
+ *   Changpeng Liu 
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include 
+#include 
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/cutils.h"
+#include "hw/qdev-core.h"
+#include "hw/qdev-properties.h"
+#include "hw/qdev-properties-system.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "hw/virtio/vdpa-dev.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/runstate.h"
+
+static void
+vhost_vdpa_device_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+/* Nothing to do */
+}
+
+static uint32_t
+vhost_vdpa_device_get_u32(int fd, unsigned long int cmd, Error **errp)
+{
+uint32_t val = (uint32_t)-1;
+
+if (ioctl(fd, cmd, &val) < 0) {
+error_setg(errp, "vhost-vdpa-device: cmd 0x%lx failed: %s",
+   cmd, strerror(errno));
+}
+
+return val;
+}
+
+static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+VhostVdpaDevice *v = VHOST_VDPA_DEVICE(vdev);
+uint16_t max_queue_size;
+struct vhost_virtqueue *vqs;
+int i, ret;
+
+if (!v->vhostdev) {
+error_setg(errp, "vhost-vdpa-device: vhostdev are missing");
+return;
+}
+
+v->vhostfd = qemu_open(v->vhostdev, O_RDWR, errp);
+if (*errp) {
+return;
+}
+v->vdpa.device_fd = v->vhostfd;
+
+v->vdev_id = vhost_vdpa_device_get_u32(v->vhostfd,
+   VHOST_VDPA_GET_DEVICE_ID, errp);
+if (*errp) {
+goto out;
+}
+
+max_queue_size = vhost_vdpa_device_get_u32(v->vhostfd,
+   VHOST_VDPA_GET_VRING_NUM, errp);
+if (*errp) {
+goto out;
+}
+
+if (v->queue_size > max_queue_size) {
+error_setg(errp, "vhost-vdpa-device: invalid queue_size: %u (max:%u)",
+   v->queue_size, max_queue_size);
+goto out;
+} else if (!v->queue_size) {
+v->queue_size = max_queue_size;
+}
+
+v->num_queues = vhost_vdpa_device_get_u32(v->vhostfd,
+  VHOST_VDPA_GET_VQS_COUNT, errp);
+if (*errp) {
+goto out;
+}
+
+if (!v->num_queues || v->num_queues > VIRTIO_QUEUE_MAX) {
+error_setg(errp, "invalid number of virtqueues: %u (max:%u)",
+   v->num_queues, VIRTIO_QUEUE_MAX);
+goto out;
+}
+
+v->dev.nvqs = v->num_queues;
+vqs = g_new0(struct vhost_virtqueue, v->dev.nvqs);
+v->dev.vqs = vqs;
+v->dev.vq_index = 0;
+v->dev.vq_index_end = v->dev.nvqs;
+v->dev.backend_features = 0;
+v->started = false;
+
+ret = vhost_dev_init(&v->dev, &v->vdpa, VHOST_BACKEND_TYPE_VDPA, 0, NULL);
+if (ret < 0) {
+error_setg(errp, 

[PATCH v7 resend 4/4] docs: Add generic vhost-vdpa device documentation

2022-11-05 Thread Longpeng(Mike)
From: Longpeng 

Signed-off-by: Longpeng 
---
 docs/system/devices/vhost-vdpa-device.rst | 43 +++
 1 file changed, 43 insertions(+)
 create mode 100644 docs/system/devices/vhost-vdpa-device.rst

diff --git a/docs/system/devices/vhost-vdpa-device.rst 
b/docs/system/devices/vhost-vdpa-device.rst
new file mode 100644
index 00..b758c4fce6
--- /dev/null
+++ b/docs/system/devices/vhost-vdpa-device.rst
@@ -0,0 +1,43 @@
+
+=
+generic vhost-vdpa device
+=
+
+This document explains the usage of the generic vhost vdpa device.
+
+Description
+---
+
+vDPA (virtio data path acceleration) device is a device that uses a datapath
+which complies with the virtio specifications with vendor specific control
+path.
+
+QEMU provides two types of vhost-vdpa devices to enable the vDPA device, one
+is type sensitive which means QEMU needs to know the actual device type
+(e.g. net, blk, scsi) and another is called "generic vdpa device" which is
+type insensitive (like vfio-pci).
+
+Examples
+
+
+Prepare the vhost-vdpa backends first:
+
+::
+  host# ls -l /dev/vhost-vdpa-*
+  crw--- 1 root root 236, 0 Nov  2 00:49 /dev/vhost-vdpa-0
+
+Start QEMU with virtio-mmio bus:
+
+::
+  host# qemu-system  \
+  -M microvm -m 512 -smp 2 -kernel ... -initrd ...   \
+  -device vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-0   \
+  ...
+
+Start QEMU with virtio-pci bus:
+
+::
+  host# qemu-system  \
+  -M pc -m 512 -smp 2\
+  -device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-0   \
+  ...
-- 
2.23.0




[PATCH v7 resend 1/4] virtio: get class_id and pci device id by the virtio id

2022-11-05 Thread Longpeng(Mike)
From: Longpeng 

Add helpers to get the "Transitional PCI Device ID" and "class_id"
of the device specified by the "Virtio Device ID".

These helpers will be used to build the generic vDPA device later.

Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/virtio-pci.c | 88 ++
 include/hw/virtio/virtio-pci.h |  5 ++
 2 files changed, 93 insertions(+)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 34db51e241..3469b88d43 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -19,6 +19,7 @@
 
 #include "exec/memop.h"
 #include "standard-headers/linux/virtio_pci.h"
+#include "standard-headers/linux/virtio_ids.h"
 #include "hw/boards.h"
 #include "hw/virtio/virtio.h"
 #include "migration/qemu-file-types.h"
@@ -213,6 +214,90 @@ static int virtio_pci_load_queue(DeviceState *d, int n, 
QEMUFile *f)
 return 0;
 }
 
+typedef struct VirtIOPCIIDInfo {
+/* virtio id */
+uint16_t vdev_id;
+/* pci device id for the transitional device */
+uint16_t trans_devid;
+uint16_t class_id;
+} VirtIOPCIIDInfo;
+
+static const VirtIOPCIIDInfo virtio_pci_id_info[] = {
+{
+.vdev_id = VIRTIO_ID_CRYPTO,
+.class_id = PCI_CLASS_OTHERS,
+}, {
+.vdev_id = VIRTIO_ID_FS,
+.class_id = PCI_CLASS_STORAGE_OTHER,
+}, {
+.vdev_id = VIRTIO_ID_NET,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_NET,
+.class_id = PCI_CLASS_NETWORK_ETHERNET,
+}, {
+.vdev_id = VIRTIO_ID_BLOCK,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_BLOCK,
+.class_id = PCI_CLASS_STORAGE_SCSI,
+}, {
+.vdev_id = VIRTIO_ID_CONSOLE,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_CONSOLE,
+.class_id = PCI_CLASS_COMMUNICATION_OTHER,
+}, {
+.vdev_id = VIRTIO_ID_SCSI,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_SCSI,
+.class_id = PCI_CLASS_STORAGE_SCSI
+}, {
+.vdev_id = VIRTIO_ID_9P,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_9P,
+.class_id = PCI_BASE_CLASS_NETWORK,
+}, {
+.vdev_id = VIRTIO_ID_BALLOON,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_BALLOON,
+.class_id = PCI_CLASS_OTHERS,
+}, {
+.vdev_id = VIRTIO_ID_RNG,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_RNG,
+.class_id = PCI_CLASS_OTHERS,
+},
+};
+
+static const VirtIOPCIIDInfo *virtio_pci_get_id_info(uint16_t vdev_id)
+{
+const VirtIOPCIIDInfo *info = NULL;
+int i;
+
+for (i = 0; i < ARRAY_SIZE(virtio_pci_id_info); i++) {
+if (virtio_pci_id_info[i].vdev_id == vdev_id) {
+info = &virtio_pci_id_info[i];
+break;
+}
+}
+
+if (!info) {
+/* The device id is invalid or not added to the id_info yet. */
+error_report("Invalid virtio device(id %u)", vdev_id);
+abort();
+}
+
+return info;
+}
+
+/*
+ * Get the Transitional Device ID for the specific device, return
+ * zero if the device is non-transitional.
+ */
+uint16_t virtio_pci_get_trans_devid(uint16_t device_id)
+{
+return virtio_pci_get_id_info(device_id)->trans_devid;
+}
+
+/*
+ * Get the Class ID for the specific device.
+ */
+uint16_t virtio_pci_get_class_id(uint16_t device_id)
+{
+return virtio_pci_get_id_info(device_id)->class_id;
+}
+
 static bool virtio_pci_ioeventfd_enabled(DeviceState *d)
 {
 VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
@@ -1683,6 +1768,9 @@ static void virtio_pci_device_plugged(DeviceState *d, 
Error **errp)
  * is set to PCI_SUBVENDOR_ID_REDHAT_QUMRANET by default.
  */
 pci_set_word(config + PCI_SUBSYSTEM_ID, virtio_bus_get_vdev_id(bus));
+if (proxy->trans_devid) {
+pci_config_set_device_id(config, proxy->trans_devid);
+}
 } else {
 /* pure virtio-1.0 */
 pci_set_word(config + PCI_VENDOR_ID,
diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h
index 2446dcd9ae..d95b1a13a5 100644
--- a/include/hw/virtio/virtio-pci.h
+++ b/include/hw/virtio/virtio-pci.h
@@ -146,6 +146,8 @@ struct VirtIOPCIProxy {
 bool disable_modern;
 bool ignore_backend_features;
 OnOffAuto disable_legacy;
+/* Transitional device id */
+uint16_t trans_devid;
 uint32_t class_code;
 uint32_t nvectors;
 uint32_t dfselect;
@@ -179,6 +181,9 @@ static inline void virtio_pci_disable_modern(VirtIOPCIProxy 
*proxy)
 proxy->disable_modern = true;
 }
 
+uint16_t virtio_pci_get_trans_devid(uint16_t device_id);
+uint16_t virtio_pci_get_class_id(uint16_t device_id);
+
 /*
  * virtio-input-pci: This extends VirtioPCIProxy.
  */
-- 
2.23.0




[PATCH v7 resend 0/4] add generic vDPA device support

2022-11-05 Thread Longpeng(Mike)
From: Longpeng 

Hi guys,

With the generic vDPA device, QEMU won't need to touch the device
types any more, much like vfio.

We can use the generic vDPA device as follows:
  -device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-X
  Or
  -M microvm -m 512m -smp 2 -kernel ... -initrd ... -device \
  vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-x

Changes v6 -> v7:
(v6: https://mail.gnu.org/archive/html/qemu-devel/2022-05/msg02821.html)
- rebase. [Jason]
- add documentation. [Stefan]

Changes v5 -> v6:
  Patch 2:
- Turn to the original approach in the RFC to initialize the
  virtio_pci_id_info array. [Michael]
  https://lore.kernel.org/all/20220105005900.860-2-longpe...@huawei.com/
  Patch 3:
- Fix logical error of exception handler around the post_init.
  [Stefano]
- Fix some coding style warnings. [Stefano]
  Patch 4:
- Fix some coding style warnings. [Stefano]

Changes v4 -> v5:
  Patch 3:
- remove vhostfd [Jason]
- support virtio-mmio [Jason]

Changes v3 -> v4:
  v3: https://www.mail-archive.com/qemu-devel@nongnu.org/msg877015.html
  - reorganize the series [Stefano]
  - fix some typos [Stefano]
  - fix logical error in vhost_vdpa_device_realize [Stefano]

Changes v2 -> v3
  Patch 4 & 5:
- only call vdpa ioctls in vdpa-dev.c [Stefano, Longpeng]
- s/VQS_NUM/VQS_COUNT  [Stefano]
- check both vdpa_dev_fd and vdpa_dev [Stefano]
  Patch 6:
- move all steps into vhost_vdpa_device_unrealize. [Stefano]

Changes RFC -> v2
  Patch 1:
- rename 'pdev_id' to 'trans_devid'  [Michael]
- only use transitional device id for the devices
  listed in the spec  [Michael]
- use macros to make the id_info table clearer  [Longpeng]
- add some modern devices in the id_info table  [Longpeng]
  Patch 2:
- remove the GET_VECTORS_NUM command  [Jason]
  Patch 4:
- expose vdpa_dev_fd as a QOM property  [Stefan]
- introduce vhost_vdpa_device_get_u32 as a common
  function to make the code clearer  [Stefan]
- fix the misleading description of 'dc->desc'  [Stefano]
  Patch 5:
- check returned number of virtqueues  [Stefan]
  Patch 6:
- init s->num_queues  [Stefano]
- free s->dev.vqs  [Stefano]


Longpeng (Mike) (4):
  virtio: get class_id and pci device id by the virtio id
  vdpa: add vdpa-dev support
  vdpa: add vdpa-dev-pci support
  docs: Add generic vhost-vdpa device documentation

 docs/system/devices/vhost-vdpa-device.rst |  43 +++
 hw/virtio/Kconfig |   5 +
 hw/virtio/meson.build |   2 +
 hw/virtio/vdpa-dev-pci.c  | 102 ++
 hw/virtio/vdpa-dev.c  | 377 ++
 hw/virtio/virtio-pci.c|  88 +
 include/hw/virtio/vdpa-dev.h  |  43 +++
 include/hw/virtio/virtio-pci.h|   5 +
 8 files changed, 665 insertions(+)
 create mode 100644 docs/system/devices/vhost-vdpa-device.rst
 create mode 100644 hw/virtio/vdpa-dev-pci.c
 create mode 100644 hw/virtio/vdpa-dev.c
 create mode 100644 include/hw/virtio/vdpa-dev.h

-- 
2.23.0




[PATCH v7 4/4] docs: Add generic vhost-vdpa device documentation

2022-11-05 Thread Longpeng(Mike)
From: Longpeng 

Signed-off-by: Longpeng 
---
 docs/system/devices/vhost-vdpa-device.rst | 43 +++
 1 file changed, 43 insertions(+)
 create mode 100644 docs/system/devices/vhost-vdpa-device.rst

diff --git a/docs/system/devices/vhost-vdpa-device.rst 
b/docs/system/devices/vhost-vdpa-device.rst
new file mode 100644
index 00..50173299e0
--- /dev/null
+++ b/docs/system/devices/vhost-vdpa-device.rst
@@ -0,0 +1,43 @@
+
+=
+generic vhost-vdpa device
+=
+
+This document explains the usage of the generic vhost vdpa device.
+
+Description
+---
+
+vDPA (virtio data path acceleration) device is a device that uses a datapath
+which complies with the virtio specifications with vendor specific control
+path.
+
+QEMU provides two types of vhost-vdpa devices to enable the vDPA device, one
+is type sensitive which means QEMU needs to know the actual device type
+(e.g. net, blk, scsi) and another is called "generic vdpa device" which is
+type insensitive (like vfio-pci).
+
+Examples
+
+
+Prepare the vhost-vdpa backends first:
+
+::
+  host# ls -l /dev/vhost-vdpa-*
+  crw--- 1 root root 236, 0 Nov  2 00:49 /dev/vhost-vdpa-0
+
+Start QEMU with virtio-mmio bus:
+
+::
+  host# qemu-system  \
+  -M microvm -m 512 -smp 2 -kernel ... -initrd ...   \
+  -device vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-0   \
+  ...
+
+Start QEMU with virtio-pci bus:
+
+::
+  host# qemu-system  \
+  -M pc -m 512 -smp 2\
+  -device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-0   \
+  ...
-- 
2.23.0




[PATCH v7 3/4] vdpa: add vdpa-dev-pci support

2022-11-05 Thread Longpeng(Mike)
From: Longpeng 

Supports vdpa-dev-pci, we can use the device as follows:

-device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-X

Reviewed-by: Stefano Garzarella 
Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/meson.build|   1 +
 hw/virtio/vdpa-dev-pci.c | 102 +++
 hw/virtio/vdpa-dev.c |   2 +-
 3 files changed, 104 insertions(+), 1 deletion(-)
 create mode 100644 hw/virtio/vdpa-dev-pci.c

diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index 54d6d29af7..559b80cb28 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -57,6 +57,7 @@ virtio_pci_ss.add(when: 'CONFIG_VIRTIO_SERIAL', if_true: 
files('virtio-serial-pc
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_PMEM', if_true: 
files('virtio-pmem-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: 
files('virtio-iommu-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: 
files('virtio-mem-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: 
files('vdpa-dev-pci.c'))
 
 virtio_ss.add_all(when: 'CONFIG_VIRTIO_PCI', if_true: virtio_pci_ss)
 
diff --git a/hw/virtio/vdpa-dev-pci.c b/hw/virtio/vdpa-dev-pci.c
new file mode 100644
index 00..5446e6b393
--- /dev/null
+++ b/hw/virtio/vdpa-dev-pci.c
@@ -0,0 +1,102 @@
+/*
+ * Vhost Vdpa Device PCI Bindings
+ *
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022. All Rights Reserved.
+ *
+ * Authors:
+ *   Longpeng 
+ *
+ * Largely based on the "vhost-user-blk-pci.c" and "vhost-user-blk.c"
+ * implemented by:
+ *   Changpeng Liu 
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include 
+#include 
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/vdpa-dev.h"
+#include "hw/pci/pci.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qom/object.h"
+
+
+typedef struct VhostVdpaDevicePCI VhostVdpaDevicePCI;
+
+#define TYPE_VHOST_VDPA_DEVICE_PCI "vhost-vdpa-device-pci-base"
+DECLARE_INSTANCE_CHECKER(VhostVdpaDevicePCI, VHOST_VDPA_DEVICE_PCI,
+ TYPE_VHOST_VDPA_DEVICE_PCI)
+
+struct VhostVdpaDevicePCI {
+VirtIOPCIProxy parent_obj;
+VhostVdpaDevice vdev;
+};
+
+static void vhost_vdpa_device_pci_instance_init(Object *obj)
+{
+VhostVdpaDevicePCI *dev = VHOST_VDPA_DEVICE_PCI(obj);
+
+virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+TYPE_VHOST_VDPA_DEVICE);
+object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+  "bootindex");
+}
+
+static Property vhost_vdpa_device_pci_properties[] = {
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static int vhost_vdpa_device_pci_post_init(VhostVdpaDevice *v, Error **errp)
+{
+VhostVdpaDevicePCI *dev = container_of(v, VhostVdpaDevicePCI, vdev);
+VirtIOPCIProxy *vpci_dev = &dev->parent_obj;
+
+vpci_dev->class_code = virtio_pci_get_class_id(v->vdev_id);
+vpci_dev->trans_devid = virtio_pci_get_trans_devid(v->vdev_id);
+/* one for config vector */
+vpci_dev->nvectors = v->num_queues + 1;
+
+return 0;
+}
+
+static void
+vhost_vdpa_device_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+VhostVdpaDevicePCI *dev = VHOST_VDPA_DEVICE_PCI(vpci_dev);
+
+dev->vdev.post_init = vhost_vdpa_device_pci_post_init;
+qdev_realize(DEVICE(&dev->vdev), BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_vdpa_device_pci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+
+set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+device_class_set_props(dc, vhost_vdpa_device_pci_properties);
+k->realize = vhost_vdpa_device_pci_realize;
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_vdpa_device_pci_info = {
+.base_name   = TYPE_VHOST_VDPA_DEVICE_PCI,
+.generic_name= "vhost-vdpa-device-pci",
+.transitional_name   = "vhost-vdpa-device-pci-transitional",
+.non_transitional_name   = "vhost-vdpa-device-pci-non-transitional",
+.instance_size  = sizeof(VhostVdpaDevicePCI),
+.instance_init  = vhost_vdpa_device_pci_instance_init,
+.class_init = vhost_vdpa_device_pci_class_init,
+};
+
+static void vhost_vdpa_device_pci_register(void)
+{
+virtio_pci_types_register(&vhost_vdpa_device_pci_info);
+}
+
+type_init(vhost_vdpa_device_pci_register);
diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
index 1840f0e450..62d83d3423 100644
--- a/hw/virtio/vdpa-dev.c
+++ b/hw/virtio/vdpa-dev.c
@@ -138,7 +138,7 @@ static void vhost_vdpa_device_realize(DeviceState *dev, 
Error **errp)
 goto free_config;
 }
 
-virtio_init(vdev, "vhost-vdpa", v->vdev_id, v->config_size);
+virtio_init(vdev, v->vdev_id, v->config_size);
 
 v->virtqs = g_new0(VirtQueue *, v->dev.nvqs);
 for 

[PATCH v7 1/4] virtio: get class_id and pci device id by the virtio id

2022-11-05 Thread Longpeng(Mike)
From: Longpeng 

Add helpers to get the "Transitional PCI Device ID" and "class_id"
of the device specified by the "Virtio Device ID".

These helpers will be used to build the generic vDPA device later.

Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/virtio-pci.c | 88 ++
 include/hw/virtio/virtio-pci.h |  5 ++
 2 files changed, 93 insertions(+)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 34db51e241..3469b88d43 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -19,6 +19,7 @@
 
 #include "exec/memop.h"
 #include "standard-headers/linux/virtio_pci.h"
+#include "standard-headers/linux/virtio_ids.h"
 #include "hw/boards.h"
 #include "hw/virtio/virtio.h"
 #include "migration/qemu-file-types.h"
@@ -213,6 +214,90 @@ static int virtio_pci_load_queue(DeviceState *d, int n, 
QEMUFile *f)
 return 0;
 }
 
+typedef struct VirtIOPCIIDInfo {
+/* virtio id */
+uint16_t vdev_id;
+/* pci device id for the transitional device */
+uint16_t trans_devid;
+uint16_t class_id;
+} VirtIOPCIIDInfo;
+
+static const VirtIOPCIIDInfo virtio_pci_id_info[] = {
+{
+.vdev_id = VIRTIO_ID_CRYPTO,
+.class_id = PCI_CLASS_OTHERS,
+}, {
+.vdev_id = VIRTIO_ID_FS,
+.class_id = PCI_CLASS_STORAGE_OTHER,
+}, {
+.vdev_id = VIRTIO_ID_NET,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_NET,
+.class_id = PCI_CLASS_NETWORK_ETHERNET,
+}, {
+.vdev_id = VIRTIO_ID_BLOCK,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_BLOCK,
+.class_id = PCI_CLASS_STORAGE_SCSI,
+}, {
+.vdev_id = VIRTIO_ID_CONSOLE,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_CONSOLE,
+.class_id = PCI_CLASS_COMMUNICATION_OTHER,
+}, {
+.vdev_id = VIRTIO_ID_SCSI,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_SCSI,
+.class_id = PCI_CLASS_STORAGE_SCSI
+}, {
+.vdev_id = VIRTIO_ID_9P,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_9P,
+.class_id = PCI_BASE_CLASS_NETWORK,
+}, {
+.vdev_id = VIRTIO_ID_BALLOON,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_BALLOON,
+.class_id = PCI_CLASS_OTHERS,
+}, {
+.vdev_id = VIRTIO_ID_RNG,
+.trans_devid = PCI_DEVICE_ID_VIRTIO_RNG,
+.class_id = PCI_CLASS_OTHERS,
+},
+};
+
+static const VirtIOPCIIDInfo *virtio_pci_get_id_info(uint16_t vdev_id)
+{
+const VirtIOPCIIDInfo *info = NULL;
+int i;
+
+for (i = 0; i < ARRAY_SIZE(virtio_pci_id_info); i++) {
+if (virtio_pci_id_info[i].vdev_id == vdev_id) {
+info = &virtio_pci_id_info[i];
+break;
+}
+}
+
+if (!info) {
+/* The device id is invalid or not added to the id_info yet. */
+error_report("Invalid virtio device(id %u)", vdev_id);
+abort();
+}
+
+return info;
+}
+
+/*
+ * Get the Transitional Device ID for the specific device, return
+ * zero if the device is non-transitional.
+ */
+uint16_t virtio_pci_get_trans_devid(uint16_t device_id)
+{
+return virtio_pci_get_id_info(device_id)->trans_devid;
+}
+
+/*
+ * Get the Class ID for the specific device.
+ */
+uint16_t virtio_pci_get_class_id(uint16_t device_id)
+{
+return virtio_pci_get_id_info(device_id)->class_id;
+}
+
 static bool virtio_pci_ioeventfd_enabled(DeviceState *d)
 {
 VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
@@ -1683,6 +1768,9 @@ static void virtio_pci_device_plugged(DeviceState *d, 
Error **errp)
  * is set to PCI_SUBVENDOR_ID_REDHAT_QUMRANET by default.
  */
 pci_set_word(config + PCI_SUBSYSTEM_ID, virtio_bus_get_vdev_id(bus));
+if (proxy->trans_devid) {
+pci_config_set_device_id(config, proxy->trans_devid);
+}
 } else {
 /* pure virtio-1.0 */
 pci_set_word(config + PCI_VENDOR_ID,
diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h
index 2446dcd9ae..d95b1a13a5 100644
--- a/include/hw/virtio/virtio-pci.h
+++ b/include/hw/virtio/virtio-pci.h
@@ -146,6 +146,8 @@ struct VirtIOPCIProxy {
 bool disable_modern;
 bool ignore_backend_features;
 OnOffAuto disable_legacy;
+/* Transitional device id */
+uint16_t trans_devid;
 uint32_t class_code;
 uint32_t nvectors;
 uint32_t dfselect;
@@ -179,6 +181,9 @@ static inline void virtio_pci_disable_modern(VirtIOPCIProxy 
*proxy)
 proxy->disable_modern = true;
 }
 
+uint16_t virtio_pci_get_trans_devid(uint16_t device_id);
+uint16_t virtio_pci_get_class_id(uint16_t device_id);
+
 /*
  * virtio-input-pci: This extends VirtioPCIProxy.
  */
-- 
2.23.0



