Re: [PATCH] oslib-posix: fix memory leak in touch_all_pages

2024-03-04 Thread Mark Kanda

On 3/4/24 4:48 PM, Paolo Bonzini wrote:

touch_all_pages() can return early, before creating threads.  In this case,
however, it leaks the MemsetContext that it has allocated at the
beginning of the function.

Reported by Coverity as CID 1534922.

Fixes: 04accf43df8 ("oslib-posix: initialize backend memory objects in parallel", 2024-02-06)
Cc: Mark Kanda
Signed-off-by: Paolo Bonzini

Reviewed-by: Mark Kanda 

Thanks/regards,
-Mark

---
  util/oslib-posix.c | 6 ++++--
  1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index 3c379f96c26..e76441695bd 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -467,11 +467,13 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
          * preallocating synchronously.
          */
         if (context->num_threads == 1 && !async) {
+            ret = 0;
             if (qemu_madvise(area, hpagesize * numpages,
                              QEMU_MADV_POPULATE_WRITE)) {
-                return -errno;
+                ret = -errno;
             }
-            return 0;
+            g_free(context);
+            return ret;
         }
         touch_fn = do_madv_populate_write_pages;
     } else {


[PATCH v4 1/1] oslib-posix: initialize backend memory objects in parallel

2024-01-31 Thread Mark Kanda
QEMU initializes preallocated backend memory as the objects are parsed from
the command line. This is not optimal in some cases (e.g. memory spanning
multiple NUMA nodes) because the memory objects are initialized in series.

Allow the initialization to occur in parallel (asynchronously). In order to
ensure optimal thread placement, asynchronous initialization requires prealloc
context threads to be in use.

Signed-off-by: Mark Kanda 
Signed-off-by: David Hildenbrand 
---
 backends/hostmem.c |   7 ++-
 hw/virtio/virtio-mem.c |   4 +-
 include/hw/qdev-core.h |   5 ++
 include/qemu/osdep.h   |  18 +-
 system/vl.c |   9 +++
 util/oslib-posix.c | 131 +++--
 util/oslib-win32.c |   8 ++-
 7 files changed, 145 insertions(+), 37 deletions(-)

diff --git a/backends/hostmem.c b/backends/hostmem.c
index 30f69b2cb5..17221e422a 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -20,6 +20,7 @@
 #include "qom/object_interfaces.h"
 #include "qemu/mmap-alloc.h"
 #include "qemu/madvise.h"
+#include "hw/qdev-core.h"
 
 #ifdef CONFIG_NUMA
 #include <numa.h>
@@ -237,7 +238,7 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,
     uint64_t sz = memory_region_size(&backend->mr);
 
     if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
-                           backend->prealloc_context, errp)) {
+                           backend->prealloc_context, false, errp)) {
         return;
     }
     backend->prealloc = true;
@@ -323,6 +324,7 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
     HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
     void *ptr;
     uint64_t sz;
+    bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);
 
     if (!bc->alloc) {
         return;
@@ -398,7 +400,8 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
         if (backend->prealloc && !qemu_prealloc_mem(memory_region_get_fd(&backend->mr),
                                                     ptr, sz,
                                                     backend->prealloc_threads,
-                                                    backend->prealloc_context, errp)) {
+                                                    backend->prealloc_context,
+                                                    async, errp)) {
             return;
         }
     }
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index 99ab989852..ffd119ebac 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -605,7 +605,7 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
     int fd = memory_region_get_fd(&vmem->memdev->mr);
     Error *local_err = NULL;
 
-    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
+    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
         static bool warned;
 
         /*
@@ -1248,7 +1248,7 @@ static int virtio_mem_prealloc_range_cb(VirtIOMEM *vmem, void *arg,
     int fd = memory_region_get_fd(&vmem->memdev->mr);
     Error *local_err = NULL;
 
-    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
+    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
         error_report_err(local_err);
         return -ENOMEM;
     }
diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
index 151d968238..83dd9e2485 100644
--- a/include/hw/qdev-core.h
+++ b/include/hw/qdev-core.h
@@ -1071,6 +1071,11 @@ typedef enum MachineInitPhase {
      */
     PHASE_ACCEL_CREATED,
 
+    /*
+     * Late backend objects have been created and initialized.
+     */
+    PHASE_LATE_BACKENDS_CREATED,
+
     /*
      * machine_class->init has been called, thus creating any embedded
      * devices and validating machine properties.  Devices created at
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index c9692cc314..7d359dabc4 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -680,6 +680,8 @@ typedef struct ThreadContext ThreadContext;
 * @area: start address of the area to preallocate
  * @sz: the size of the area to preallocate
  * @max_threads: maximum number of threads to use
+ * @tc: prealloc context threads pointer, NULL if not in use
+ * @async: request asynchronous preallocation, requires @tc
  * @errp: returns an error if this function fails
  *
  * Preallocate memory (populate/prefault page tables writable) for the virtual
@@ -687,10 +689,24 @@ typedef struct ThreadContext ThreadContext;
  * each page in the area was faulted in writable at least once, for example,
  * after allocating file blocks for mapped files.
  *
+ * When setting @async, allocation might be performed asynchronously.
+ * qemu_finish_async_prealloc_mem() must be called to finish any asynchronous
+ * preallocation.
+ *
  * Return: true on success, else false setting @errp with error.
  */
 bool qemu_prealloc_mem(int fd,
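
To make the intended calling convention concrete, here is a minimal
caller-side sketch. It assumes the v4 API shown in the hunks above
(qemu_prealloc_mem() with the new @async parameter, paired with
qemu_finish_async_prealloc_mem()); the helper name and error handling are
illustrative, and this is QEMU-internal code, not a standalone program:

/*
 * Sketch: issue preallocation for one region, possibly asynchronously,
 * then drain all outstanding async work in one place (as vl.c does).
 */
static void prealloc_then_finish(int fd, char *area, size_t sz,
                                 int threads, ThreadContext *tc)
{
    Error *local_err = NULL;

    /* With async=true and a thread context, this may return before the
     * pages have actually been faulted in. */
    if (!qemu_prealloc_mem(fd, area, sz, threads, tc, true, &local_err)) {
        error_report_err(local_err);
        exit(1);
    }

    /* Single drain point: blocks until every queued preallocation thread
     * has been joined, reporting any failure through @errp. */
    if (!qemu_finish_async_prealloc_mem(&local_err)) {
        error_report_err(local_err);
        exit(1);
    }
}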

[PATCH v4 0/1] Initialize backend memory objects in parallel

2024-01-31 Thread Mark Kanda
v4:
- remove unneeded async check from host_memory_backend_set_prealloc()
- rename qemu_finish_async_mem_prealloc -> qemu_finish_async_prealloc_mem
- use new phase PHASE_LATE_BACKENDS_CREATED for async

v3:
- squash into a single patch
- use global context list for async handling only (MT capability)
- add BQL asserts to guard against concurrent async prealloc requests
- clean up qemu_finish_async_mem_prealloc() error handling

Includes David's suggested restructuring [1] (with David's SoB).

[1] 
https://lore.kernel.org/qemu-devel/c15161eb-f52c-4a82-8b4b-0ba038421...@redhat.com/

v2:
- require MADV_POPULATE_WRITE (simplify the implementation)
- require prealloc context threads to ensure optimal thread placement
- use machine phase 'initialized' to determine when to allow parallel init

QEMU initializes preallocated backend memory when parsing the corresponding
objects from the command line. In certain scenarios, such as memory being
preallocated across multiple numa nodes, this approach is not optimal due to
the unnecessary serialization.

This series addresses this issue by initializing the backend memory objects in
parallel.

Mark Kanda (1):
  oslib-posix: initialize backend memory objects in parallel

 backends/hostmem.c |   7 ++-
 hw/virtio/virtio-mem.c |   4 +-
 include/hw/qdev-core.h |   5 ++
 include/qemu/osdep.h   |  18 +-
 system/vl.c |   9 +++
 util/oslib-posix.c | 131 +++--
 util/oslib-win32.c |   8 ++-
 7 files changed, 145 insertions(+), 37 deletions(-)

-- 
2.39.3




Re: [PATCH v3 1/1] oslib-posix: initialize backend memory objects in parallel

2024-01-31 Thread Mark Kanda




On 1/31/24 8:57 AM, David Hildenbrand wrote:

On 31.01.24 15:48, Mark Kanda wrote:

On 1/31/24 8:30 AM, David Hildenbrand wrote:


OK. I'll call it 'PHASE_LATE_BACKENDS_CREATED' (to make it consistent
with code comments/function name).


But then, you should set it at the very end of the function (not sure
if that would be a problem with the other devices that are getting
created in between -- if they would be using one of these memory
backends; likely not).



I think I misunderstood your suggestion. I was planning to add a
'phase_advance(PHASE_LATE_BACKENDS_CREATED)' call to qemu_init():

 @@ -3703,6 +3703,7 @@ void qemu_init(int argc, char **argv)
    * over memory-backend-file objects).
    */
   qemu_create_late_backends();
 +    phase_advance(PHASE_LATE_BACKENDS_CREATED);

And use PHASE_LATE_BACKENDS_CREATED (instead of
PHASE_MACHINE_INITIALIZED) for the async bool in
host_memory_backend_memory_complete().

I was planning to leave this call where it is:

 @@ -2009,6 +2009,14 @@ static void qemu_create_late_backends(void)

   object_option_foreach_add(object_create_late);

 +    /*
 +     * Wait for any outstanding memory prealloc from created memory
 +     * backends to complete.
 +     */
 +    if (!qemu_finish_async_mem_prealloc(&error_fatal)) {
 +    exit(1);
 +    }
 +
   if (tpm_init() < 0) {
   exit(1);
   }



Yes. The only "suboptimal" thing is that if someone were to create a
memory backend between qemu_finish_async_mem_prealloc() and
phase_advance(PHASE_LATE_BACKENDS_CREATED), it would never get
preallocated.


That likely won't ever happen via any of the remaining stuff in
qemu_create_late_backends(), especially not with "prealloc=on" and
thread-contexts set.




Yep. OK, I'll go with that.

Thanks again!



Re: [PATCH v3 1/1] oslib-posix: initialize backend memory objects in parallel

2024-01-31 Thread Mark Kanda

On 1/31/24 8:30 AM, David Hildenbrand wrote:


OK. I'll call it 'PHASE_LATE_BACKENDS_CREATED' (to make it consistent
with code comments/function name).


But then, you should set it at the very end of the function (not sure 
if that would be a problem with the other devices that are getting 
created in between -- if they would be using one of these memory 
backends; likely not).




I think I misunderstood your suggestion. I was planning to add a
'phase_advance(PHASE_LATE_BACKENDS_CREATED)' call to qemu_init():


   @@ -3703,6 +3703,7 @@ void qemu_init(int argc, char **argv)
  * over memory-backend-file objects).
  */
 qemu_create_late_backends();
   +    phase_advance(PHASE_LATE_BACKENDS_CREATED);

And use PHASE_LATE_BACKENDS_CREATED (instead of 
PHASE_MACHINE_INITIALIZED) for the async bool in 
host_memory_backend_memory_complete().


I was planning to leave this call where it is:

   @@ -2009,6 +2009,14 @@ static void qemu_create_late_backends(void)

 object_option_foreach_add(object_create_late);

   +    /*
   + * Wait for any outstanding memory prealloc from created memory
   + * backends to complete.
   + */
   +    if (!qemu_finish_async_mem_prealloc(&error_fatal)) {
   +    exit(1);
   +    }
   +
 if (tpm_init() < 0) {
 exit(1);
 }

Is this what you had in mind?

Thanks/regards,
-Mark




Re: [PATCH v3 1/1] oslib-posix: initialize backend memory objects in parallel

2024-01-31 Thread Mark Kanda

On 1/31/24 8:04 AM, David Hildenbrand wrote:

On 31.01.24 14:48, Mark Kanda wrote:

QEMU initializes preallocated backend memory as the objects are parsed from
the command line. This is not optimal in some cases (e.g. memory spanning
multiple NUMA nodes) because the memory objects are initialized in series.

Allow the initialization to occur in parallel (asynchronously). In order to
ensure optimal thread placement, asynchronous initialization requires prealloc
context threads to be in use.

Signed-off-by: Mark Kanda 
Signed-off-by: David Hildenbrand 
---
  backends/hostmem.c |   8 ++-
  hw/virtio/virtio-mem.c |   4 +-
  include/qemu/osdep.h   |  18 +-
  system/vl.c    |   8 +++
  util/oslib-posix.c | 131 +++--
  util/oslib-win32.c |   8 ++-
  6 files changed, 140 insertions(+), 37 deletions(-)

diff --git a/backends/hostmem.c b/backends/hostmem.c
index 30f69b2cb5..8f602dc86f 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -20,6 +20,7 @@
  #include "qom/object_interfaces.h"
  #include "qemu/mmap-alloc.h"
  #include "qemu/madvise.h"
+#include "hw/qdev-core.h"
    #ifdef CONFIG_NUMA
  #include <numa.h>
@@ -235,9 +236,10 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,

  int fd = memory_region_get_fd(&backend->mr);
  void *ptr = memory_region_get_ram_ptr(&backend->mr);
  uint64_t sz = memory_region_size(&backend->mr);
+    bool async = !phase_check(PHASE_MACHINE_INITIALIZED);

  if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
-   backend->prealloc_context, errp)) {
+   backend->prealloc_context, async, errp)) {

  return;
  }


I think we will never trigger that case: we would have to set the
property after the device was already initialized, which shouldn't
happen.

So I guess we can simplify and drop that.



Will fix.


  backend->prealloc = true;



[...]


+++ b/include/qemu/osdep.h
@@ -680,6 +680,8 @@ typedef struct ThreadContext ThreadContext;
   * @area: start address of the area to preallocate
   * @sz: the size of the area to preallocate
   * @max_threads: maximum number of threads to use
+ * @tc: prealloc context threads pointer, NULL if not in use
+ * @async: request asynchronous preallocation, requires @tc
   * @errp: returns an error if this function fails
   *
   * Preallocate memory (populate/prefault page tables writable) for 
the virtual

@@ -687,10 +689,24 @@ typedef struct ThreadContext ThreadContext;
   * each page in the area was faulted in writable at least once, for 
example,

   * after allocating file blocks for mapped files.
   *
+ * When setting @async, allocation might be performed asynchronously.
+ * qemu_finish_async_mem_prealloc() must be called to finish any asynchronous
+ * preallocation.
+ *
   * Return: true on success, else false setting @errp with error.
   */
  bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
-                        ThreadContext *tc, Error **errp);
+                        ThreadContext *tc, bool async, Error **errp);
+
+/**
+ * qemu_finish_async_mem_prealloc:
+ * @errp: returns an error if this function fails
+ *
+ * Finish all outstanding asynchronous memory preallocation.
+ *
+ * Return: true on success, else false setting @errp with error.
+ */
+bool qemu_finish_async_mem_prealloc(Error **errp);


Suboptimal suggestion from my side; guess it would be better to call this
"qemu_finish_async_prealloc_mem" to match "qemu_prealloc_mem".



Will fix.


    /**
   * qemu_get_pid_name:
diff --git a/system/vl.c b/system/vl.c
index 788d88ea03..290bb3232b 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -2009,6 +2009,14 @@ static void qemu_create_late_backends(void)
    object_option_foreach_add(object_create_late);
  +    /*
+ * Wait for any outstanding memory prealloc from created memory
+ * backends to complete.
+ */
+    if (!qemu_finish_async_mem_prealloc(&error_fatal)) {
+    exit(1);
+    }
+


I'm wondering if we should have a new phase instead, like
PHASE_LATE_OBJECTS_CREATED, and do here

phase_advance(PHASE_LATE_OBJECTS_CREATED);

and use that instead. Currently, there is a "gap" between both things.
I don't think anything is actually broken right now (because any
internal memory backend wouldn't have a thread context), but it might
be much cleaner and obvious that way.




OK. I'll call it 'PHASE_LATE_BACKENDS_CREATED' (to make it consistent 
with code comments/function name).



Apart from that LGTM!



Thanks/regards,
-Mark
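
For readers following the phase discussion above, a toy standalone model of
the gating pattern. The names mirror QEMU's phase_check()/phase_advance()
from hw/qdev-core.h, but this sketch is deliberately simplified and is not
QEMU's implementation:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

typedef enum {
    PHASE_NO_MACHINE,
    PHASE_ACCEL_CREATED,
    PHASE_LATE_BACKENDS_CREATED,
    PHASE_MACHINE_INITIALIZED,
} MachineInitPhase;

static MachineInitPhase current_phase;

static bool phase_check(MachineInitPhase phase)
{
    return current_phase >= phase;
}

static void phase_advance(MachineInitPhase phase)
{
    assert(phase == current_phase + 1);  /* phases only move forward */
    current_phase = phase;
}

int main(void)
{
    /* While the command line is being parsed, late backends are not yet
     * created, so preallocation may be queued asynchronously. */
    bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);
    printf("async prealloc allowed: %d\n", async);   /* prints 1 */

    phase_advance(PHASE_ACCEL_CREATED);
    phase_advance(PHASE_LATE_BACKENDS_CREATED);

    /* A backend created after this point preallocates synchronously. */
    async = !phase_check(PHASE_LATE_BACKENDS_CREATED);
    printf("async prealloc allowed: %d\n", async);   /* prints 0 */
    return 0;
}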




[PATCH v3 1/1] oslib-posix: initialize backend memory objects in parallel

2024-01-31 Thread Mark Kanda
QEMU initializes preallocated backend memory as the objects are parsed from
the command line. This is not optimal in some cases (e.g. memory spanning
multiple NUMA nodes) because the memory objects are initialized in series.

Allow the initialization to occur in parallel (asynchronously). In order to
ensure optimal thread placement, asynchronous initialization requires prealloc
context threads to be in use.

Signed-off-by: Mark Kanda 
Signed-off-by: David Hildenbrand 
---
 backends/hostmem.c |   8 ++-
 hw/virtio/virtio-mem.c |   4 +-
 include/qemu/osdep.h   |  18 +-
 system/vl.c|   8 +++
 util/oslib-posix.c | 131 +++--
 util/oslib-win32.c |   8 ++-
 6 files changed, 140 insertions(+), 37 deletions(-)

diff --git a/backends/hostmem.c b/backends/hostmem.c
index 30f69b2cb5..8f602dc86f 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -20,6 +20,7 @@
 #include "qom/object_interfaces.h"
 #include "qemu/mmap-alloc.h"
 #include "qemu/madvise.h"
+#include "hw/qdev-core.h"
 
 #ifdef CONFIG_NUMA
 #include <numa.h>
@@ -235,9 +236,10 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,
     int fd = memory_region_get_fd(&backend->mr);
     void *ptr = memory_region_get_ram_ptr(&backend->mr);
     uint64_t sz = memory_region_size(&backend->mr);
+    bool async = !phase_check(PHASE_MACHINE_INITIALIZED);
 
     if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
-                           backend->prealloc_context, errp)) {
+                           backend->prealloc_context, async, errp)) {
         return;
     }
     backend->prealloc = true;
@@ -323,6 +325,7 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
     HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
     void *ptr;
     uint64_t sz;
+    bool async = !phase_check(PHASE_MACHINE_INITIALIZED);
 
     if (!bc->alloc) {
         return;
@@ -398,7 +401,8 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
         if (backend->prealloc && !qemu_prealloc_mem(memory_region_get_fd(&backend->mr),
                                                     ptr, sz,
                                                     backend->prealloc_threads,
-                                                    backend->prealloc_context, errp)) {
+                                                    backend->prealloc_context,
+                                                    async, errp)) {
             return;
         }
     }
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index 99ab989852..ffd119ebac 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -605,7 +605,7 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
     int fd = memory_region_get_fd(&vmem->memdev->mr);
     Error *local_err = NULL;
 
-    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
+    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
         static bool warned;
 
         /*
@@ -1248,7 +1248,7 @@ static int virtio_mem_prealloc_range_cb(VirtIOMEM *vmem, void *arg,
     int fd = memory_region_get_fd(&vmem->memdev->mr);
     Error *local_err = NULL;
 
-    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
+    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
         error_report_err(local_err);
         return -ENOMEM;
     }
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index c9692cc314..f45954b512 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -680,6 +680,8 @@ typedef struct ThreadContext ThreadContext;
  * @area: start address of the area to preallocate
  * @sz: the size of the area to preallocate
  * @max_threads: maximum number of threads to use
+ * @tc: prealloc context threads pointer, NULL if not in use
+ * @async: request asynchronous preallocation, requires @tc
  * @errp: returns an error if this function fails
  *
  * Preallocate memory (populate/prefault page tables writable) for the virtual
@@ -687,10 +689,24 @@ typedef struct ThreadContext ThreadContext;
  * each page in the area was faulted in writable at least once, for example,
  * after allocating file blocks for mapped files.
  *
+ * When setting @async, allocation might be performed asynchronously.
+ * qemu_finish_async_mem_prealloc() must be called to finish any asynchronous
+ * preallocation.
+ *
  * Return: true on success, else false setting @errp with error.
  */
 bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
-                       ThreadContext *tc, Error **errp);
+                       ThreadContext *tc, bool async, Error **errp);
+
+/**
+ * qemu_finish_async_mem_prealloc:
+ * @errp: returns an error if this function fails
+ *
+ * Finish all outstanding asynchronous memory preallocation.
+ *
+ * Return: true on success, else false setting @errp with

[PATCH v3 0/1] Initialize backend memory objects in parallel

2024-01-31 Thread Mark Kanda
v3:
- squash into a single patch
- use global context list for async handling only (MT capability)
- add BQL asserts to guard against concurrent async prealloc requests
- clean up qemu_finish_async_mem_prealloc() error handling

Includes David's suggested restructuring [1] (with David's SoB).

[1] 
https://lore.kernel.org/qemu-devel/c15161eb-f52c-4a82-8b4b-0ba038421...@redhat.com/

v2:
- require MADV_POPULATE_WRITE (simplify the implementation)
- require prealloc context threads to ensure optimal thread placement
- use machine phase 'initialized' to determine when to allow parallel init

QEMU initializes preallocated backend memory when parsing the corresponding
objects from the command line. In certain scenarios, such as memory being
preallocated across multiple numa nodes, this approach is not optimal due to
the unnecessary serialization.

This series addresses this issue by initializing the backend memory objects in
parallel.

Mark Kanda (1):
  oslib-posix: initialize backend memory objects in parallel

 backends/hostmem.c |   8 ++-
 hw/virtio/virtio-mem.c |   4 +-
 include/qemu/osdep.h   |  18 +-
 system/vl.c |   8 +++
 util/oslib-posix.c | 131 +++--
 util/oslib-win32.c |   8 ++-
 6 files changed, 140 insertions(+), 37 deletions(-)

-- 
2.39.3




Re: [PATCH v2 0/2] Initialize backend memory objects in parallel

2024-01-29 Thread Mark Kanda

On 1/29/24 1:11 PM, David Hildenbrand wrote:

On 22.01.24 16:32, Mark Kanda wrote:

v2:
- require MADV_POPULATE_WRITE (simplify the implementation)
- require prealloc context threads to ensure optimal thread placement
- use machine phase 'initialized' to determine when to allow parallel init


QEMU initializes preallocated backend memory when parsing the corresponding
objects from the command line. In certain scenarios, such as memory being
preallocated across multiple numa nodes, this approach is not optimal due to
the unnecessary serialization.

This series addresses this issue by initializing the backend memory objects in
parallel.


I just played with the code, some comments:

* I suggest squashing both patches. It doesn't make things clearer if we
  factor out unconditionally adding contexts to a global list.

* Keep the functions MT-capable, at least as long as async=false. That
  is, don't involve the global list if async=false. virtio-mem will
  perform preallocation from other threads at some point, where we could
  see concurrent preallocations for different devices. I made sure that
  qemu_mem_prealloc() can handle that.

* Rename wait_mem_prealloc() to qemu_finish_async_mem_prealloc() and let
  it report the error / return true/false like qemu_prealloc_mem().
  Especially, don't change the existing
  "qemu_prealloc_mem: preallocating memory failed" error message.

* Do the conditional async=false fixup in touch_all_pages(). That means,
  in qemu_prealloc_mem(), only route the async parameter through.


One thing I don't quite like is what happens when multiple threads would
try issuing "async=true". It will currently not happen, but we should catch
whenever that happens and require that only one thread at a time can
perform async preallocs. Maybe we can assert in qemu_prealloc_mem()/
qemu_finish_async_mem_prealloc() that we hold the BQL. Hopefully, that
is the case when we start creating memory backends, before starting the
main loop. If not, maybe we should just document that async limitation.

Ideally, we'd have some async_start(), prealloc(), prealloc(),
async_finish() interface, where async_start() would block until
another thread called async_finish(), so we never have a mixture.
But that would currently be over-engineering.
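
To make the queue-then-drain idea concrete, a small standalone sketch of the
pattern the series converges on: each request appends a context to a global
list, and one later call joins everything. Plain pthreads with illustrative
names, not the QEMU code; like the design discussed above, it assumes a
single caller thread issues the async requests:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct work {
    pthread_t thread;
    char *area;
    size_t size;
    struct work *next;
};

static struct work *pending;   /* global list, like memset_contexts */

static void *touch_pages(void *opaque)
{
    struct work *w = opaque;

    /* Fault in every page by writing one byte to it (a stand-in for
     * MADV_POPULATE_WRITE / do_touch_pages). */
    for (size_t off = 0; off < w->size; off += 4096) {
        w->area[off] = 0;
    }
    return NULL;
}

/* Queue preallocation of one region; returns without waiting. */
static void prealloc_async(char *area, size_t size)
{
    struct work *w = calloc(1, sizeof(*w));

    w->area = area;
    w->size = size;
    w->next = pending;
    pending = w;
    pthread_create(&w->thread, NULL, touch_pages, w);
}

/* Drain the global list: join every queued thread, free the nodes. */
static void finish_async_prealloc(void)
{
    while (pending) {
        struct work *w = pending;

        pending = w->next;
        pthread_join(w->thread, NULL);
        free(w);
    }
}

int main(void)
{
    enum { SZ = 64 * 4096 };
    char *a = malloc(SZ), *b = malloc(SZ);

    prealloc_async(a, SZ);       /* both regions fault in, in parallel */
    prealloc_async(b, SZ);
    finish_async_prealloc();     /* single drain point, as in vl.c */

    printf("done\n");
    free(a);
    free(b);
    return 0;
}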


I'll attach the untested, likely broken, code I played with to see
what it could look like. Observe how I only conditionally add the
context to the list at the end of touch_all_pages().



Thank you very much for the feedback David. I'll take a close look at 
this for v3.


Best regards,
-Mark



From fe26cc5252f1284efa8e667310609a22c6166324 Mon Sep 17 00:00:00 2001
From: Mark Kanda 
Date: Mon, 22 Jan 2024 09:32:18 -0600
Subject: [PATCH] oslib-posix: initialize selected backend memory objects in
 parallel

QEMU initializes preallocated backend memory as the objects are parsed from
the command line. This is not optimal in some cases (e.g. memory spanning
multiple NUMA nodes) because the memory objects are initialized in series.

Allow the initialization to occur in parallel. In order to ensure optimal
thread placement, parallel initialization requires prealloc context threads
to be in use.
Signed-off-by: Mark Kanda 
Signed-off-by: David Hildenbrand 
---
 backends/hostmem.c |   8 ++-
 hw/virtio/virtio-mem.c |   4 +-
 include/qemu/osdep.h   |  19 +-
 system/vl.c    |   8 +++
 util/oslib-posix.c | 130 +++--
 util/oslib-win32.c |   8 ++-
 6 files changed, 140 insertions(+), 37 deletions(-)

diff --git a/backends/hostmem.c b/backends/hostmem.c
index 30f69b2cb5..8f602dc86f 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -20,6 +20,7 @@
 #include "qom/object_interfaces.h"
 #include "qemu/mmap-alloc.h"
 #include "qemu/madvise.h"
+#include "hw/qdev-core.h"

 #ifdef CONFIG_NUMA
 #include <numa.h>
@@ -235,9 +236,10 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,

 int fd = memory_region_get_fd(&backend->mr);
 void *ptr = memory_region_get_ram_ptr(&backend->mr);
 uint64_t sz = memory_region_size(&backend->mr);
+    bool async = !phase_check(PHASE_MACHINE_INITIALIZED);

 if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
-   backend->prealloc_context, errp)) {
+   backend->prealloc_context, async, errp)) {

 return;
 }
 backend->prealloc = true;
@@ -323,6 +325,7 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)

 HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
 void *ptr;
 uint64_t sz;
+    bool async = !phase_check(PHASE_MACHINE_INITIALIZED);

 if (!bc->alloc) {
 return;
@@ -398,7 +401,8 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
 if (backend->prealloc && !qemu_prealloc_me

Re: [PATCH v2 0/2] Initialize backend memory objects in parallel

2024-01-29 Thread Mark Kanda

Ping.

Any comments?

Thanks/regards,
-Mark

On 1/22/24 9:32 AM, Mark Kanda wrote:

v2:
- require MADV_POPULATE_WRITE (simplify the implementation)
- require prealloc context threads to ensure optimal thread placement
- use machine phase 'initialized' to determine when to allow parallel init

QEMU initializes preallocated backend memory when parsing the corresponding
objects from the command line. In certain scenarios, such as memory being
preallocated across multiple numa nodes, this approach is not optimal due to
the unnecessary serialization.

This series addresses this issue by initializing the backend memory objects in
parallel.

Mark Kanda (2):
   oslib-posix: refactor memory prealloc threads
   oslib-posix: initialize backend memory objects in parallel

  backends/hostmem.c |   8 +++-
  hw/virtio/virtio-mem.c |   4 +-
  include/qemu/osdep.h   |  14 +-
  system/vl.c |   6 +++
  util/oslib-posix.c | 103 -
  util/oslib-win32.c |   8 +++-
  6 files changed, 103 insertions(+), 40 deletions(-)






[PATCH v2 2/2] oslib-posix: initialize backend memory objects in parallel

2024-01-22 Thread Mark Kanda
QEMU initializes preallocated backend memory as the objects are parsed from
the command line. This is not optimal in some cases (e.g. memory spanning
multiple NUMA nodes) because the memory objects are initialized in series.

Allow the initialization to occur in parallel. In order to ensure optimal
thread placement, parallel initialization requires prealloc context threads
to be in use.

Signed-off-by: Mark Kanda 
---
 backends/hostmem.c |  8 ++--
 hw/virtio/virtio-mem.c |  4 ++--
 include/qemu/osdep.h   | 14 --
 system/vl.c |   6 ++
 util/oslib-posix.c | 27 +--
 util/oslib-win32.c |  8 +++-
 6 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/backends/hostmem.c b/backends/hostmem.c
index 30f69b2cb5..8f602dc86f 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -20,6 +20,7 @@
 #include "qom/object_interfaces.h"
 #include "qemu/mmap-alloc.h"
 #include "qemu/madvise.h"
+#include "hw/qdev-core.h"
 
 #ifdef CONFIG_NUMA
 #include <numa.h>
@@ -235,9 +236,10 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,
     int fd = memory_region_get_fd(&backend->mr);
     void *ptr = memory_region_get_ram_ptr(&backend->mr);
     uint64_t sz = memory_region_size(&backend->mr);
+    bool async = !phase_check(PHASE_MACHINE_INITIALIZED);
 
     if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
-                           backend->prealloc_context, errp)) {
+                           backend->prealloc_context, async, errp)) {
         return;
     }
     backend->prealloc = true;
@@ -323,6 +325,7 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
     HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
     void *ptr;
     uint64_t sz;
+    bool async = !phase_check(PHASE_MACHINE_INITIALIZED);
 
     if (!bc->alloc) {
         return;
@@ -398,7 +401,8 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
         if (backend->prealloc && !qemu_prealloc_mem(memory_region_get_fd(&backend->mr),
                                                     ptr, sz,
                                                     backend->prealloc_threads,
-                                                    backend->prealloc_context, errp)) {
+                                                    backend->prealloc_context,
+                                                    async, errp)) {
             return;
         }
     }
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index 99ab989852..ffd119ebac 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -605,7 +605,7 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
     int fd = memory_region_get_fd(&vmem->memdev->mr);
     Error *local_err = NULL;
 
-    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
+    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
         static bool warned;
 
         /*
@@ -1248,7 +1248,7 @@ static int virtio_mem_prealloc_range_cb(VirtIOMEM *vmem, void *arg,
     int fd = memory_region_get_fd(&vmem->memdev->mr);
     Error *local_err = NULL;
 
-    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
+    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
         error_report_err(local_err);
         return -ENOMEM;
     }
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 9a405bed89..d6e074c515 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -672,17 +672,27 @@ typedef struct ThreadContext ThreadContext;
  * @area: start address of the area to preallocate
  * @sz: the size of the area to preallocate
  * @max_threads: maximum number of threads to use
+ * @tc: prealloc context threads pointer, NULL if not in use
+ * @async: request asynchronous preallocation, requires @tc
  * @errp: returns an error if this function fails
  *
  * Preallocate memory (populate/prefault page tables writable) for the virtual
  * memory area starting at @area with the size of @sz. After a successful call,
  * each page in the area was faulted in writable at least once, for example,
- * after allocating file blocks for mapped files.
+ * after allocating file blocks for mapped files. When using @async,
+ * wait_mem_prealloc() is required to wait for the preallocation threads to
+ * terminate and for the associated cleanup.
  *
  * Return: true on success, else false setting @errp with error.
  */
 bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
-                       ThreadContext *tc, Error **errp);
+                       ThreadContext *tc, bool async, Error **errp);
+
+/**
+ * Wait for any outstanding memory prealloc initialization
+ * to complete.
+ */
+int wait_mem_prealloc(void);
 
 /**
  * qemu_get_pid_name:
diff --git a/system/vl.c b/system/vl.c
index 53850a1daf..5696c53ace 100644
--

[PATCH v2 0/2] Initialize backend memory objects in parallel

2024-01-22 Thread Mark Kanda
v2:
- require MADV_POPULATE_WRITE (simplify the implementation)
- require prealloc context threads to ensure optimal thread placement
- use machine phase 'initialized' to determine when to allow parallel init

QEMU initializes preallocated backend memory when parsing the corresponding
objects from the command line. In certain scenarios, such as memory being
preallocated across multiple numa nodes, this approach is not optimal due to
the unnecessary serialization.

This series addresses this issue by initializing the backend memory objects in
parallel.

Mark Kanda (2):
  oslib-posix: refactor memory prealloc threads
  oslib-posix: initialize backend memory objects in parallel

 backends/hostmem.c |   8 +++-
 hw/virtio/virtio-mem.c |   4 +-
 include/qemu/osdep.h   |  14 +-
 system/vl.c |   6 +++
 util/oslib-posix.c | 103 -
 util/oslib-win32.c |   8 +++-
 6 files changed, 103 insertions(+), 40 deletions(-)

-- 
2.39.3




[PATCH v2 1/2] oslib-posix: refactor memory prealloc threads

2024-01-22 Thread Mark Kanda
Refactor the memory prealloc threads support:
- Make memset context a global qlist
- Move the memset thread join/cleanup code to a separate routine

This is functionally equivalent and facilitates multiple memset contexts
(used in a subsequent patch).

Signed-off-by: Mark Kanda 
---
 util/oslib-posix.c | 90 ++
 1 file changed, 60 insertions(+), 30 deletions(-)

diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index 7c297003b9..26bf2f2883 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -63,11 +63,15 @@
 
 struct MemsetThread;
 
+static QLIST_HEAD(, MemsetContext) memset_contexts =
+QLIST_HEAD_INITIALIZER(memset_contexts);
+
 typedef struct MemsetContext {
 bool all_threads_created;
 bool any_thread_failed;
 struct MemsetThread *threads;
 int num_threads;
+QLIST_ENTRY(MemsetContext) next;
 } MemsetContext;
 
 struct MemsetThread {
@@ -417,14 +421,15 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
bool use_madv_populate_write)
 {
 static gsize initialized = 0;
-MemsetContext context = {
-.num_threads = get_memset_num_threads(hpagesize, numpages, max_threads),
-};
+MemsetContext *context = g_malloc0(sizeof(MemsetContext));
 size_t numpages_per_thread, leftover;
 void *(*touch_fn)(void *);
-int ret = 0, i = 0;
+int i = 0;
 char *addr = area;
 
+context->num_threads =
+get_memset_num_threads(hpagesize, numpages, max_threads);
+
 if (g_once_init_enter(&initialized)) {
 qemu_mutex_init(&page_mutex);
 qemu_cond_init(&page_cond);
@@ -433,7 +438,7 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
 
 if (use_madv_populate_write) {
 /* Avoid creating a single thread for MADV_POPULATE_WRITE */
-if (context.num_threads == 1) {
+if (context->num_threads == 1) {
 if (qemu_madvise(area, hpagesize * numpages,
  QEMU_MADV_POPULATE_WRITE)) {
 return -errno;
@@ -445,49 +450,65 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
 touch_fn = do_touch_pages;
 }
 
-context.threads = g_new0(MemsetThread, context.num_threads);
-numpages_per_thread = numpages / context.num_threads;
-leftover = numpages % context.num_threads;
-for (i = 0; i < context.num_threads; i++) {
-context.threads[i].addr = addr;
-context.threads[i].numpages = numpages_per_thread + (i < leftover);
-context.threads[i].hpagesize = hpagesize;
-context.threads[i].context = &context;
+context->threads = g_new0(MemsetThread, context->num_threads);
+numpages_per_thread = numpages / context->num_threads;
+leftover = numpages % context->num_threads;
+for (i = 0; i < context->num_threads; i++) {
+context->threads[i].addr = addr;
+context->threads[i].numpages = numpages_per_thread + (i < leftover);
+context->threads[i].hpagesize = hpagesize;
+context->threads[i].context = context;
 if (tc) {
-thread_context_create_thread(tc, &context.threads[i].pgthread,
+thread_context_create_thread(tc, &context->threads[i].pgthread,
  "touch_pages",
- touch_fn, &context.threads[i],
+ touch_fn, &context->threads[i],
  QEMU_THREAD_JOINABLE);
 } else {
-qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
-   touch_fn, &context.threads[i],
+qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
+   touch_fn, &context->threads[i],
QEMU_THREAD_JOINABLE);
 }
-addr += context.threads[i].numpages * hpagesize;
+addr += context->threads[i].numpages * hpagesize;
 }
 
 if (!use_madv_populate_write) {
-sigbus_memset_context = &context;
+sigbus_memset_context = context;
+}
+
+QLIST_INSERT_HEAD(&memset_contexts, context, next);
+
+return 0;
+}
+
+static int wait_mem_prealloc(void)
+{
+int i, ret = 0;
+MemsetContext *context, *next_context;
+
+if (QLIST_EMPTY(&memset_contexts)) {
+return ret;
 }
 
 qemu_mutex_lock(&page_mutex);
-context.all_threads_created = true;
+QLIST_FOREACH(context, &memset_contexts, next) {
+context->all_threads_created = true;
+}
 qemu_cond_broadcast(&page_cond);
 qemu_mutex_unlock(&page_mutex);
 
-for (i = 0; i < context.num_threads; i++) {
-int tmp = (uintptr_t)qemu_thread_join(&context.threads[i].pgthread);
+QLIST_FOREACH_SAFE(context, &memset_contexts, next, next_context) {
+for (i = 0; i < context->num_threads; i++) {
+int tmp =
+(uintptr_t)qemu_thread_join(&context->threads[i].pgthread);
 
-if (tmp) {
-ret =

Re: [PATCH v1 1/2] oslib-posix: refactor memory prealloc threads

2024-01-17 Thread Mark Kanda

On 1/9/24 8:25 AM, David Hildenbrand wrote:

On 09.01.24 15:15, Daniel P. Berrangé wrote:

On Tue, Jan 09, 2024 at 03:02:00PM +0100, David Hildenbrand wrote:

On 08.01.24 19:40, Mark Kanda wrote:

On 1/8/24 9:40 AM, David Hildenbrand wrote:

On 08.01.24 16:10, Mark Kanda wrote:

Refactor the memory prealloc threads support:
- Make memset context a global qlist
- Move the memset thread join/cleanup code to a separate routine

This is functionally equivalent and facilitates multiple memset contexts
(used in a subsequent patch).

Signed-off-by: Mark Kanda 
---
    util/oslib-posix.c | 104 +
    1 file changed, 68 insertions(+), 36 deletions(-)

diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index e86fd64e09..293297ac6c 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -63,11 +63,15 @@
      struct MemsetThread;
    +static QLIST_HEAD(, MemsetContext) memset_contexts =
+    QLIST_HEAD_INITIALIZER(memset_contexts);
+
    typedef struct MemsetContext {
    bool all_threads_created;
    bool any_thread_failed;
    struct MemsetThread *threads;
    int num_threads;
+    QLIST_ENTRY(MemsetContext) next;
    } MemsetContext;
      struct MemsetThread {
@@ -81,7 +85,7 @@ struct MemsetThread {
    typedef struct MemsetThread MemsetThread;
      /* used by sigbus_handler() */
-static MemsetContext *sigbus_memset_context;
+static bool sigbus_memset_context;
    struct sigaction sigbus_oldact;
    static QemuMutex sigbus_mutex;
    @@ -295,13 +299,16 @@ static void sigbus_handler(int signal)
    #endif /* CONFIG_LINUX */
    {
    int i;
+    MemsetContext *context;
      if (sigbus_memset_context) {
-    for (i = 0; i < sigbus_memset_context->num_threads; i++) {
-    MemsetThread *thread = &sigbus_memset_context->threads[i];
+    QLIST_FOREACH(context, &memset_contexts, next) {
+    for (i = 0; i < context->num_threads; i++) {
+    MemsetThread *thread = &context->threads[i];
    -    if (qemu_thread_is_self(&thread->pgthread)) {
-    siglongjmp(thread->env, 1);
+    if (qemu_thread_is_self(&thread->pgthread)) {
+    siglongjmp(thread->env, 1);
+    }
    }
    }
    }
@@ -417,14 +424,15 @@ static int touch_all_pages(char *area, size_t
hpagesize, size_t numpages,
   bool use_madv_populate_write)
    {
    static gsize initialized = 0;
-    MemsetContext context = {
-    .num_threads = get_memset_num_threads(hpagesize, numpages, max_threads),
-    };
+    MemsetContext *context = g_malloc0(sizeof(MemsetContext));
    size_t numpages_per_thread, leftover;
    void *(*touch_fn)(void *);
-    int ret = 0, i = 0;
+    int i = 0;
    char *addr = area;
    +    context->num_threads =
+    get_memset_num_threads(hpagesize, numpages, max_threads);
+
    if (g_once_init_enter(&initialized)) {
    qemu_mutex_init(&page_mutex);
    qemu_cond_init(&page_cond);
@@ -433,7 +441,7 @@ static int touch_all_pages(char *area, size_t
hpagesize, size_t numpages,
      if (use_madv_populate_write) {
    /* Avoid creating a single thread for 
MADV_POPULATE_WRITE */

-    if (context.num_threads == 1) {
+    if (context->num_threads == 1) {
    if (qemu_madvise(area, hpagesize * numpages,
QEMU_MADV_POPULATE_WRITE)) {
    return -errno;
@@ -445,49 +453,74 @@ static int touch_all_pages(char *area, size_t
hpagesize, size_t numpages,
    touch_fn = do_touch_pages;
    }
    -    context.threads = g_new0(MemsetThread, 
context.num_threads);

-    numpages_per_thread = numpages / context.num_threads;
-    leftover = numpages % context.num_threads;
-    for (i = 0; i < context.num_threads; i++) {
-    context.threads[i].addr = addr;
-    context.threads[i].numpages = numpages_per_thread + (i <
leftover);
-    context.threads[i].hpagesize = hpagesize;
-    context.threads[i].context = &context;
+    context->threads = g_new0(MemsetThread, context->num_threads);
+    numpages_per_thread = numpages / context->num_threads;
+    leftover = numpages % context->num_threads;
+    for (i = 0; i < context->num_threads; i++) {
+    context->threads[i].addr = addr;
+    context->threads[i].numpages = numpages_per_thread + (i <
leftover);
+    context->threads[i].hpagesize = hpagesize;
+    context->threads[i].context = context;
    if (tc) {
-    thread_context_create_thread(tc, &context.threads[i].pgthread,
+    thread_context_create_thread(tc, &context->threads[i].pgthread,
"touch_pages",
- touch_fn, &context.threads[i],
+ touch_fn, &context->threads[i],
QEMU_THREAD_JOINABLE);
    } else {
- qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
-   touch_fn, &context.threads[i],
+ qemu_th

Re: [PATCH v1 1/2] oslib-posix: refactor memory prealloc threads

2024-01-09 Thread Mark Kanda




On 1/9/24 8:25 AM, David Hildenbrand wrote:

On 09.01.24 15:15, Daniel P. Berrangé wrote:

On Tue, Jan 09, 2024 at 03:02:00PM +0100, David Hildenbrand wrote:

On 08.01.24 19:40, Mark Kanda wrote:

On 1/8/24 9:40 AM, David Hildenbrand wrote:

On 08.01.24 16:10, Mark Kanda wrote:

Refactor the memory prealloc threads support:
- Make memset context a global qlist
- Move the memset thread join/cleanup code to a separate routine

This is functionally equivalent and facilitates multiple memset contexts
(used in a subsequent patch).

Signed-off-by: Mark Kanda 
---
    util/oslib-posix.c | 104 +
    1 file changed, 68 insertions(+), 36 deletions(-)

diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index e86fd64e09..293297ac6c 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -63,11 +63,15 @@
      struct MemsetThread;
    +static QLIST_HEAD(, MemsetContext) memset_contexts =
+    QLIST_HEAD_INITIALIZER(memset_contexts);
+
    typedef struct MemsetContext {
    bool all_threads_created;
    bool any_thread_failed;
    struct MemsetThread *threads;
    int num_threads;
+    QLIST_ENTRY(MemsetContext) next;
    } MemsetContext;
      struct MemsetThread {
@@ -81,7 +85,7 @@ struct MemsetThread {
    typedef struct MemsetThread MemsetThread;
      /* used by sigbus_handler() */
-static MemsetContext *sigbus_memset_context;
+static bool sigbus_memset_context;
    struct sigaction sigbus_oldact;
    static QemuMutex sigbus_mutex;
    @@ -295,13 +299,16 @@ static void sigbus_handler(int signal)
    #endif /* CONFIG_LINUX */
    {
    int i;
+    MemsetContext *context;
      if (sigbus_memset_context) {
-    for (i = 0; i < sigbus_memset_context->num_threads; i++) {
-    MemsetThread *thread = &sigbus_memset_context->threads[i];
+    QLIST_FOREACH(context, &memset_contexts, next) {
+    for (i = 0; i < context->num_threads; i++) {
+    MemsetThread *thread = &context->threads[i];
    -    if (qemu_thread_is_self(&thread->pgthread)) {
-    siglongjmp(thread->env, 1);
+    if (qemu_thread_is_self(&thread->pgthread)) {
+    siglongjmp(thread->env, 1);
+    }
    }
    }
    }
@@ -417,14 +424,15 @@ static int touch_all_pages(char *area, size_t
hpagesize, size_t numpages,
   bool use_madv_populate_write)
    {
    static gsize initialized = 0;
-    MemsetContext context = {
-    .num_threads = get_memset_num_threads(hpagesize, numpages, max_threads),
-    };
+    MemsetContext *context = g_malloc0(sizeof(MemsetContext));
    size_t numpages_per_thread, leftover;
    void *(*touch_fn)(void *);
-    int ret = 0, i = 0;
+    int i = 0;
    char *addr = area;
    +    context->num_threads =
+    get_memset_num_threads(hpagesize, numpages, max_threads);
+
    if (g_once_init_enter(&initialized)) {
    qemu_mutex_init(&page_mutex);
    qemu_cond_init(&page_cond);
@@ -433,7 +441,7 @@ static int touch_all_pages(char *area, size_t
hpagesize, size_t numpages,
      if (use_madv_populate_write) {
    /* Avoid creating a single thread for 
MADV_POPULATE_WRITE */

-    if (context.num_threads == 1) {
+    if (context->num_threads == 1) {
    if (qemu_madvise(area, hpagesize * numpages,
QEMU_MADV_POPULATE_WRITE)) {
    return -errno;
@@ -445,49 +453,74 @@ static int touch_all_pages(char *area, size_t
hpagesize, size_t numpages,
    touch_fn = do_touch_pages;
    }
    -    context.threads = g_new0(MemsetThread, 
context.num_threads);

-    numpages_per_thread = numpages / context.num_threads;
-    leftover = numpages % context.num_threads;
-    for (i = 0; i < context.num_threads; i++) {
-    context.threads[i].addr = addr;
-    context.threads[i].numpages = numpages_per_thread + (i <
leftover);
-    context.threads[i].hpagesize = hpagesize;
-    context.threads[i].context = &context;
+    context->threads = g_new0(MemsetThread, context->num_threads);
+    numpages_per_thread = numpages / context->num_threads;
+    leftover = numpages % context->num_threads;
+    for (i = 0; i < context->num_threads; i++) {
+    context->threads[i].addr = addr;
+    context->threads[i].numpages = numpages_per_thread + (i <
leftover);
+    context->threads[i].hpagesize = hpagesize;
+    context->threads[i].context = context;
    if (tc) {
-    thread_context_create_thread(tc, &context.threads[i].pgthread,
+    thread_context_create_thread(tc, &context->threads[i].pgthread,
"touch_pages",
- touch_fn, &context.threads[i],
+ touch_fn, &context->threads[i],
QEMU_THREAD_JOINABLE);
    } else {
- qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
-   touch_fn, &context.threads[i],
+ qemu_th

Re: [PATCH v1 1/2] oslib-posix: refactor memory prealloc threads

2024-01-08 Thread Mark Kanda

On 1/8/24 9:40 AM, David Hildenbrand wrote:

On 08.01.24 16:10, Mark Kanda wrote:

Refactor the memory prealloc threads support:
- Make memset context a global qlist
- Move the memset thread join/cleanup code to a separate routine

This is functionally equivalent and facilitates multiple memset contexts
(used in a subsequent patch).

Signed-off-by: Mark Kanda 
---
  util/oslib-posix.c | 104 +
  1 file changed, 68 insertions(+), 36 deletions(-)

diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index e86fd64e09..293297ac6c 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -63,11 +63,15 @@
    struct MemsetThread;
  +static QLIST_HEAD(, MemsetContext) memset_contexts =
+    QLIST_HEAD_INITIALIZER(memset_contexts);
+
  typedef struct MemsetContext {
  bool all_threads_created;
  bool any_thread_failed;
  struct MemsetThread *threads;
  int num_threads;
+    QLIST_ENTRY(MemsetContext) next;
  } MemsetContext;
    struct MemsetThread {
@@ -81,7 +85,7 @@ struct MemsetThread {
  typedef struct MemsetThread MemsetThread;
    /* used by sigbus_handler() */
-static MemsetContext *sigbus_memset_context;
+static bool sigbus_memset_context;
  struct sigaction sigbus_oldact;
  static QemuMutex sigbus_mutex;
  @@ -295,13 +299,16 @@ static void sigbus_handler(int signal)
  #endif /* CONFIG_LINUX */
  {
  int i;
+    MemsetContext *context;
    if (sigbus_memset_context) {
-    for (i = 0; i < sigbus_memset_context->num_threads; i++) {
-    MemsetThread *thread = &sigbus_memset_context->threads[i];
+    QLIST_FOREACH(context, &memset_contexts, next) {
+    for (i = 0; i < context->num_threads; i++) {
+    MemsetThread *thread = &context->threads[i];
  -    if (qemu_thread_is_self(&thread->pgthread)) {
-    siglongjmp(thread->env, 1);
+    if (qemu_thread_is_self(&thread->pgthread)) {
+    siglongjmp(thread->env, 1);
+    }
  }
  }
  }
@@ -417,14 +424,15 @@ static int touch_all_pages(char *area, size_t 
hpagesize, size_t numpages,

 bool use_madv_populate_write)
  {
  static gsize initialized = 0;
-    MemsetContext context = {
-    .num_threads = get_memset_num_threads(hpagesize, numpages, max_threads),
-    };
+    MemsetContext *context = g_malloc0(sizeof(MemsetContext));
  size_t numpages_per_thread, leftover;
  void *(*touch_fn)(void *);
-    int ret = 0, i = 0;
+    int i = 0;
  char *addr = area;
  +    context->num_threads =
+    get_memset_num_threads(hpagesize, numpages, max_threads);
+
  if (g_once_init_enter(&initialized)) {
  qemu_mutex_init(&page_mutex);
  qemu_cond_init(&page_cond);
@@ -433,7 +441,7 @@ static int touch_all_pages(char *area, size_t 
hpagesize, size_t numpages,

    if (use_madv_populate_write) {
  /* Avoid creating a single thread for MADV_POPULATE_WRITE */
-    if (context.num_threads == 1) {
+    if (context->num_threads == 1) {
  if (qemu_madvise(area, hpagesize * numpages,
   QEMU_MADV_POPULATE_WRITE)) {
  return -errno;
@@ -445,49 +453,74 @@ static int touch_all_pages(char *area, size_t 
hpagesize, size_t numpages,

  touch_fn = do_touch_pages;
  }
  -    context.threads = g_new0(MemsetThread, context.num_threads);
-    numpages_per_thread = numpages / context.num_threads;
-    leftover = numpages % context.num_threads;
-    for (i = 0; i < context.num_threads; i++) {
-    context.threads[i].addr = addr;
-    context.threads[i].numpages = numpages_per_thread + (i < 
leftover);

-    context.threads[i].hpagesize = hpagesize;
-    context.threads[i].context = &context;
+    context->threads = g_new0(MemsetThread, context->num_threads);
+    numpages_per_thread = numpages / context->num_threads;
+    leftover = numpages % context->num_threads;
+    for (i = 0; i < context->num_threads; i++) {
+    context->threads[i].addr = addr;
+    context->threads[i].numpages = numpages_per_thread + (i < 
leftover);

+    context->threads[i].hpagesize = hpagesize;
+    context->threads[i].context = context;
  if (tc) {
-    thread_context_create_thread(tc, &context.threads[i].pgthread,
+    thread_context_create_thread(tc, &context->threads[i].pgthread,
   "touch_pages",
- touch_fn, &context.threads[i],
+ touch_fn, &context->threads[i],
QEMU_THREAD_JOINABLE);
  } else {
- qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
-   touch_fn, &context.threads[i],
+ qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
+   touch_fn, &context->threads[i],
 QEMU_THREAD_JOINABLE);
  

[PATCH v1 0/2] Initialize backend memory objects in parallel

2024-01-08 Thread Mark Kanda
QEMU initializes preallocated backend memory when parsing the corresponding
objects from the command line. In certain scenarios, such as memory being
preallocated across multiple numa nodes, this approach is not optimal due to
the unnecessary serialization.

This series addresses this issue by initializing the backend memory objects in
parallel.

Mark Kanda (2):
  oslib-posix: refactor memory prealloc threads
  oslib-posix: initialize backend memory objects in parallel

 include/qemu/osdep.h |   6 ++
 system/vl.c  |   2 +
 util/oslib-posix.c   | 150 +--
 util/oslib-win32.c   |   5 ++
 4 files changed, 116 insertions(+), 47 deletions(-)

-- 
2.39.3




[PATCH v1 1/2] oslib-posix: refactor memory prealloc threads

2024-01-08 Thread Mark Kanda
Refactor the memory prealloc threads support:
- Make memset context a global qlist
- Move the memset thread join/cleanup code to a separate routine

This is functionally equivalent and facilitates multiple memset contexts
(used in a subsequent patch).

Signed-off-by: Mark Kanda 
---
 util/oslib-posix.c | 104 +
 1 file changed, 68 insertions(+), 36 deletions(-)

diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index e86fd64e09..293297ac6c 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -63,11 +63,15 @@
 
 struct MemsetThread;
 
+static QLIST_HEAD(, MemsetContext) memset_contexts =
+QLIST_HEAD_INITIALIZER(memset_contexts);
+
 typedef struct MemsetContext {
 bool all_threads_created;
 bool any_thread_failed;
 struct MemsetThread *threads;
 int num_threads;
+QLIST_ENTRY(MemsetContext) next;
 } MemsetContext;
 
 struct MemsetThread {
@@ -81,7 +85,7 @@ struct MemsetThread {
 typedef struct MemsetThread MemsetThread;
 
 /* used by sigbus_handler() */
-static MemsetContext *sigbus_memset_context;
+static bool sigbus_memset_context;
 struct sigaction sigbus_oldact;
 static QemuMutex sigbus_mutex;
 
@@ -295,13 +299,16 @@ static void sigbus_handler(int signal)
 #endif /* CONFIG_LINUX */
 {
 int i;
+MemsetContext *context;
 
 if (sigbus_memset_context) {
-for (i = 0; i < sigbus_memset_context->num_threads; i++) {
-MemsetThread *thread = &sigbus_memset_context->threads[i];
+QLIST_FOREACH(context, &memset_contexts, next) {
+for (i = 0; i < context->num_threads; i++) {
+MemsetThread *thread = &context->threads[i];
 
-if (qemu_thread_is_self(&thread->pgthread)) {
-siglongjmp(thread->env, 1);
+if (qemu_thread_is_self(&thread->pgthread)) {
+siglongjmp(thread->env, 1);
+}
 }
 }
 }
@@ -417,14 +424,15 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
bool use_madv_populate_write)
 {
 static gsize initialized = 0;
-MemsetContext context = {
-.num_threads = get_memset_num_threads(hpagesize, numpages, max_threads),
-};
+MemsetContext *context = g_malloc0(sizeof(MemsetContext));
 size_t numpages_per_thread, leftover;
 void *(*touch_fn)(void *);
-int ret = 0, i = 0;
+int i = 0;
 char *addr = area;
 
+context->num_threads =
+get_memset_num_threads(hpagesize, numpages, max_threads);
+
 if (g_once_init_enter(&initialized)) {
 qemu_mutex_init(&page_mutex);
 qemu_cond_init(&page_cond);
@@ -433,7 +441,7 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
 
 if (use_madv_populate_write) {
 /* Avoid creating a single thread for MADV_POPULATE_WRITE */
-if (context.num_threads == 1) {
+if (context->num_threads == 1) {
 if (qemu_madvise(area, hpagesize * numpages,
  QEMU_MADV_POPULATE_WRITE)) {
 return -errno;
@@ -445,49 +453,74 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
 touch_fn = do_touch_pages;
 }
 
-context.threads = g_new0(MemsetThread, context.num_threads);
-numpages_per_thread = numpages / context.num_threads;
-leftover = numpages % context.num_threads;
-for (i = 0; i < context.num_threads; i++) {
-context.threads[i].addr = addr;
-context.threads[i].numpages = numpages_per_thread + (i < leftover);
-context.threads[i].hpagesize = hpagesize;
-context.threads[i].context = &context;
+context->threads = g_new0(MemsetThread, context->num_threads);
+numpages_per_thread = numpages / context->num_threads;
+leftover = numpages % context->num_threads;
+for (i = 0; i < context->num_threads; i++) {
+context->threads[i].addr = addr;
+context->threads[i].numpages = numpages_per_thread + (i < leftover);
+context->threads[i].hpagesize = hpagesize;
+context->threads[i].context = context;
 if (tc) {
-thread_context_create_thread(tc, &context.threads[i].pgthread,
+thread_context_create_thread(tc, &context->threads[i].pgthread,
  "touch_pages",
- touch_fn, &context.threads[i],
+ touch_fn, &context->threads[i],
  QEMU_THREAD_JOINABLE);
 } else {
-qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
-   touch_fn, &context.threads[i],
+qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
+   touch_fn, &context->threads[i],
QEMU_THREAD_JOINABLE);
 }
-addr += context.threads[i].numpages * hpagesize;
+addr += context->threads

[PATCH v1 2/2] oslib-posix: initialize backend memory objects in parallel

2024-01-08 Thread Mark Kanda
QEMU initializes preallocated backend memory as the objects are parsed from
the command line. This is not optimal in some cases (e.g. memory spanning
multiple numa nodes) because the memory objects are initialized in series.

Allow the initialization to occur in parallel. The performance increase is
significant and scales with the number of objects. On a 2 socket Skylake VM
with 128GB and 2 init threads per socket (256GB total), the memory init time
decreases from ~27 seconds to ~14 seconds.
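
For context, the kind of configuration that benefits looks roughly like the
following. This command line is illustrative only: thread-context,
node-affinity, prealloc-threads and prealloc-context are existing QEMU
options, but the sizes and topology are made up and not taken from the
measurement above:

qemu-system-x86_64 -m 256G \
  -object thread-context,id=tc0,node-affinity=0 \
  -object thread-context,id=tc1,node-affinity=1 \
  -object memory-backend-ram,id=m0,size=128G,prealloc=on,prealloc-threads=2,prealloc-context=tc0 \
  -object memory-backend-ram,id=m1,size=128G,prealloc=on,prealloc-threads=2,prealloc-context=tc1 \
  -numa node,nodeid=0,memdev=m0 -numa node,nodeid=1,memdev=m1 ...

With each prealloc context pinned to a different host NUMA node, the two
128G backends can be initialized concurrently instead of back to back.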

Signed-off-by: Mark Kanda 
---
 include/qemu/osdep.h |  6 ++
 system/vl.c  |  2 ++
 util/oslib-posix.c   | 46 +---
 util/oslib-win32.c   |  5 +
 4 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index d30ba73eda..57185e6309 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -682,6 +682,12 @@ typedef struct ThreadContext ThreadContext;
 void qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
ThreadContext *tc, Error **errp);
 
+/**
+ * Wait for any outstanding memory prealloc initialization
+ * to complete.
+ */
+void wait_mem_prealloc_init(void);
+
 /**
  * qemu_get_pid_name:
  * @pid: pid of a process
diff --git a/system/vl.c b/system/vl.c
index 6b87bfa32c..9e04acbb2c 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -2010,6 +2010,8 @@ static void qemu_create_late_backends(void)
 
 object_option_foreach_add(object_create_late);
 
+wait_mem_prealloc_init();
+
 if (tpm_init() < 0) {
 exit(1);
 }
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index 293297ac6c..667d2d960c 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -91,6 +91,7 @@ static QemuMutex sigbus_mutex;
 
 static QemuMutex page_mutex;
 static QemuCond page_cond;
+static bool prealloc_init;
 
 int qemu_get_thread_id(void)
 {
@@ -487,6 +488,12 @@ static int wait_mem_prealloc(void)
 {
 int i, ret = 0;
 MemsetContext *context, *next_context;
+
+/* Return if memory prealloc isn't enabled or active */
+if (QLIST_EMPTY(&memset_contexts) || !prealloc_init) {
+return 0;
+}
+
 qemu_mutex_lock(&page_mutex);
 QLIST_FOREACH(context, &memset_contexts, next) {
 context->all_threads_created = true;
@@ -553,21 +560,23 @@ void qemu_prealloc_mem(int fd, char *area, size_t sz, int 
max_threads,
 }
 
 qemu_mutex_lock(&sigbus_mutex);
-memset(&act, 0, sizeof(act));
+if (!sigbus_oldact.sa_handler) {
+memset(&act, 0, sizeof(act));
 #ifdef CONFIG_LINUX
-act.sa_sigaction = &sigbus_handler;
-act.sa_flags = SA_SIGINFO;
+act.sa_sigaction = &sigbus_handler;
+act.sa_flags = SA_SIGINFO;
 #else /* CONFIG_LINUX */
-act.sa_handler = &sigbus_handler;
-act.sa_flags = 0;
+act.sa_handler = &sigbus_handler;
+act.sa_flags = 0;
 #endif /* CONFIG_LINUX */
 
-ret = sigaction(SIGBUS, &act, &sigbus_oldact);
-if (ret) {
-qemu_mutex_unlock(&sigbus_mutex);
-error_setg_errno(errp, errno,
-"qemu_prealloc_mem: failed to install signal handler");
-return;
+ret = sigaction(SIGBUS, &act, &sigbus_oldact);
+if (ret) {
+qemu_mutex_unlock(&sigbus_mutex);
+error_setg_errno(errp, errno,
+"qemu_prealloc_mem: failed to install signal handler");
+return;
+}
 }
 }
 
@@ -589,6 +598,21 @@ void qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
 }
 }
 
+void wait_mem_prealloc_init(void)
+{
+/*
+ * Set prealloc_init true to make wait_mem_prealloc() wait for the
+ * initialization to complete.
+ */
+prealloc_init = true;
+
+/* Wait for any outstanding init to complete */
+if (wait_mem_prealloc()) {
+perror("wait_mem_prealloc_init: failed waiting for memory prealloc");
+exit(1);
+}
+}
+
 char *qemu_get_pid_name(pid_t pid)
 {
 char *name = NULL;
diff --git a/util/oslib-win32.c b/util/oslib-win32.c
index 55b0189dc3..72e050bee1 100644
--- a/util/oslib-win32.c
+++ b/util/oslib-win32.c
@@ -276,6 +276,11 @@ void qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
 }
 }
 
+void wait_mem_prealloc_init(void)
+{
+/* not supported */
+}
+
 char *qemu_get_pid_name(pid_t pid)
 {
 /* XXX Implement me */
-- 
2.39.3
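
A side note on the SIGBUS hunk in the patch above: with several backends now
preallocating concurrently, the handler must be installed once for the whole
process rather than per qemu_prealloc_mem() call, which is what the new
!sigbus_oldact.sa_handler guard achieves. The pattern, reduced to a
standalone sketch (hypothetical code; a plain flag stands in for reusing the
saved action, and the locking done via sigbus_mutex is elided):

#include <signal.h>
#include <stdbool.h>
#include <string.h>

static struct sigaction saved_act;
static bool handler_installed;

static void bus_handler(int sig) { (void)sig; }

/* Install the handler on the first call only; later callers share it
 * instead of racing to install and restore their own. */
static int install_sigbus_once(void)
{
    struct sigaction act;

    if (handler_installed) {
        return 0;
    }
    memset(&act, 0, sizeof(act));
    act.sa_handler = bus_handler;
    handler_installed = true;
    return sigaction(SIGBUS, &act, &saved_act);
}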




Re: [PATCH] Revert "virtio-scsi: Send "REPORTED LUNS CHANGED" sense data upon disk hotplug events"

2023-07-07 Thread Mark Kanda

Hi Stefano,

On 7/5/2023 7:36 AM, Stefano Garzarella wrote:

Hi Mark,

On Wed, Jul 05, 2023 at 07:28:05AM -0500, Mark Kanda wrote:

On 7/5/2023 2:15 AM, Stefano Garzarella wrote:

This reverts commit 8cc5583abe6419e7faaebc9fbd109f34f4c850f2.

That commit causes several problems in Linux as described in the BZ.
In particular, after a while, other devices on the bus are no longer
usable even if those devices are not affected by the hotunplug.
This may be a problem in Linux, but we have not been able to identify
it so far. So better to revert this patch until we find a solution.

Also, Oracle, which initially proposed this patch for a problem with
Solaris, seems to have already reverted it downstream:
    https://linux.oracle.com/errata/ELSA-2023-12065.html

Suggested-by: Thomas Huth 
Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2176702
Cc: qemu-sta...@nongnu.org
Cc: Mark Kanda 
Signed-off-by: Stefano Garzarella 


Reviewed-by: Mark Kanda 



Thanks for the review.

By any chance do you have any information you can share regarding
[Orabug: 34905939] mentioned in the errata?

I'd like to better understand why this patch created problems in Linux,
but solved others in Solaris.


Apologies for the delay. I unfortunately can't provide any useful details. We 
had a brief internal discussion about whether the Solaris or Linux driver was 
technically correct per SCSI spec (I'm not sure we came to a conclusion). In any 
case, we ultimately decided it didn't matter because we cannot tolerate a Linux 
regression, and therefore Solaris should change to behave like Linux.


Thanks/regards,
-Mark




Re: [PATCH v4] virtio-scsi: Send "REPORTED LUNS CHANGED" sense data upon disk hotplug events.

2023-07-05 Thread Mark Kanda

Hi Stefano,

On 7/4/2023 9:14 AM, Stefano Garzarella wrote:

Hi Mark,
we have a bug [1] possibly related to this patch.

I saw this Oracle Linux errata [2] where you reverted this patch, but
there are no details.

Do you think we should revert it upstream as well?
Do you have any details about the problem it causes in Linux?


While the patch fixed an issue with Solaris VMs, we unfortunately discovered 
regressions with hotplug on Linux VMs. The symptoms were similar to what is 
reported in bz2176702. I think reverting it is the right thing to do until this 
can be investigated further.


Thanks/regards,
-Mark


[1] https://bugzilla.redhat.com/show_bug.cgi?id=2176702
[2] https://linux.oracle.com/errata/ELSA-2023-12065.html

Thanks,
Stefano

On Thu, Oct 6, 2022 at 9:53 PM Venu Busireddy  wrote:

Section 5.6.6.3 of VirtIO specification states, "Events will also
be reported via sense codes..." However, no sense data is sent when
VIRTIO_SCSI_EVT_RESET_RESCAN or VIRTIO_SCSI_EVT_RESET_REMOVED events
are reported (when disk hotplug/hotunplug events occur). SCSI layer
on Solaris depends on this sense data, and hence does not handle disk
hotplug/hotunplug events.

When the disk inventory changes, use the bus unit attention mechanism
to return a CHECK_CONDITION status with sense data of 0x06/0x3F/0x0E
(sense code REPORTED_LUNS_CHANGED).
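
For readers without the SPC tables at hand, 0x06/0x3F/0x0E is the sense
key/ASC/ASCQ triple for UNIT ATTENTION, "REPORTED LUNS DATA HAS CHANGED".
A self-contained sketch of the encoding (the struct name here is a stand-in,
not QEMU's actual SCSISense definition):

typedef struct {
    unsigned char key, asc, ascq;
} SCSISenseSketch;

/* UNIT ATTENTION (key 0x06), ASC/ASCQ 0x3F/0x0E */
static const SCSISenseSketch reported_luns_changed = {
    .key = 0x06, .asc = 0x3f, .ascq = 0x0e
};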

Signed-off-by: Venu Busireddy 

v3 -> v4:
 - As suggested by Paolo Bonzini, use the bus unit attention mechanism
   instead of the device unit attention mechanism.

v2 -> v3:
 - Implement the suggestion from Paolo Bonzini .

v1 -> v2:
 - Send the sense data for VIRTIO_SCSI_EVT_RESET_REMOVED event too.
---
  hw/scsi/scsi-bus.c | 18 ++
  hw/scsi/virtio-scsi.c  |  2 ++
  include/hw/scsi/scsi.h |  1 +
  3 files changed, 21 insertions(+)

diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index 4403717c4aaf..ceceafb2cdf3 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -1616,6 +1616,24 @@ static int scsi_ua_precedence(SCSISense sense)
  return (sense.asc << 8) | sense.ascq;
  }

+void scsi_bus_set_ua(SCSIBus *bus, SCSISense sense)
+{
+int prec1, prec2;
+if (sense.key != UNIT_ATTENTION) {
+return;
+}
+
+/*
+ * Override a pre-existing unit attention condition, except for a more
+ * important reset condition.
+ */
+prec1 = scsi_ua_precedence(bus->unit_attention);
+prec2 = scsi_ua_precedence(sense);
+if (prec2 < prec1) {
+bus->unit_attention = sense;
+}
+}
+
  void scsi_device_set_ua(SCSIDevice *sdev, SCSISense sense)
  {
  int prec1, prec2;
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 41f2a5630173..cf2721aa46c0 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -956,6 +956,7 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
  virtio_scsi_push_event(s, sd,
 VIRTIO_SCSI_T_TRANSPORT_RESET,
 VIRTIO_SCSI_EVT_RESET_RESCAN);
+scsi_bus_set_ua(&s->bus, SENSE_CODE(REPORTED_LUNS_CHANGED));
  virtio_scsi_release(s);
  }
  }
@@ -973,6 +974,7 @@ static void virtio_scsi_hotunplug(HotplugHandler *hotplug_dev, DeviceState *dev,
  virtio_scsi_push_event(s, sd,
 VIRTIO_SCSI_T_TRANSPORT_RESET,
 VIRTIO_SCSI_EVT_RESET_REMOVED);
+scsi_bus_set_ua(&s->bus, SENSE_CODE(REPORTED_LUNS_CHANGED));
  virtio_scsi_release(s);
  }

diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index 001103488c23..3b1b3d278ee8 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -186,6 +186,7 @@ SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockBackend *blk,
BlockdevOnError rerror,
BlockdevOnError werror,
const char *serial, Error **errp);
+void scsi_bus_set_ua(SCSIBus *bus, SCSISense sense);
  void scsi_bus_legacy_handle_cmdline(SCSIBus *bus);
  void scsi_legacy_handle_cmdline(void);







Re: [PATCH] Revert "virtio-scsi: Send "REPORTED LUNS CHANGED" sense data upon disk hotplug events"

2023-07-05 Thread Mark Kanda

On 7/5/2023 2:15 AM, Stefano Garzarella wrote:

This reverts commit 8cc5583abe6419e7faaebc9fbd109f34f4c850f2.

That commit causes several problems in Linux as described in the BZ.
In particular, after a while, other devices on the bus are no longer
usable even if those devices are not affected by the hotunplug.
This may be a problem in Linux, but we have not been able to identify
it so far. So better to revert this patch until we find a solution.

Also, Oracle, which initially proposed this patch for a problem with
 Solaris, seems to have already reverted it downstream:
 https://linux.oracle.com/errata/ELSA-2023-12065.html

Suggested-by: Thomas Huth 
Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2176702
Cc: qemu-sta...@nongnu.org
Cc: Mark Kanda 
Signed-off-by: Stefano Garzarella 


Reviewed-by: Mark Kanda 

Thanks/regards,
-Mark


---
  include/hw/scsi/scsi.h |  1 -
  hw/scsi/scsi-bus.c | 18 --
  hw/scsi/virtio-scsi.c  |  2 --
  3 files changed, 21 deletions(-)

diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index e2bb1a2fbf..7c8adf10b1 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -198,7 +198,6 @@ SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockBackend *blk,
BlockdevOnError rerror,
BlockdevOnError werror,
const char *serial, Error **errp);
-void scsi_bus_set_ua(SCSIBus *bus, SCSISense sense);
  void scsi_bus_legacy_handle_cmdline(SCSIBus *bus);
  
  SCSIRequest *scsi_req_alloc(const SCSIReqOps *reqops, SCSIDevice *d,

diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index f80f4cb4fc..42a915f8b7 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -1617,24 +1617,6 @@ static int scsi_ua_precedence(SCSISense sense)
  return (sense.asc << 8) | sense.ascq;
  }
  
-void scsi_bus_set_ua(SCSIBus *bus, SCSISense sense)
-{
-int prec1, prec2;
-if (sense.key != UNIT_ATTENTION) {
-return;
-}
-
-/*
- * Override a pre-existing unit attention condition, except for a more
- * important reset condition.
- */
-prec1 = scsi_ua_precedence(bus->unit_attention);
-prec2 = scsi_ua_precedence(sense);
-if (prec2 < prec1) {
-bus->unit_attention = sense;
-}
-}
-
  void scsi_device_set_ua(SCSIDevice *sdev, SCSISense sense)
  {
  int prec1, prec2;
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 45b95ea070..1f56607100 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -1080,7 +1080,6 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
  
   virtio_scsi_acquire(s);
   virtio_scsi_push_event(s, &info);
-scsi_bus_set_ua(&s->bus, SENSE_CODE(REPORTED_LUNS_CHANGED));
  virtio_scsi_release(s);
  }
  }
@@ -1112,7 +1111,6 @@ static void virtio_scsi_hotunplug(HotplugHandler *hotplug_dev, DeviceState *dev,
  if (virtio_vdev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) {
  virtio_scsi_acquire(s);
  virtio_scsi_push_event(s, &info);
-scsi_bus_set_ua(&s->bus, SENSE_CODE(REPORTED_LUNS_CHANGED));
  virtio_scsi_release(s);
  }
  }





Re: [PATCH 22/27] qapi stats: Elide redundant has_FOO in generated C

2022-09-16 Thread Mark Kanda

On 9/15/2022 3:43 PM, Markus Armbruster wrote:

The has_FOO for pointer-valued FOO are redundant, except for arrays.
They are also a nuisance to work with.  Recent commit "qapi: Start to
elide redundant has_FOO in generated C" provided the means to elide
them step by step.  This is the step for qapi/stats.json.

Said commit explains the transformation in more detail.  The invariant
violations mentioned there do not occur here.

Cc: Mark Kanda 
Cc: Paolo Bonzini 
Signed-off-by: Markus Armbruster 


Reviewed-by:  Mark Kanda 

Thanks/regards,
-Mark


---
  monitor/qmp-cmds.c | 5 +
  scripts/qapi/schema.py | 1 -
  2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/monitor/qmp-cmds.c b/monitor/qmp-cmds.c
index 672fc5d1cc..21a21d6171 100644
--- a/monitor/qmp-cmds.c
+++ b/monitor/qmp-cmds.c
@@ -560,10 +560,7 @@ void add_stats_entry(StatsResultList **stats_results, StatsProvider provider,
  StatsResult *entry = g_new0(StatsResult, 1);
  
  entry->provider = provider;

-if (qom_path) {
-entry->has_qom_path = true;
-entry->qom_path = g_strdup(qom_path);
-}
+entry->qom_path = g_strdup(qom_path);
  entry->stats = stats_list;
  
  QAPI_LIST_PREPEND(*stats_results, entry);

diff --git a/scripts/qapi/schema.py b/scripts/qapi/schema.py
index 3001b715d0..6f35e57fe2 100644
--- a/scripts/qapi/schema.py
+++ b/scripts/qapi/schema.py
@@ -759,7 +759,6 @@ def need_has(self):
  assert self.type
  # Temporary hack to support dropping the has_FOO in reviewable chunks
  opt_out = [
-'qapi/stats.json',
  'qapi/tpm.json',
  'qapi/transaction.json',
  'qapi/ui.json',
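
The qmp-cmds.c hunk above is behavior-preserving because g_strdup() is
NULL-safe, so the has_qom_path guard was redundant; a trivial standalone
check (assuming GLib is available):

#include <glib.h>

int main(void)
{
    /* g_strdup(NULL) returns NULL without allocating, so the plain
     * assignment matches the old guarded code for a NULL qom_path. */
    g_assert(g_strdup(NULL) == NULL);
    return 0;
}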





Re: [PATCH 4/8] hmp: add basic "info stats" implementation

2022-05-24 Thread Mark Kanda

On 5/23/2022 10:07 AM, Paolo Bonzini wrote:

From: Mark Kanda 

Add an HMP command to retrieve statistics collected at run-time.
The command will retrieve and print either all VM-level statistics,
or all vCPU-level statistics for the currently selected CPU.


As I'm credited as the 'poster' (thank you Paolo), I assume this should be 
added:

Signed-off-by: Mark Kanda 

Thanks/regards,
-Mark


Signed-off-by: Paolo Bonzini 
---
  hmp-commands-info.hx  |  13 +++
  include/monitor/hmp.h |   1 +
  monitor/hmp-cmds.c| 187 ++
  3 files changed, 201 insertions(+)

diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index adfa085a9b..221feab8c0 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -894,3 +894,16 @@ SRST
``info via``
  Show guest mos6522 VIA devices.
  ERST
+
+{
+.name   = "stats",
+.args_type  = "target:s",
+.params = "target",
+.help   = "show statistics; target is either vm or vcpu",
+.cmd= hmp_info_stats,
+},
+
+SRST
+  ``stats``
+Show runtime-collected statistics
+ERST
diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
index 96d014826a..2e89a97bd6 100644
--- a/include/monitor/hmp.h
+++ b/include/monitor/hmp.h
@@ -133,5 +133,6 @@ void hmp_info_dirty_rate(Monitor *mon, const QDict *qdict);
  void hmp_calc_dirty_rate(Monitor *mon, const QDict *qdict);
  void hmp_human_readable_text_helper(Monitor *mon,
  HumanReadableText *(*qmp_handler)(Error **));
+void hmp_info_stats(Monitor *mon, const QDict *qdict);
  
  #endif

diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index 622c783c32..c0cb440902 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -40,6 +40,7 @@
  #include "qapi/qapi-commands-pci.h"
  #include "qapi/qapi-commands-rocker.h"
  #include "qapi/qapi-commands-run-state.h"
+#include "qapi/qapi-commands-stats.h"
  #include "qapi/qapi-commands-tpm.h"
  #include "qapi/qapi-commands-ui.h"
  #include "qapi/qapi-visit-net.h"
@@ -52,6 +53,7 @@
  #include "ui/console.h"
  #include "qemu/cutils.h"
  #include "qemu/error-report.h"
+#include "hw/core/cpu.h"
  #include "hw/intc/intc.h"
  #include "migration/snapshot.h"
  #include "migration/misc.h"
@@ -2239,3 +2241,188 @@ void hmp_info_memory_size_summary(Monitor *mon, const QDict *qdict)
  }
  hmp_handle_error(mon, err);
  }
+
+static void print_stats_schema_value(Monitor *mon, StatsSchemaValue *value)
+{
+const char *prefix = "";
+monitor_printf(mon, "%s (%s", value->name, StatsType_str(value->type));
+
+if (value->has_unit && value->unit == STATS_UNIT_SECONDS &&
+(value->exponent == 0 || value->base == 10) &&
+value->exponent >= -9 && value->exponent <= 0 &&
+value->exponent % 3 == 0) {
+
+static const char *si_prefix[] = { "", "milli", "micro", "nano" };
+prefix = si_prefix[value->exponent / -3];
+
+} else if (value->has_unit && value->unit == STATS_UNIT_BYTES &&
+   (value->exponent == 0 || value->base == 2) &&
+   value->exponent >= 0 && value->exponent <= 40 &&
+   value->exponent % 10 == 0) {
+
+static const char *si_prefix[] = {
+"", "kilo", "mega", "giga", "tera" };
+prefix = si_prefix[value->exponent / 10];
+
+} else if (value->exponent) {
+/* Print the base and exponent as "x <base>^<exp>" */
+monitor_printf(mon, ", * %d^%d", value->base,
+   value->exponent);
+}
+
+if (value->has_unit) {
+monitor_printf(mon, " %s%s", prefix, StatsUnit_str(value->unit));
+}
+
+/* Print bucket size for linear histograms */
+if (value->type == STATS_TYPE_LINEAR_HISTOGRAM && value->has_bucket_size) {
+monitor_printf(mon, ", bucket size=%d", value->bucket_size);
+}
+monitor_printf(mon, ")");
+}
+
+static StatsSchemaValueList *find_schema_value_list(
+StatsSchemaList *list, StatsProvider provider,
+StatsTarget target)
+{
+StatsSchemaList *node;
+
+for (node = list; node; node = node->next) {
+if (node->value->provider == provider &&
+node->value->target == target) {
+return node->value->stats;
+}
+}
+return NULL;
+}
+
+static void print_stats_results(Monitor *mon, StatsTarget target,
+StatsResult *result,
+   

Re: [PATCH v4 01/13] cpu: Free cpu->cpu_ases in cpu_address_space_destroy()

2022-03-23 Thread Mark Kanda

On 3/23/2022 1:56 PM, Philippe Mathieu-Daudé wrote:

On 23/3/22 18:17, Philippe Mathieu-Daudé wrote:

From: Mark Kanda 

Create cpu_address_space_destroy() to free a CPU's cpu_ases list.


This seems incorrect...


vCPU hotunplug related leak reported by Valgrind:

==132362== 216 bytes in 1 blocks are definitely lost in loss record 7,119 of 
8,549

==132362==    at 0x4C3ADBB: calloc (vg_replace_malloc.c:1117)
==132362==    by 0x69EE4CD: g_malloc0 (in /usr/lib64/libglib-2.0.so.0.5600.4)
==132362==    by 0x7E34AF: cpu_address_space_init (physmem.c:751)
==132362==    by 0x45053E: qemu_init_vcpu (cpus.c:635)
==132362==    by 0x76B4A7: x86_cpu_realizefn (cpu.c:6520)
==132362==    by 0x9343ED: device_set_realized (qdev.c:531)
==132362==    by 0x93E26F: property_set_bool (object.c:2273)
==132362==    by 0x93C23E: object_property_set (object.c:1408)
==132362==    by 0x9406DC: object_property_set_qobject (qom-qobject.c:28)
==132362==    by 0x93C5A9: object_property_set_bool (object.c:1477)
==132362==    by 0x933C81: qdev_realize (qdev.c:333)
==132362==    by 0x455E9A: qdev_device_add_from_qdict (qdev-monitor.c:713)

Signed-off-by: Mark Kanda 
Tested-by: Philippe Mathieu-Daudé 
Message-Id: <20220321141409.3112932-5-mark.ka...@oracle.com>
Signed-off-by: Philippe Mathieu-Daudé 
---
  cpu.c | 1 +
  include/exec/cpu-common.h | 7 +++
  softmmu/physmem.c | 5 +
  3 files changed, 13 insertions(+)

diff --git a/cpu.c b/cpu.c
index be1f8b074c..59352a1487 100644
--- a/cpu.c
+++ b/cpu.c
@@ -174,6 +174,7 @@ void cpu_exec_unrealizefn(CPUState *cpu)
  tcg_exec_unrealizefn(cpu);
  }
  +    cpu_address_space_destroy(cpu);
  cpu_list_remove(cpu);
  }
  diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 50a7d2912e..b17ad61ae4 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -111,6 +111,13 @@ size_t qemu_ram_pagesize_largest(void);
   */
  void cpu_address_space_init(CPUState *cpu, int asidx,
  const char *prefix, MemoryRegion *mr);


... cpu_address_space_init() creates a single AS, ...


+/**
+ * cpu_address_space_destroy:
+ * @cpu: CPU for this address space
+ *
+ * Cleanup CPU's cpu_ases list.
+ */
+void cpu_address_space_destroy(CPUState *cpu);
    void cpu_physical_memory_rw(hwaddr addr, void *buf,
  hwaddr len, bool is_write);
diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index 43ae70fbe2..aec61ca07a 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -762,6 +762,11 @@ void cpu_address_space_init(CPUState *cpu, int asidx,
  }
  }
  +void cpu_address_space_destroy(CPUState *cpu)
+{
+    g_free(cpu->cpu_ases);


... but here you destroy all the ASes.


I was thinking the whole ASes list should be freed because the CPU is going 
away...

Thanks/regards,
-Mark
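
For context on the init/destroy naming mismatch discussed above:
cpu_address_space_init() allocates the whole cpu_ases array lazily on its
first call, which is why a single free in the destroy path covers every
address space. Roughly (a paraphrase of the allocation side in physmem.c,
details may differ):

/* sketch, not a verbatim excerpt */
void cpu_address_space_init(CPUState *cpu, int asidx,
                            const char *prefix, MemoryRegion *mr)
{
    if (!cpu->cpu_ases) {
        /* one allocation covers all of the CPU's address spaces */
        cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
    }
    /* ... wire up cpu->cpu_ases[asidx] ... */
}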



Re: [PATCH v4 12/13] softmmu/cpus: Free cpu->thread in generic_destroy_vcpu_thread()

2022-03-23 Thread Mark Kanda

Thanks Philippe,

In the patch subject, 'generic_destroy_vcpu_thread()' should be changed to 
'common_vcpu_thread_destroy()'.

Same goes for the next patch (Free cpu->halt_cond).

Thanks/regards,
-Mark

On 3/23/2022 12:17 PM, Philippe Mathieu-Daudé wrote:

From: Mark Kanda 

Free cpu->thread in a new AccelOpsClass::destroy_vcpu_thread() handler
generic_destroy_vcpu_thread().

vCPU hotunplug related leak reported by Valgrind:

   ==102631== 8 bytes in 1 blocks are definitely lost in loss record 1,037 of 
8,555
   ==102631==at 0x4C3ADBB: calloc (vg_replace_malloc.c:1117)
   ==102631==by 0x69EE4CD: g_malloc0 (in /usr/lib64/libglib-2.0.so.0.5600.4)
   ==102631==by 0x92443A: kvm_start_vcpu_thread (kvm-accel-ops.c:68)
   ==102631==by 0x4505C2: qemu_init_vcpu (cpus.c:643)
   ==102631==by 0x76B4D1: x86_cpu_realizefn (cpu.c:6520)
   ==102631==by 0x9344A7: device_set_realized (qdev.c:531)
   ==102631==by 0x93E329: property_set_bool (object.c:2273)
   ==102631==by 0x93C2F8: object_property_set (object.c:1408)
   ==102631==by 0x940796: object_property_set_qobject (qom-qobject.c:28)
   ==102631==by 0x93C663: object_property_set_bool (object.c:1477)
   ==102631==by 0x933D3B: qdev_realize (qdev.c:333)
   ==102631==by 0x455EC4: qdev_device_add_from_qdict (qdev-monitor.c:713)

Signed-off-by: Mark Kanda 
Message-Id: <20220321141409.3112932-3-mark.ka...@oracle.com>
Signed-off-by: Philippe Mathieu-Daudé 
---
  softmmu/cpus.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index 37325b3b8d..efa8397f04 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -619,6 +619,7 @@ static void common_vcpu_thread_create(CPUState *cpu)
  
  static void common_vcpu_thread_destroy(CPUState *cpu)
  {
+g_free(cpu->thread);
  }
  
  void cpu_remove_sync(CPUState *cpu)





Re: [RFC PATCH-for-7.0 v4] target/i386/kvm: Free xsave_buf when destroying vCPU

2022-03-22 Thread Mark Kanda

On 3/22/2022 8:29 AM, Philippe Mathieu-Daudé wrote:

On 22/3/22 13:05, Philippe Mathieu-Daudé wrote:

From: Philippe Mathieu-Daudé 

Fix vCPU hot-unplug related leak reported by Valgrind:

   ==132362== 4,096 bytes in 1 blocks are definitely lost in loss record 
8,440 of 8,549

   ==132362==    at 0x4C3B15F: memalign (vg_replace_malloc.c:1265)
   ==132362==    by 0x4C3B288: posix_memalign (vg_replace_malloc.c:1429)
   ==132362==    by 0xB41195: qemu_try_memalign (memalign.c:53)
   ==132362==    by 0xB41204: qemu_memalign (memalign.c:73)
   ==132362==    by 0x7131CB: kvm_init_xsave (kvm.c:1601)
   ==132362==    by 0x7148ED: kvm_arch_init_vcpu (kvm.c:2031)
   ==132362==    by 0x91D224: kvm_init_vcpu (kvm-all.c:516)
   ==132362==    by 0x9242C9: kvm_vcpu_thread_fn (kvm-accel-ops.c:40)
   ==132362==    by 0xB2EB26: qemu_thread_start (qemu-thread-posix.c:556)
   ==132362==    by 0x7EB2159: start_thread (in /usr/lib64/libpthread-2.28.so)
   ==132362==    by 0x9D45DD2: clone (in /usr/lib64/libc-2.28.so)

Reported-by: Mark Kanda 
Signed-off-by: Philippe Mathieu-Daudé 
---
Based on a series from Mark:
https://lore.kernel.org/qemu-devel/20220321141409.3112932-1-mark.ka...@oracle.com/ 



RFC because currently no time to test


Mark, do you mind testing this patch?
Sanity tested with x86_64 KVM. Valgrind confirms the leak is fixed upon vCPU
hotunplug.


Tested-by: Mark Kanda 

Thanks/regards,
-Mark



Re: [PATCH v3 3/5] softmmu/cpus: Free cpu->halt_cond in generic_destroy_vcpu_thread()

2022-03-22 Thread Mark Kanda

Thanks Philippe,

On 3/21/2022 5:12 PM, Philippe Mathieu-Daudé wrote:

On 21/3/22 15:14, Mark Kanda wrote:

vCPU hotunplug related leak reported by Valgrind:

==102631== 56 bytes in 1 blocks are definitely lost in loss record 5,089 of 
8,555

==102631==    at 0x4C3ADBB: calloc (vg_replace_malloc.c:1117)
==102631==    by 0x69EE4CD: g_malloc0 (in /usr/lib64/libglib-2.0.so.0.5600.4)
==102631==    by 0x924452: kvm_start_vcpu_thread (kvm-accel-ops.c:69)


Here we want to extract a common generic_init_vcpu_thread().



How about I add extracting 'generic_init_vcpu_thread()' as a separate cleanup 
patch at the end? I'll also drop my xsave_buf patch due to your followup.


Thanks/regards,
-Mark
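
For reference, the extraction being discussed would hoist the allocations
that every create_vcpu_thread hook currently repeats into one helper,
roughly like this (hypothetical helper, not in the tree at this point):

/* counterpart to generic_destroy_vcpu_thread(); would be called at the
 * top of kvm_start_vcpu_thread() and friends */
void generic_init_vcpu_thread(CPUState *cpu)
{
    cpu->thread = g_new0(QemuThread, 1);
    cpu->halt_cond = g_new0(QemuCond, 1);
    qemu_cond_init(cpu->halt_cond);
}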


==102631==    by 0x4505C2: qemu_init_vcpu (cpus.c:643)
==102631==    by 0x76B4D1: x86_cpu_realizefn (cpu.c:6520)
==102631==    by 0x9344A7: device_set_realized (qdev.c:531)
==102631==    by 0x93E329: property_set_bool (object.c:2273)
==102631==    by 0x93C2F8: object_property_set (object.c:1408)
==102631==    by 0x940796: object_property_set_qobject (qom-qobject.c:28)
==102631==    by 0x93C663: object_property_set_bool (object.c:1477)
==102631==    by 0x933D3B: qdev_realize (qdev.c:333)
==102631==    by 0x455EC4: qdev_device_add_from_qdict (qdev-monitor.c:713)

Signed-off-by: Mark Kanda 
---
  accel/accel-common.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/accel/accel-common.c b/accel/accel-common.c
index 623df43cc3..297d4e4ef1 100644
--- a/accel/accel-common.c
+++ b/accel/accel-common.c
@@ -140,4 +140,5 @@ type_init(register_accel_types);
  void generic_destroy_vcpu_thread(CPUState *cpu)
  {
  g_free(cpu->thread);
+    g_free(cpu->halt_cond);
  }







Re: [PATCH v4 1/3] qmp: Support for querying stats

2022-03-21 Thread Mark Kanda

On 3/21/2022 9:55 AM, Paolo Bonzini wrote:

On 3/21/22 14:50, Markus Armbruster wrote:

Mark Kanda  writes:

Thank you Markus.
On 3/11/2022 7:06 AM, Markus Armbruster wrote:

Are the stats bulky enough to justify the extra complexity of
filtering?


If this was only for KVM, the complexity probably isn't worth it. However, the
framework is intended to support future stats with new providers and targets
(there has also been mention of moving existing stats to this framework).
Without some sort of filtering, I think the payload could become unmanageable.


I'm deeply wary of "may need $complexity in the future" when $complexity
could be added when we actually need it :)


I think it's better to have the filtering already.  There are several uses for 
it.


Regarding filtering by provider, consider that a command like "info jit" 
should be a wrapper over


{ "execute": "query-stats", "arguments" : { "target": "vm",
  "filters": [ { "provider": "tcg" } ] } }

So we have an example of the intended use already within QEMU. Yes, the 
usefulness depends on actually having >1 provider but I think it's pretty 
central to the idea of having a statistics *subsystem*.


Regarding filtering by name, query-stats mostly has two use cases. The first is 
retrieving all stats and publishing them up to the user, for example once per 
minute per VM.  The second is monitoring a small number and building a 
relatively continuous plot (e.g. 1-10 times per second per vCPU).  For the 
latter, not having to return hundreds of values unnecessarily (KVM has almost 
60 stats, multiply by the number of vCPUs and the frequency) is worth having 
even just with the KVM provider.
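
To put illustrative numbers on that (values assumed for the example):

#include <stdio.h>

int main(void)
{
    int per_vcpu = 60, vcpus = 16, hz = 10;   /* ~60 KVM stats per vCPU */

    printf("%d values/s unfiltered\n", per_vcpu * vcpus * hz); /* 9600 */
    printf("%d values/s filtered\n", 2 * vcpus * hz);          /* 320 */
    return 0;
}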



Can you give a use case for query-stats-schemas?


'query-stats-schemas' provides the type details about each stat, such as the
unit, base, etc. These details are not reported by 'query-stats' (only the stat
name and raw values are returned).


Yes, but what is going to use these type details, and for what purpose?


QEMU does not know in advance which stats are provided.  The types, etc. are 
provided by the kernel and can change by architecture and kernel version.  In 
the case of KVM, introspection is done through a file descriptor.  QEMU passes 
these up as QMP and in the future it could/should extend this to other 
providers (such as TCG) and devices (such as block devices).


See the "info stats" implementation for how it uses the schema:

vcpu (qom path: /machine/unattached/device[2])
  provider: kvm
    exits (cumulative): 52369
    halt_wait_ns (cumulative nanoseconds): 416092704390

Information such as "cumulative nanoseconds" is provided by the schema.
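
As a worked example of what the schema enables (value taken from the sample
output above; the unit/base/exponent fields are as described in this series):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* halt_wait_ns: unit=seconds, base=10, exponent=-9, so the raw
     * counter converts to seconds as value * 10^-9. */
    int64_t raw = 416092704390;

    printf("%.2f s\n", raw * pow(10, -9));  /* ~416.09 s */
    return 0;
}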


Have you considered splitting this up into three parts: unfiltered
query-stats, filtering, and query-stats-schemas?


Splitting could be an idea, but I think only filtering would be a separate 
step.  The stats are not really usable without a schema that tells you the 
units, or whether a number can go down or only up.  (Well, a human expert 
could use them through intuition, but an HMP-level command could not be 
provided).



We could perhaps merge with the current schema, then clean it up on top,
both in 7.1, if that's easier for you.


The serialized JSON would change, so that would be a bit worrisome (but it 
makes me feel a little less bad about this missing 7.0). It seems to be as 
easy as this, as far as alternates go:


diff --git a/scripts/qapi/expr.py b/scripts/qapi/expr.py
index 3cb389e875..48578e1698 100644
--- a/scripts/qapi/expr.py
+++ b/scripts/qapi/expr.py
@@ -554,7 +554,7 @@ def check_alternate(expr: _JSONObject, info: QAPISourceInfo) -> None:

 check_name_lower(key, info, source)
 check_keys(value, info, source, ['type'], ['if'])
 check_if(value, info, source)
-    check_type(value['type'], info, source)
+    check_type(value['type'], info, source, allow_array=True)


 def check_command(expr: _JSONObject, info: QAPISourceInfo) -> None:

diff --git a/scripts/qapi/schema.py b/scripts/qapi/schema.py
index b7b3fc0ce4..3728340c37 100644
--- a/scripts/qapi/schema.py
+++ b/scripts/qapi/schema.py
@@ -243,6 +243,7 @@ def alternate_qtype(self):
 'number':  'QTYPE_QNUM',
 'int': 'QTYPE_QNUM',
 'boolean': 'QTYPE_QBOOL',
+    'array':   'QTYPE_QLIST',
 'object':  'QTYPE_QDICT'
 }
 return json2qtype.get(self.json_type())
@@ -1069,6 +1070,9 @@ def _def_struct_type(self, expr, info, doc):
 None))

 def _make_variant(self, case, typ, ifcond, info):
+    if isinstance(typ, list):
+    assert len(typ) == 1
+    typ = self._make_array_type(typ[0], info)
 return QAPISchemaVariant(case, info, typ, ifcond)

 def _def_union_type(self, expr, info, doc):


I'll try to write some testcases and also cover other uses o

[PATCH v3 3/5] softmmu/cpus: Free cpu->halt_cond in generic_destroy_vcpu_thread()

2022-03-21 Thread Mark Kanda
vCPU hotunplug related leak reported by Valgrind:

==102631== 56 bytes in 1 blocks are definitely lost in loss record 5,089 of 
8,555
==102631==at 0x4C3ADBB: calloc (vg_replace_malloc.c:1117)
==102631==by 0x69EE4CD: g_malloc0 (in /usr/lib64/libglib-2.0.so.0.5600.4)
==102631==by 0x924452: kvm_start_vcpu_thread (kvm-accel-ops.c:69)
==102631==by 0x4505C2: qemu_init_vcpu (cpus.c:643)
==102631==by 0x76B4D1: x86_cpu_realizefn (cpu.c:6520)
==102631==by 0x9344A7: device_set_realized (qdev.c:531)
==102631==by 0x93E329: property_set_bool (object.c:2273)
==102631==by 0x93C2F8: object_property_set (object.c:1408)
==102631==by 0x940796: object_property_set_qobject (qom-qobject.c:28)
==102631==by 0x93C663: object_property_set_bool (object.c:1477)
==102631==by 0x933D3B: qdev_realize (qdev.c:333)
==102631==by 0x455EC4: qdev_device_add_from_qdict (qdev-monitor.c:713)

Signed-off-by: Mark Kanda 
---
 accel/accel-common.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/accel/accel-common.c b/accel/accel-common.c
index 623df43cc3..297d4e4ef1 100644
--- a/accel/accel-common.c
+++ b/accel/accel-common.c
@@ -140,4 +140,5 @@ type_init(register_accel_types);
 void generic_destroy_vcpu_thread(CPUState *cpu)
 {
 g_free(cpu->thread);
+g_free(cpu->halt_cond);
 }
-- 
2.27.0




[PATCH v3 2/5] softmmu/cpus: Free cpu->thread in generic_destroy_vcpu_thread()

2022-03-21 Thread Mark Kanda
Free cpu->thread in a new AccelOpsClass::destroy_vcpu_thread() handler
generic_destroy_vcpu_thread().

vCPU hotunplug related leak reported by Valgrind:

==102631== 8 bytes in 1 blocks are definitely lost in loss record 1,037 of 8,555
==102631==at 0x4C3ADBB: calloc (vg_replace_malloc.c:1117)
==102631==by 0x69EE4CD: g_malloc0 (in /usr/lib64/libglib-2.0.so.0.5600.4)
==102631==by 0x92443A: kvm_start_vcpu_thread (kvm-accel-ops.c:68)
==102631==by 0x4505C2: qemu_init_vcpu (cpus.c:643)
==102631==by 0x76B4D1: x86_cpu_realizefn (cpu.c:6520)
==102631==by 0x9344A7: device_set_realized (qdev.c:531)
==102631==by 0x93E329: property_set_bool (object.c:2273)
==102631==by 0x93C2F8: object_property_set (object.c:1408)
==102631==by 0x940796: object_property_set_qobject (qom-qobject.c:28)
==102631==by 0x93C663: object_property_set_bool (object.c:1477)
==102631==by 0x933D3B: qdev_realize (qdev.c:333)
==102631==by 0x455EC4: qdev_device_add_from_qdict (qdev-monitor.c:713)

Signed-off-by: Mark Kanda 
---
 accel/accel-common.c  | 6 ++
 accel/hvf/hvf-accel-ops.c | 1 +
 accel/kvm/kvm-accel-ops.c | 1 +
 accel/qtest/qtest.c   | 1 +
 accel/tcg/tcg-accel-ops.c | 1 +
 accel/xen/xen-all.c   | 1 +
 include/sysemu/accel-ops.h| 2 ++
 target/i386/hax/hax-accel-ops.c   | 1 +
 target/i386/nvmm/nvmm-accel-ops.c | 1 +
 target/i386/whpx/whpx-accel-ops.c | 1 +
 10 files changed, 16 insertions(+)

diff --git a/accel/accel-common.c b/accel/accel-common.c
index 7b8ec7e0f7..623df43cc3 100644
--- a/accel/accel-common.c
+++ b/accel/accel-common.c
@@ -28,6 +28,7 @@
 
 #include "cpu.h"
 #include "hw/core/accel-cpu.h"
+#include "sysemu/accel-ops.h"
 
 #ifndef CONFIG_USER_ONLY
 #include "accel-softmmu.h"
@@ -135,3 +136,8 @@ static void register_accel_types(void)
 }
 
 type_init(register_accel_types);
+
+void generic_destroy_vcpu_thread(CPUState *cpu)
+{
+g_free(cpu->thread);
+}
diff --git a/accel/hvf/hvf-accel-ops.c b/accel/hvf/hvf-accel-ops.c
index 54457c76c2..b23a67881c 100644
--- a/accel/hvf/hvf-accel-ops.c
+++ b/accel/hvf/hvf-accel-ops.c
@@ -467,6 +467,7 @@ static void hvf_accel_ops_class_init(ObjectClass *oc, void *data)
 AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 
 ops->create_vcpu_thread = hvf_start_vcpu_thread;
+ops->destroy_vcpu_thread = generic_destroy_vcpu_thread;
 ops->kick_vcpu_thread = hvf_kick_vcpu_thread;
 
 ops->synchronize_post_reset = hvf_cpu_synchronize_post_reset;
diff --git a/accel/kvm/kvm-accel-ops.c b/accel/kvm/kvm-accel-ops.c
index c4244a23c6..5a7a9ae79c 100644
--- a/accel/kvm/kvm-accel-ops.c
+++ b/accel/kvm/kvm-accel-ops.c
@@ -89,6 +89,7 @@ static void kvm_accel_ops_class_init(ObjectClass *oc, void *data)
 AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 
 ops->create_vcpu_thread = kvm_start_vcpu_thread;
+ops->destroy_vcpu_thread = generic_destroy_vcpu_thread;
 ops->cpu_thread_is_idle = kvm_vcpu_thread_is_idle;
 ops->cpus_are_resettable = kvm_cpus_are_resettable;
 ops->synchronize_post_reset = kvm_cpu_synchronize_post_reset;
diff --git a/accel/qtest/qtest.c b/accel/qtest/qtest.c
index f6056ac836..ba8573fc2c 100644
--- a/accel/qtest/qtest.c
+++ b/accel/qtest/qtest.c
@@ -51,6 +51,7 @@ static void qtest_accel_ops_class_init(ObjectClass *oc, void *data)
 AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 
 ops->create_vcpu_thread = dummy_start_vcpu_thread;
+ops->destroy_vcpu_thread = generic_destroy_vcpu_thread;
 ops->get_virtual_clock = qtest_get_virtual_clock;
 };
 
diff --git a/accel/tcg/tcg-accel-ops.c b/accel/tcg/tcg-accel-ops.c
index ea7dcad674..527592c4d7 100644
--- a/accel/tcg/tcg-accel-ops.c
+++ b/accel/tcg/tcg-accel-ops.c
@@ -94,6 +94,7 @@ void tcg_handle_interrupt(CPUState *cpu, int mask)
 
 static void tcg_accel_ops_init(AccelOpsClass *ops)
 {
+ops->destroy_vcpu_thread = generic_destroy_vcpu_thread;
 if (qemu_tcg_mttcg_enabled()) {
 ops->create_vcpu_thread = mttcg_start_vcpu_thread;
 ops->kick_vcpu_thread = mttcg_kick_vcpu_thread;
diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c
index 69aa7d018b..0efda554cc 100644
--- a/accel/xen/xen-all.c
+++ b/accel/xen/xen-all.c
@@ -220,6 +220,7 @@ static void xen_accel_ops_class_init(ObjectClass *oc, void *data)
 AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 
 ops->create_vcpu_thread = dummy_start_vcpu_thread;
+ops->destroy_vcpu_thread = generic_destroy_vcpu_thread;
 }
 
 static const TypeInfo xen_accel_ops_type = {
diff --git a/include/sysemu/accel-ops.h b/include/sysemu/accel-ops.h
index e296b27b82..fac7d6b34e 100644
--- a/include/sysemu/accel-ops.h
+++ b/include/sysemu/accel-ops.h
@@ -46,4 +46,6 @@ struct AccelOpsClass {
 int64_t (*get_elapsed_ticks)(void);
 };
 
+/* free vcpu thread structures */
+void generic_destroy_vcpu_thread(CPUState *cpu);
 #endif /* ACCEL_OPS_H */

[PATCH v3 1/5] accel: Introduce AccelOpsClass::destroy_vcpu_thread()

2022-03-21 Thread Mark Kanda
Add destroy_vcpu_thread() to AccelOps as a method for vcpu thread cleanup.
This will be used in subsequent patches.

Suggested-by: Philippe Mathieu-Daudé  
Signed-off-by: Mark Kanda 
Reviewed-by: Philippe Mathieu-Daudé 
---
 include/sysemu/accel-ops.h | 1 +
 softmmu/cpus.c | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/include/sysemu/accel-ops.h b/include/sysemu/accel-ops.h
index 6013c9444c..e296b27b82 100644
--- a/include/sysemu/accel-ops.h
+++ b/include/sysemu/accel-ops.h
@@ -31,6 +31,7 @@ struct AccelOpsClass {
 bool (*cpus_are_resettable)(void);
 
 void (*create_vcpu_thread)(CPUState *cpu); /* MANDATORY NON-NULL */
+void (*destroy_vcpu_thread)(CPUState *cpu);
 void (*kick_vcpu_thread)(CPUState *cpu);
 bool (*cpu_thread_is_idle)(CPUState *cpu);
 
diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index 7b75bb66d5..622f8b4608 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -609,6 +609,9 @@ void cpu_remove_sync(CPUState *cpu)
 qemu_mutex_unlock_iothread();
 qemu_thread_join(cpu->thread);
 qemu_mutex_lock_iothread();
+if (cpus_accel->destroy_vcpu_thread) {
+cpus_accel->destroy_vcpu_thread(cpu);
+}
 }
 
 void cpus_register_accel(const AccelOpsClass *ops)
-- 
2.27.0




[PATCH v3 5/5] i386/cpu: Free env->xsave_buf in KVM and HVF destroy_vcpu_thread routines

2022-03-21 Thread Mark Kanda
Create KVM and HVF specific destroy_vcpu_thread() routines to free
env->xsave_buf.

vCPU hotunplug related leak reported by Valgrind:

==132362== 4,096 bytes in 1 blocks are definitely lost in loss record 8,440 of 
8,549
==132362==at 0x4C3B15F: memalign (vg_replace_malloc.c:1265)
==132362==by 0x4C3B288: posix_memalign (vg_replace_malloc.c:1429)
==132362==by 0xB41195: qemu_try_memalign (memalign.c:53)
==132362==by 0xB41204: qemu_memalign (memalign.c:73)
==132362==by 0x7131CB: kvm_init_xsave (kvm.c:1601)
==132362==by 0x7148ED: kvm_arch_init_vcpu (kvm.c:2031)
==132362==by 0x91D224: kvm_init_vcpu (kvm-all.c:516)
==132362==by 0x9242C9: kvm_vcpu_thread_fn (kvm-accel-ops.c:40)
==132362==by 0xB2EB26: qemu_thread_start (qemu-thread-posix.c:556)
==132362==by 0x7EB2159: start_thread (in /usr/lib64/libpthread-2.28.so)
==132362==by 0x9D45DD2: clone (in /usr/lib64/libc-2.28.so)

Signed-off-by: Mark Kanda 
---
 accel/hvf/hvf-accel-ops.c | 11 ++-
 accel/kvm/kvm-accel-ops.c | 11 ++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/accel/hvf/hvf-accel-ops.c b/accel/hvf/hvf-accel-ops.c
index b23a67881c..bc53890352 100644
--- a/accel/hvf/hvf-accel-ops.c
+++ b/accel/hvf/hvf-accel-ops.c
@@ -462,12 +462,21 @@ static void hvf_start_vcpu_thread(CPUState *cpu)
cpu, QEMU_THREAD_JOINABLE);
 }
 
+static void hvf_destroy_vcpu_thread(CPUState *cpu)
+{
+X86CPU *x86_cpu = X86_CPU(cpu);
+CPUX86State *env = &x86_cpu->env;
+
+g_free(env->xsave_buf);
+generic_destroy_vcpu_thread(cpu);
+}
+
 static void hvf_accel_ops_class_init(ObjectClass *oc, void *data)
 {
 AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 
 ops->create_vcpu_thread = hvf_start_vcpu_thread;
-ops->destroy_vcpu_thread = generic_destroy_vcpu_thread;
+ops->destroy_vcpu_thread = hvf_destroy_vcpu_thread;
 ops->kick_vcpu_thread = hvf_kick_vcpu_thread;
 
 ops->synchronize_post_reset = hvf_cpu_synchronize_post_reset;
diff --git a/accel/kvm/kvm-accel-ops.c b/accel/kvm/kvm-accel-ops.c
index 5a7a9ae79c..0345a30139 100644
--- a/accel/kvm/kvm-accel-ops.c
+++ b/accel/kvm/kvm-accel-ops.c
@@ -74,6 +74,15 @@ static void kvm_start_vcpu_thread(CPUState *cpu)
cpu, QEMU_THREAD_JOINABLE);
 }
 
+static void kvm_destroy_vcpu_thread(CPUState *cpu)
+{
+X86CPU *x86_cpu = X86_CPU(cpu);
+CPUX86State *env = &x86_cpu->env;
+
+g_free(env->xsave_buf);
+generic_destroy_vcpu_thread(cpu);
+}
+
 static bool kvm_vcpu_thread_is_idle(CPUState *cpu)
 {
 return !kvm_halt_in_kernel();
@@ -89,7 +98,7 @@ static void kvm_accel_ops_class_init(ObjectClass *oc, void *data)
 AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 
 ops->create_vcpu_thread = kvm_start_vcpu_thread;
-ops->destroy_vcpu_thread = generic_destroy_vcpu_thread;
+ops->destroy_vcpu_thread = kvm_destroy_vcpu_thread;
 ops->cpu_thread_is_idle = kvm_vcpu_thread_is_idle;
 ops->cpus_are_resettable = kvm_cpus_are_resettable;
 ops->synchronize_post_reset = kvm_cpu_synchronize_post_reset;
-- 
2.27.0




[PATCH v3 0/5] vCPU hotunplug related memory leaks

2022-03-21 Thread Mark Kanda
This series addresses a few vCPU hotunplug related leaks (found with Valgrind).

v3:
- patch 4: create cpu_address_space_destroy() to free cpu_ases (Philippe)
- patch 5: create <accel>_destroy_vcpu_thread() to free xsave_buf (Philippe)

v2: Create AccelOpsClass::destroy_vcpu_thread() for vcpu thread related cleanup
(Philippe)

Mark Kanda (5):
  accel: Introduce AccelOpsClass::destroy_vcpu_thread()
  softmmu/cpus: Free cpu->thread in generic_destroy_vcpu_thread()
  softmmu/cpus: Free cpu->halt_cond in generic_destroy_vcpu_thread()
  cpu: Free cpu->cpu_ases in cpu_address_space_destroy()
  i386/cpu: Free env->xsave_buf in KVM and HVF destroy_vcpu_thread
routines

 accel/accel-common.c  |  7 +++
 accel/hvf/hvf-accel-ops.c | 10 ++
 accel/kvm/kvm-accel-ops.c | 10 ++
 accel/qtest/qtest.c   |  1 +
 accel/tcg/tcg-accel-ops.c |  1 +
 accel/xen/xen-all.c   |  1 +
 cpu.c |  1 +
 include/exec/cpu-common.h |  7 +++
 include/sysemu/accel-ops.h|  3 +++
 softmmu/cpus.c|  3 +++
 softmmu/physmem.c |  5 +
 target/i386/hax/hax-accel-ops.c   |  1 +
 target/i386/nvmm/nvmm-accel-ops.c |  1 +
 target/i386/whpx/whpx-accel-ops.c |  1 +
 14 files changed, 52 insertions(+)

-- 
2.27.0




[PATCH v3 4/5] cpu: Free cpu->cpu_ases in cpu_address_space_destroy()

2022-03-21 Thread Mark Kanda
Create cpu_address_space_destroy() to free a CPU's cpu_ases list.

vCPU hotunplug related leak reported by Valgrind:

==132362== 216 bytes in 1 blocks are definitely lost in loss record 7,119 of 
8,549
==132362==at 0x4C3ADBB: calloc (vg_replace_malloc.c:1117)
==132362==by 0x69EE4CD: g_malloc0 (in /usr/lib64/libglib-2.0.so.0.5600.4)
==132362==by 0x7E34AF: cpu_address_space_init (physmem.c:751)
==132362==by 0x45053E: qemu_init_vcpu (cpus.c:635)
==132362==by 0x76B4A7: x86_cpu_realizefn (cpu.c:6520)
==132362==by 0x9343ED: device_set_realized (qdev.c:531)
==132362==by 0x93E26F: property_set_bool (object.c:2273)
==132362==by 0x93C23E: object_property_set (object.c:1408)
==132362==by 0x9406DC: object_property_set_qobject (qom-qobject.c:28)
==132362==by 0x93C5A9: object_property_set_bool (object.c:1477)
==132362==by 0x933C81: qdev_realize (qdev.c:333)
==132362==by 0x455E9A: qdev_device_add_from_qdict (qdev-monitor.c:713)

Signed-off-by: Mark Kanda 
---
 cpu.c | 1 +
 include/exec/cpu-common.h | 7 +++
 softmmu/physmem.c | 5 +
 3 files changed, 13 insertions(+)

diff --git a/cpu.c b/cpu.c
index be1f8b074c..59352a1487 100644
--- a/cpu.c
+++ b/cpu.c
@@ -174,6 +174,7 @@ void cpu_exec_unrealizefn(CPUState *cpu)
 tcg_exec_unrealizefn(cpu);
 }
 
+cpu_address_space_destroy(cpu);
 cpu_list_remove(cpu);
 }
 
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 50a7d2912e..b17ad61ae4 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -111,6 +111,13 @@ size_t qemu_ram_pagesize_largest(void);
  */
 void cpu_address_space_init(CPUState *cpu, int asidx,
 const char *prefix, MemoryRegion *mr);
+/**
+ * cpu_address_space_destroy:
+ * @cpu: CPU for this address space
+ *
+ * Cleanup CPU's cpu_ases list.
+ */
+void cpu_address_space_destroy(CPUState *cpu);
 
 void cpu_physical_memory_rw(hwaddr addr, void *buf,
 hwaddr len, bool is_write);
diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index 43ae70fbe2..aec61ca07a 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -762,6 +762,11 @@ void cpu_address_space_init(CPUState *cpu, int asidx,
 }
 }
 
+void cpu_address_space_destroy(CPUState *cpu)
+{
+g_free(cpu->cpu_ases);
+}
+
 AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
 {
 /* Return the AddressSpace corresponding to the specified index */
-- 
2.27.0




Re: [PATCH v2 5/5] i386/cpu: Free env->xsave_buf in x86_cpu_unrealizefn()

2022-03-18 Thread Mark Kanda

On 3/18/2022 11:32 AM, Philippe Mathieu-Daudé wrote:

On 18/3/22 16:15, Mark Kanda wrote:

vCPU hotunplug related leak reported by Valgrind:

==132362== 4,096 bytes in 1 blocks are definitely lost in loss record 8,440 
of 8,549

==132362==    at 0x4C3B15F: memalign (vg_replace_malloc.c:1265)
==132362==    by 0x4C3B288: posix_memalign (vg_replace_malloc.c:1429)
==132362==    by 0xB41195: qemu_try_memalign (memalign.c:53)
==132362==    by 0xB41204: qemu_memalign (memalign.c:73)
==132362==    by 0x7131CB: kvm_init_xsave (kvm.c:1601)
==132362==    by 0x7148ED: kvm_arch_init_vcpu (kvm.c:2031)
==132362==    by 0x91D224: kvm_init_vcpu (kvm-all.c:516)
==132362==    by 0x9242C9: kvm_vcpu_thread_fn (kvm-accel-ops.c:40)
==132362==    by 0xB2EB26: qemu_thread_start (qemu-thread-posix.c:556)
==132362==    by 0x7EB2159: start_thread (in /usr/lib64/libpthread-2.28.so)
==132362==    by 0x9D45DD2: clone (in /usr/lib64/libc-2.28.so)

Signed-off-by: Mark Kanda 
---
  target/i386/cpu.c | 5 +
  1 file changed, 5 insertions(+)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index a88d6554c8..014a716c36 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -6572,6 +6572,11 @@ static void x86_cpu_unrealizefn(DeviceState *dev)
  }
    xcc->parent_unrealize(dev);
+
+#if defined(CONFIG_KVM) || defined(CONFIG_HVF)
+    CPUX86State *env = &cpu->env;
+    g_free(env->xsave_buf);


This belongs to hvf_arch_vcpu_destroy().

And for KVM, in the missing kvm_arch_destroy_vcpu().



Will fix in v3.

Thanks Philippe,
-Mark



Re: [PATCH v2 4/5] cpu: Free cpu->cpu_ases in cpu_exec_unrealizefn()

2022-03-18 Thread Mark Kanda

On 3/18/2022 11:26 AM, Philippe Mathieu-Daudé wrote:

On 18/3/22 16:15, Mark Kanda wrote:

vCPU hotunplug related leak reported by Valgrind:

==132362== 216 bytes in 1 blocks are definitely lost in loss record 7,119 of 
8,549

==132362==    at 0x4C3ADBB: calloc (vg_replace_malloc.c:1117)
==132362==    by 0x69EE4CD: g_malloc0 (in /usr/lib64/libglib-2.0.so.0.5600.4)
==132362==    by 0x7E34AF: cpu_address_space_init (physmem.c:751)
==132362==    by 0x45053E: qemu_init_vcpu (cpus.c:635)
==132362==    by 0x76B4A7: x86_cpu_realizefn (cpu.c:6520)
==132362==    by 0x9343ED: device_set_realized (qdev.c:531)
==132362==    by 0x93E26F: property_set_bool (object.c:2273)
==132362==    by 0x93C23E: object_property_set (object.c:1408)
==132362==    by 0x9406DC: object_property_set_qobject (qom-qobject.c:28)
==132362==    by 0x93C5A9: object_property_set_bool (object.c:1477)
==132362==    by 0x933C81: qdev_realize (qdev.c:333)
==132362==    by 0x455E9A: qdev_device_add_from_qdict (qdev-monitor.c:713)

Signed-off-by: Mark Kanda 
---
  cpu.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/cpu.c b/cpu.c
index be1f8b074c..6a3475022f 100644
--- a/cpu.c
+++ b/cpu.c
@@ -173,6 +173,7 @@ void cpu_exec_unrealizefn(CPUState *cpu)
  if (tcg_enabled()) {
  tcg_exec_unrealizefn(cpu);
  }
+    g_free(cpu->cpu_ases);


There is an API mismatch here. We miss cpu_address_space_destroy().

cpu_exec_unrealizefn() then calls cpu_address_space_destroy(),
and cpu_address_space_destroy() frees cpu_ases.

Otherwise other cpu_address_space_init() calls will keep leaking.



Will fix in v3.

Thanks Philippe,
-Mark




[PATCH v2 2/5] softmmu/cpus: Free cpu->thread in destroy_vcpu_thread_generic()

2022-03-18 Thread Mark Kanda
Use a new AccelOpsClass::destroy_vcpu_thread() handler
destroy_vcpu_thread_generic() to free cpu->thread.

vCPU hotunplug related leak reported by Valgrind:

==102631== 8 bytes in 1 blocks are definitely lost in loss record 1,037 of 8,555
==102631==at 0x4C3ADBB: calloc (vg_replace_malloc.c:1117)
==102631==by 0x69EE4CD: g_malloc0 (in /usr/lib64/libglib-2.0.so.0.5600.4)
==102631==by 0x92443A: kvm_start_vcpu_thread (kvm-accel-ops.c:68)
==102631==by 0x4505C2: qemu_init_vcpu (cpus.c:643)
==102631==by 0x76B4D1: x86_cpu_realizefn (cpu.c:6520)
==102631==by 0x9344A7: device_set_realized (qdev.c:531)
==102631==by 0x93E329: property_set_bool (object.c:2273)
==102631==by 0x93C2F8: object_property_set (object.c:1408)
==102631==by 0x940796: object_property_set_qobject (qom-qobject.c:28)
==102631==by 0x93C663: object_property_set_bool (object.c:1477)
==102631==by 0x933D3B: qdev_realize (qdev.c:333)
==102631==by 0x455EC4: qdev_device_add_from_qdict (qdev-monitor.c:713)

Signed-off-by: Mark Kanda 
---
 accel/accel-common.c  | 6 ++
 accel/hvf/hvf-accel-ops.c | 1 +
 accel/kvm/kvm-accel-ops.c | 1 +
 accel/qtest/qtest.c   | 1 +
 accel/tcg/tcg-accel-ops.c | 1 +
 accel/xen/xen-all.c   | 1 +
 include/sysemu/accel-ops.h| 2 ++
 target/i386/hax/hax-accel-ops.c   | 1 +
 target/i386/nvmm/nvmm-accel-ops.c | 1 +
 target/i386/whpx/whpx-accel-ops.c | 1 +
 10 files changed, 16 insertions(+)

diff --git a/accel/accel-common.c b/accel/accel-common.c
index 7b8ec7e0f7..80b0d909b2 100644
--- a/accel/accel-common.c
+++ b/accel/accel-common.c
@@ -28,6 +28,7 @@
 
 #include "cpu.h"
 #include "hw/core/accel-cpu.h"
+#include "sysemu/accel-ops.h"
 
 #ifndef CONFIG_USER_ONLY
 #include "accel-softmmu.h"
@@ -135,3 +136,8 @@ static void register_accel_types(void)
 }
 
 type_init(register_accel_types);
+
+void destroy_vcpu_thread_generic(CPUState *cpu)
+{
+g_free(cpu->thread);
+}
diff --git a/accel/hvf/hvf-accel-ops.c b/accel/hvf/hvf-accel-ops.c
index 54457c76c2..69c23f6763 100644
--- a/accel/hvf/hvf-accel-ops.c
+++ b/accel/hvf/hvf-accel-ops.c
@@ -467,6 +467,7 @@ static void hvf_accel_ops_class_init(ObjectClass *oc, void *data)
 AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 
 ops->create_vcpu_thread = hvf_start_vcpu_thread;
+ops->destroy_vcpu_thread = destroy_vcpu_thread_generic;
 ops->kick_vcpu_thread = hvf_kick_vcpu_thread;
 
 ops->synchronize_post_reset = hvf_cpu_synchronize_post_reset;
diff --git a/accel/kvm/kvm-accel-ops.c b/accel/kvm/kvm-accel-ops.c
index c4244a23c6..fd439f8e23 100644
--- a/accel/kvm/kvm-accel-ops.c
+++ b/accel/kvm/kvm-accel-ops.c
@@ -89,6 +89,7 @@ static void kvm_accel_ops_class_init(ObjectClass *oc, void *data)
 AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 
 ops->create_vcpu_thread = kvm_start_vcpu_thread;
+ops->destroy_vcpu_thread = destroy_vcpu_thread_generic;
 ops->cpu_thread_is_idle = kvm_vcpu_thread_is_idle;
 ops->cpus_are_resettable = kvm_cpus_are_resettable;
 ops->synchronize_post_reset = kvm_cpu_synchronize_post_reset;
diff --git a/accel/qtest/qtest.c b/accel/qtest/qtest.c
index f6056ac836..3ea148ed0e 100644
--- a/accel/qtest/qtest.c
+++ b/accel/qtest/qtest.c
@@ -51,6 +51,7 @@ static void qtest_accel_ops_class_init(ObjectClass *oc, void *data)
 AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 
 ops->create_vcpu_thread = dummy_start_vcpu_thread;
+ops->destroy_vcpu_thread = destroy_vcpu_thread_generic;
 ops->get_virtual_clock = qtest_get_virtual_clock;
 };
 
diff --git a/accel/tcg/tcg-accel-ops.c b/accel/tcg/tcg-accel-ops.c
index ea7dcad674..4ef80c81e4 100644
--- a/accel/tcg/tcg-accel-ops.c
+++ b/accel/tcg/tcg-accel-ops.c
@@ -94,6 +94,7 @@ void tcg_handle_interrupt(CPUState *cpu, int mask)
 
 static void tcg_accel_ops_init(AccelOpsClass *ops)
 {
+ops->destroy_vcpu_thread = destroy_vcpu_thread_generic;
 if (qemu_tcg_mttcg_enabled()) {
 ops->create_vcpu_thread = mttcg_start_vcpu_thread;
 ops->kick_vcpu_thread = mttcg_kick_vcpu_thread;
diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c
index 69aa7d018b..c5982a782c 100644
--- a/accel/xen/xen-all.c
+++ b/accel/xen/xen-all.c
@@ -220,6 +220,7 @@ static void xen_accel_ops_class_init(ObjectClass *oc, void *data)
 AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 
 ops->create_vcpu_thread = dummy_start_vcpu_thread;
+ops->destroy_vcpu_thread = destroy_vcpu_thread_generic;
 }
 
 static const TypeInfo xen_accel_ops_type = {
diff --git a/include/sysemu/accel-ops.h b/include/sysemu/accel-ops.h
index e296b27b82..46e3190119 100644
--- a/include/sysemu/accel-ops.h
+++ b/include/sysemu/accel-ops.h
@@ -46,4 +46,6 @@ struct AccelOpsClass {
 int64_t (*get_elapsed_ticks)(void);
 };
 
+/* free vcpu thread structures */
+void destroy_vcpu_thread_generic(CPUState *cpu);
 #endif /* ACCEL_OPS_H */

[PATCH v2 1/5] accel: Introduce AccelOpsClass::destroy_vcpu_thread()

2022-03-18 Thread Mark Kanda
Add destroy_vcpu_thread() to AccelOps as a method for vcpu thread cleanup.
This will be used in subsequent patches.

Suggested-by: Philippe Mathieu-Daude 
Signed-off-by: Mark Kanda 
---
 include/sysemu/accel-ops.h | 1 +
 softmmu/cpus.c | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/include/sysemu/accel-ops.h b/include/sysemu/accel-ops.h
index 6013c9444c..e296b27b82 100644
--- a/include/sysemu/accel-ops.h
+++ b/include/sysemu/accel-ops.h
@@ -31,6 +31,7 @@ struct AccelOpsClass {
 bool (*cpus_are_resettable)(void);
 
 void (*create_vcpu_thread)(CPUState *cpu); /* MANDATORY NON-NULL */
+void (*destroy_vcpu_thread)(CPUState *cpu);
 void (*kick_vcpu_thread)(CPUState *cpu);
 bool (*cpu_thread_is_idle)(CPUState *cpu);
 
diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index 7b75bb66d5..622f8b4608 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -609,6 +609,9 @@ void cpu_remove_sync(CPUState *cpu)
 qemu_mutex_unlock_iothread();
 qemu_thread_join(cpu->thread);
 qemu_mutex_lock_iothread();
+if (cpus_accel->destroy_vcpu_thread) {
+cpus_accel->destroy_vcpu_thread(cpu);
+}
 }
 
 void cpus_register_accel(const AccelOpsClass *ops)
-- 
2.27.0




[PATCH v2 0/5] vCPU hotunplug related memory leaks

2022-03-18 Thread Mark Kanda
This series addresses a few vCPU hotunplug related leaks (found with Valgrind).

v2: Create AccelOpsClass::destroy_vcpu_thread() for vcpu thread related cleanup
(Philippe)

Mark Kanda (5):
  accel: Introduce AccelOpsClass::destroy_vcpu_thread()
  softmmu/cpus: Free cpu->thread in destroy_vcpu_thread_generic()
  softmmu/cpus: Free cpu->halt_cond in destroy_vcpu_thread_generic()
  cpu: Free cpu->cpu_ases in cpu_exec_unrealizefn()
  i386/cpu: Free env->xsave_buf in x86_cpu_unrealizefn()

 accel/accel-common.c  | 7 +++
 accel/hvf/hvf-accel-ops.c | 1 +
 accel/kvm/kvm-accel-ops.c | 1 +
 accel/qtest/qtest.c   | 1 +
 accel/tcg/tcg-accel-ops.c | 1 +
 accel/xen/xen-all.c   | 1 +
 cpu.c | 1 +
 include/sysemu/accel-ops.h| 3 +++
 softmmu/cpus.c| 3 +++
 target/i386/cpu.c | 5 +
 target/i386/hax/hax-accel-ops.c   | 1 +
 target/i386/nvmm/nvmm-accel-ops.c | 1 +
 target/i386/whpx/whpx-accel-ops.c | 1 +
 13 files changed, 27 insertions(+)

-- 
2.27.0




[PATCH v2 5/5] i386/cpu: Free env->xsave_buf in x86_cpu_unrealizefn()

2022-03-18 Thread Mark Kanda
vCPU hotunplug related leak reported by Valgrind:

==132362== 4,096 bytes in 1 blocks are definitely lost in loss record 8,440 of 
8,549
==132362==at 0x4C3B15F: memalign (vg_replace_malloc.c:1265)
==132362==by 0x4C3B288: posix_memalign (vg_replace_malloc.c:1429)
==132362==by 0xB41195: qemu_try_memalign (memalign.c:53)
==132362==by 0xB41204: qemu_memalign (memalign.c:73)
==132362==by 0x7131CB: kvm_init_xsave (kvm.c:1601)
==132362==by 0x7148ED: kvm_arch_init_vcpu (kvm.c:2031)
==132362==by 0x91D224: kvm_init_vcpu (kvm-all.c:516)
==132362==by 0x9242C9: kvm_vcpu_thread_fn (kvm-accel-ops.c:40)
==132362==by 0xB2EB26: qemu_thread_start (qemu-thread-posix.c:556)
==132362==by 0x7EB2159: start_thread (in /usr/lib64/libpthread-2.28.so)
==132362==by 0x9D45DD2: clone (in /usr/lib64/libc-2.28.so)

Signed-off-by: Mark Kanda 
---
 target/i386/cpu.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index a88d6554c8..014a716c36 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -6572,6 +6572,11 @@ static void x86_cpu_unrealizefn(DeviceState *dev)
 }
 
 xcc->parent_unrealize(dev);
+
+#if defined(CONFIG_KVM) || defined(CONFIG_HVF)
+CPUX86State *env = &cpu->env;
+g_free(env->xsave_buf);
+#endif
 }
 
 typedef struct BitProperty {
-- 
2.27.0




[PATCH v2 4/5] cpu: Free cpu->cpu_ases in cpu_exec_unrealizefn()

2022-03-18 Thread Mark Kanda
vCPU hotunplug related leak reported by Valgrind:

==132362== 216 bytes in 1 blocks are definitely lost in loss record 7,119 of 
8,549
==132362==at 0x4C3ADBB: calloc (vg_replace_malloc.c:1117)
==132362==by 0x69EE4CD: g_malloc0 (in /usr/lib64/libglib-2.0.so.0.5600.4)
==132362==by 0x7E34AF: cpu_address_space_init (physmem.c:751)
==132362==by 0x45053E: qemu_init_vcpu (cpus.c:635)
==132362==by 0x76B4A7: x86_cpu_realizefn (cpu.c:6520)
==132362==by 0x9343ED: device_set_realized (qdev.c:531)
==132362==by 0x93E26F: property_set_bool (object.c:2273)
==132362==by 0x93C23E: object_property_set (object.c:1408)
==132362==by 0x9406DC: object_property_set_qobject (qom-qobject.c:28)
==132362==by 0x93C5A9: object_property_set_bool (object.c:1477)
==132362==by 0x933C81: qdev_realize (qdev.c:333)
==132362==by 0x455E9A: qdev_device_add_from_qdict (qdev-monitor.c:713)

Signed-off-by: Mark Kanda 
---
 cpu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpu.c b/cpu.c
index be1f8b074c..6a3475022f 100644
--- a/cpu.c
+++ b/cpu.c
@@ -173,6 +173,7 @@ void cpu_exec_unrealizefn(CPUState *cpu)
 if (tcg_enabled()) {
 tcg_exec_unrealizefn(cpu);
 }
+g_free(cpu->cpu_ases);
 
 cpu_list_remove(cpu);
 }
-- 
2.27.0




[PATCH v2 3/5] softmmu/cpus: Free cpu->halt_cond in destroy_vcpu_thread_generic()

2022-03-18 Thread Mark Kanda
vCPU hotunplug related leak reported by Valgrind:

==102631== 56 bytes in 1 blocks are definitely lost in loss record 5,089 of 
8,555
==102631==at 0x4C3ADBB: calloc (vg_replace_malloc.c:1117)
==102631==by 0x69EE4CD: g_malloc0 (in /usr/lib64/libglib-2.0.so.0.5600.4)
==102631==by 0x924452: kvm_start_vcpu_thread (kvm-accel-ops.c:69)
==102631==by 0x4505C2: qemu_init_vcpu (cpus.c:643)
==102631==by 0x76B4D1: x86_cpu_realizefn (cpu.c:6520)
==102631==by 0x9344A7: device_set_realized (qdev.c:531)
==102631==by 0x93E329: property_set_bool (object.c:2273)
==102631==by 0x93C2F8: object_property_set (object.c:1408)
==102631==by 0x940796: object_property_set_qobject (qom-qobject.c:28)
==102631==by 0x93C663: object_property_set_bool (object.c:1477)
==102631==by 0x933D3B: qdev_realize (qdev.c:333)
==102631==by 0x455EC4: qdev_device_add_from_qdict (qdev-monitor.c:713)

Signed-off-by: Mark Kanda 
---
 accel/accel-common.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/accel/accel-common.c b/accel/accel-common.c
index 80b0d909b2..ae71a27799 100644
--- a/accel/accel-common.c
+++ b/accel/accel-common.c
@@ -140,4 +140,5 @@ type_init(register_accel_types);
 void destroy_vcpu_thread_generic(CPUState *cpu)
 {
 g_free(cpu->thread);
+g_free(cpu->halt_cond);
 }
-- 
2.27.0




Re: [PATCH v4 1/3] qmp: Support for querying stats

2022-03-14 Thread Mark Kanda

Thank you Markus.

On 3/11/2022 7:06 AM, Markus Armbruster wrote:

Mark Kanda  writes:


Introduce QMP support for querying stats. Provide a framework for adding new
stats and support for the following commands:

- query-stats
Returns a list of all stats per target type (only VM and vCPU to start), with
additional options for specifying stat names, vCPU qom paths, and providers.

- query-stats-schemas
Returns a list of stats included in each target type, with an option for
specifying the provider.

The framework provides a method to register callbacks for these QMP commands.

The first use-case will be for fd-based KVM stats (in an upcoming patch).

Examples (with fd-based KVM stats):

- Query all VM stats:

{ "execute": "query-stats", "arguments" : { "target": "vm" } }

{ "return": {
   "vm": [
  { "provider": "kvm",
"stats": [
   { "name": "max_mmu_page_hash_collisions", "value": 0 },
   { "name": "max_mmu_rmap_size", "value": 0 },
   { "name": "nx_lpage_splits", "value": 148 },
   ...
  { "provider": "xyz",
"stats": [ ...
  ...
] } }

- Query all vCPU stats:

{ "execute": "query-stats", "arguments" : { "target": "vcpu" } }

{ "return": {
 "vcpus": [
   { "path": "/machine/unattached/device[0]"
 "providers": [
   { "provider": "kvm",
 "stats": [
   { "name": "guest_mode", "value": 0 },
   { "name": "directed_yield_successful", "value": 0 },
   { "name": "directed_yield_attempted", "value": 106 },
   ...
   { "provider": "xyz",
 "stats": [ ...
...
   { "path": "/machine/unattached/device[1]"
 "providers": [
   { "provider": "kvm",
 "stats": [...
   ...
} ] } }

- Query 'exits' and 'l1d_flush' KVM stats, and 'somestat' from provider 'xyz'
for vCPUs '/machine/unattached/device[2]' and '/machine/unattached/device[4]':

{ "execute": "query-stats",
   "arguments": {
 "target": "vcpu",
 "vcpus": [ "/machine/unattached/device[2]",
"/machine/unattached/device[4]" ],
 "filters": [
   { "provider": "kvm",
 "fields": [ "l1d_flush", "exits" ] },
   { "provider": "xyz",
 "fields": [ "somestat" ] } ] } }

Are the stats bulky enough to justify the extra complexity of
filtering?


If this was only for KVM, the complexity probably isn't worth it. However, the 
framework is intended to support future stats with new providers and targets 
(there has also been mention of moving existing stats to this framework). 
Without some sort of filtering, I think the payload could become unmanageable.





{ "return": {
 "vcpus": [
   { "path": "/machine/unattached/device[2]"
 "providers": [
   { "provider": "kvm",
 "stats": [ { "name": "l1d_flush", "value": 41213 },
{ "name": "exits", "value": 74291 } ] },
   { "provider": "xyz",
 "stats": [ ... ] } ] },
   { "path": "/machine/unattached/device[4]"
 "providers": [
   { "provider": "kvm",
 "stats": [ { "name": "l1d_flush", "value": 16132 },
{ "name": "exits", "value": 57922 } ] },
   { "provider": "xyz",
 "stats": [ ... ] } ] } ] } }

- Query stats schemas:

{ "execute": "query-stats-schemas" }

{ "return": {
 "vcpu": [
   { "provider": "kvm",
 "stats": [
{ "name": "guest_mode",
  "unit": "none",
  "base": 10,
  "exponent": 0,
  "type": "instant" },
   { "name": "directed_yield_successful",
  "unit": "none",
  "base": 10,
  "exponen

Re: [PATCH 0/4] vCPU hotunplug related memory leaks

2022-02-22 Thread Mark Kanda

Gentle ping - any thoughts on this series?

Thanks/regards,
-Mark

On 1/26/2022 8:29 AM, Mark Kanda wrote:

This series addresses a few vCPU hotunplug related leaks (found with Valgrind).

Mark Kanda (4):
   softmmu/cpus: Free cpu->thread in cpu_remove_sync()
   softmmu/cpus: Free cpu->halt_cond in cpu_remove_sync()
   cpu: Free cpu->cpu_ases in cpu_exec_unrealizefn()
   i386/cpu: Free env->xsave_buf in x86_cpu_unrealizefn()

  cpu.c | 1 +
  softmmu/cpus.c| 2 ++
  target/i386/cpu.c | 2 ++
  3 files changed, 5 insertions(+)






[PATCH v4 0/3] Support fd-based KVM stats

2022-02-15 Thread Mark Kanda
This patchset adds QEMU support for querying fd-based KVM stats. The
kernel support was introduced by:

cb082bfab59a ("KVM: stats: Add fd-based API to read binary stats data")

v4:
- revise and cleanup the API [Daniel, Paolo]
- filtering for multiple providers [Daniel]
- cache KVM stat descriptors [Paolo]
- use g_autofree and other cleanup [Daniel]

v3:
- various QMP API enhancements from review [Daniel, Paolo, Igor]
- general cleanup

v2: [Paolo]
- generalize the interface
- add support for querying stat schema and instances
- add additional HMP semantic processing for a few exponent/unit
  combinations (related to seconds and bytes)

Mark Kanda (3):
  qmp: Support for querying stats
  hmp: Support for querying stats
  kvm: Support for querying fd-based stats

 accel/kvm/kvm-all.c | 393 
 hmp-commands-info.hx|  28 +++
 include/monitor/hmp.h   |   2 +
 include/monitor/stats.h |  51 ++
 monitor/hmp-cmds.c  | 328 +
 monitor/qmp-cmds.c  | 219 ++
 qapi/meson.build|   1 +
 qapi/qapi-schema.json   |   1 +
 qapi/stats.json | 259 ++
 9 files changed, 1282 insertions(+)
 create mode 100644 include/monitor/stats.h
 create mode 100644 qapi/stats.json

-- 
2.27.0




[PATCH v4 1/3] qmp: Support for querying stats

2022-02-15 Thread Mark Kanda
Introduce QMP support for querying stats. Provide a framework for adding new
stats and support for the following commands:

- query-stats
Returns a list of all stats per target type (only VM and vCPU to start), with
additional options for specifying stat names, vCPU qom paths, and providers.

- query-stats-schemas
Returns a list of stats included in each target type, with an option for
specifying the provider.

The framework provides a method to register callbacks for these QMP commands.

The first use-case will be for fd-based KVM stats (in an upcoming patch).
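
For orientation, a minimal sketch of how a provider plugs into this
framework (an illustration, not part of the patch; the callback bodies and
the STATS_PROVIDER_XYZ value are placeholders, modeled on the kvm-all.c
hunk in patch 3/3):

static void query_stats_cb(StatsResults *results, StatsFilter *filter,
                           Error **errp)
{
    /* append this provider's stats that match 'filter' to 'results' */
}

static void query_stats_schemas_cb(StatsSchemaResults *results, Error **errp)
{
    /* describe each stat: name, type, unit, base, exponent */
}

/* at provider initialization: */
add_stats_callbacks(STATS_PROVIDER_XYZ, &query_stats_cb,
                    &query_stats_schemas_cb);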

Examples (with fd-based KVM stats):

- Query all VM stats:

{ "execute": "query-stats", "arguments" : { "target": "vm" } }

{ "return": {
  "vm": [
 { "provider": "kvm",
   "stats": [
  { "name": "max_mmu_page_hash_collisions", "value": 0 },
  { "name": "max_mmu_rmap_size", "value": 0 },
  { "name": "nx_lpage_splits", "value": 148 },
  ...
 { "provider": "xyz",
   "stats": [ ...
 ...
] } }

- Query all vCPU stats:

{ "execute": "query-stats", "arguments" : { "target": "vcpu" } }

{ "return": {
"vcpus": [
  { "path": "/machine/unattached/device[0]"
"providers": [
  { "provider": "kvm",
"stats": [
  { "name": "guest_mode", "value": 0 },
  { "name": "directed_yield_successful", "value": 0 },
  { "name": "directed_yield_attempted", "value": 106 },
  ...
  { "provider": "xyz",
"stats": [ ...
   ...
  { "path": "/machine/unattached/device[1]"
"providers": [
  { "provider": "kvm",
"stats": [...
  ...
} ] } }

- Query 'exits' and 'l1d_flush' KVM stats, and 'somestat' from provider 'xyz'
for vCPUs '/machine/unattached/device[2]' and '/machine/unattached/device[4]':

{ "execute": "query-stats",
  "arguments": {
"target": "vcpu",
"vcpus": [ "/machine/unattached/device[2]",
   "/machine/unattached/device[4]" ],
"filters": [
  { "provider": "kvm",
"fields": [ "l1d_flush", "exits" ] },
  { "provider": "xyz",
"fields": [ "somestat" ] } ] } }

{ "return": {
"vcpus": [
  { "path": "/machine/unattached/device[2]"
"providers": [
  { "provider": "kvm",
"stats": [ { "name": "l1d_flush", "value": 41213 },
   { "name": "exits", "value": 74291 } ] },
  { "provider": "xyz",
"stats": [ ... ] } ] },
  { "path": "/machine/unattached/device[4]"
"providers": [
  { "provider": "kvm",
"stats": [ { "name": "l1d_flush", "value": 16132 },
   { "name": "exits", "value": 57922 } ] },
  { "provider": "xyz",
"stats": [ ... ] } ] } ] } }

- Query stats schemas:

{ "execute": "query-stats-schemas" }

{ "return": {
"vcpu": [
  { "provider": "kvm",
"stats": [
   { "name": "guest_mode",
 "unit": "none",
 "base": 10,
 "exponent": 0,
 "type": "instant" },
  { "name": "directed_yield_successful",
 "unit": "none",
 "base": 10,
 "exponent": 0,
 "type": "cumulative" },
 ...
  { "provider": "xyz",
...
   "vm": [
  { "provider": "kvm",
"stats": [
   { "name": "max_mmu_page_hash_collisions",
 "unit": "none",
 "base": 10,
 "exponent": 0,
 "type": "peak" },
  { "provider": "xyz",
  ...

Sign

[PATCH v4 2/3] hmp: Support for querying stats

2022-02-15 Thread Mark Kanda
Leverage the QMP support for querying stats. The interface supports similar
arguments as the QMP interface. Wildcard char (*) is accepted for names and
stats target.

Examples (with fd-based KVM stats):

- Display all stats
(qemu) info stats
vm
  provider: kvm
max_mmu_page_hash_collisions (peak): 0
max_mmu_rmap_size (peak): 0
...
vcpu (qom path: /machine/unattached/device[0])
  provider: kvm
guest_mode (instant): 0
directed_yield_successful (cumulative): 0
...

(qemu) info stats-schemas
vm
  provider: kvm
max_mmu_page_hash_collisions (peak)
max_mmu_rmap_size (peak)
...
vcpu
  provider: kvm
guest_mode (instant)
directed_yield_successful (cumulative)

- Display 'halt_wait_ns' and 'exits' for vCPUs with qom paths
/machine/unattached/device[2] and /machine/unattached/device[4]:

(qemu) info stats exits,halt_wait_ns /machine/unattached/device[2],
/machine/unattached/device[4]

vcpu (qom path: /machine/unattached/device[2])
  provider: kvm
exits (cumulative): 52369
halt_wait_ns (cumulative nanoseconds): 416092704390
vcpu (qom path: /machine/unattached/device[4])
  provider: kvm
exits (cumulative): 52550
halt_wait_ns (cumulative nanoseconds): 419637402657

- Display all VM stats for provider KVM:

(qemu) info stats * vm kvm
vm
  provider: kvm
max_mmu_page_hash_collisions (peak): 0
max_mmu_rmap_size (peak): 0
nx_lpage_splits (instant): 51
...

Signed-off-by: Mark Kanda 
---
 hmp-commands-info.hx  |  28 
 include/monitor/hmp.h |   2 +
 monitor/hmp-cmds.c| 328 ++
 3 files changed, 358 insertions(+)

diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index e90f20a107..c0ad3a3e2a 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -879,3 +879,31 @@ SRST
   ``info sgx``
 Show intel SGX information.
 ERST
+
+{
+.name   = "stats",
+.args_type  = "names:s?,vcpus:s?,provider:s?",
+.params = "[names] [vcpus] [provider]",
+.help   = "show statistics; optional comma separated names, "
+ "vcpu qom paths, and provider",
+.cmd= hmp_info_stats,
+},
+
+SRST
+  ``stats``
+Show stats
+ERST
+
+{
+.name   = "stats-schemas",
+.args_type  = "provider:s?",
+.params = "[provider]",
+.help   = "show statistics schema for each provider",
+.cmd= hmp_info_stats_schemas,
+},
+
+SRST
+  ``stats-schemas``
+Show stats per schema
+ERST
+
diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
index 96d014826a..a748511105 100644
--- a/include/monitor/hmp.h
+++ b/include/monitor/hmp.h
@@ -133,5 +133,7 @@ void hmp_info_dirty_rate(Monitor *mon, const QDict *qdict);
 void hmp_calc_dirty_rate(Monitor *mon, const QDict *qdict);
 void hmp_human_readable_text_helper(Monitor *mon,
 HumanReadableText *(*qmp_handler)(Error **));
+void hmp_info_stats(Monitor *mon, const QDict *qdict);
+void hmp_info_stats_schemas(Monitor *mon, const QDict *qdict);
 
 #endif
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index 8c384dc1b2..2a8f1432fb 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -40,6 +40,7 @@
 #include "qapi/qapi-commands-pci.h"
 #include "qapi/qapi-commands-rocker.h"
 #include "qapi/qapi-commands-run-state.h"
+#include "qapi/qapi-commands-stats.h"
 #include "qapi/qapi-commands-tpm.h"
 #include "qapi/qapi-commands-ui.h"
 #include "qapi/qapi-visit-net.h"
@@ -2178,3 +2179,330 @@ void hmp_info_memory_size_summary(Monitor *mon, const QDict *qdict)
 }
 hmp_handle_error(mon, err);
 }
+
+static void print_stats_schema_value(Monitor *mon, StatsSchemaValue *value)
+{
+monitor_printf(mon, "%s (%s", value->name, StatsType_str(value->type));
+
+if (value->unit == STATS_UNIT_SECONDS &&
+value->exponent >= -9 && value->exponent <= 0 &&
+value->exponent % 3 == 0 && value->base == STATS_BASE_POW10) {
+
+const char *si_prefix[] = { "", "milli", "micro", "nano" };
+monitor_printf(mon, " %s", si_prefix[value->exponent / -3]);
+
+} else if (value->unit == STATS_UNIT_BYTES &&
+value->exponent >= 0 && value->exponent <= 40 &&
+value->exponent % 10 == 0 && value->base == STATS_BASE_POW2) {
+
+const char *si_prefix[] = {
+"", "kilo", "mega", "giga", "tera" };
+monitor_printf(mon, " %s", si_prefix[value->exponent / 10]);
+
+} else if (value->exponent) {
+/* Print the base and exponent as "x ^" */
+monitor_printf(mo
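
A worked example of the prefix mapping in the hunk above (illustration
only; the values are taken from the halt_wait_ns output shown earlier in
this message):

/* halt_wait_ns: unit == seconds, base == 10, exponent == -9
 *   exponent / -3 == 3  ->  si_prefix[3] == "nano"
 * so HMP prints "halt_wait_ns (cumulative nanoseconds)".
 * A byte-valued stat with base == 2, exponent == 10 takes the second
 * branch: exponent / 10 == 1  ->  "kilo", i.e. "(... kilobytes)".
 */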

[PATCH v4 3/3] kvm: Support for querying fd-based stats

2022-02-15 Thread Mark Kanda
Add support for querying fd-based KVM stats - as introduced by Linux kernel
commit:

cb082bfab59a ("KVM: stats: Add fd-based API to read binary stats data")
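
As background, a rough sketch of how a consumer walks the binary stats fd
returned by KVM_GET_STATS_FD (an untested illustration based on the layout
documented with the kernel commit above, not part of this patch; error
handling omitted):

#include <linux/kvm.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>

/* stats_fd comes from ioctl(vm_or_vcpu_fd, KVM_GET_STATS_FD, NULL) */
static void read_binary_stats(int stats_fd)
{
    struct kvm_stats_header header;
    pread(stats_fd, &header, sizeof(header), 0);

    /* each descriptor is followed by a name of header.name_size bytes */
    size_t desc_sz = sizeof(struct kvm_stats_desc) + header.name_size;
    struct kvm_stats_desc *descs = malloc(desc_sz * header.num_desc);
    pread(stats_fd, descs, desc_sz * header.num_desc, header.desc_offset);

    for (uint32_t i = 0; i < header.num_desc; i++) {
        struct kvm_stats_desc *pdesc = (void *)descs + i * desc_sz;
        uint64_t val;   /* pdesc->size is the number of u64 values */
        pread(stats_fd, &val, sizeof(val),
              header.data_offset + pdesc->offset);
        /* decode pdesc->flags (type/unit/base) and pdesc->exponent,
         * as the add_kvmstat_entry()/add_kvmschema_entry() hunks below do */
    }
    free(descs);
}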

Signed-off-by: Mark Kanda 
---
 accel/kvm/kvm-all.c | 393 
 qapi/stats.json |   2 +-
 2 files changed, 394 insertions(+), 1 deletion(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 0e66ebb497..7efd9b86c8 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -47,6 +47,7 @@
 #include "kvm-cpus.h"
 
 #include "hw/boards.h"
+#include "monitor/stats.h"
 
 /* This check must be after config-host.h is included */
 #ifdef CONFIG_EVENTFD
@@ -2309,6 +2310,9 @@ bool kvm_dirty_ring_enabled(void)
 return kvm_state->kvm_dirty_ring_size ? true : false;
 }
 
+static void query_stats_cb(StatsResults *, StatsFilter *, Error **);
+static void query_stats_schemas_cb(StatsSchemaResults *, Error **);
+
 static int kvm_init(MachineState *ms)
 {
 MachineClass *mc = MACHINE_GET_CLASS(ms);
@@ -2637,6 +2641,11 @@ static int kvm_init(MachineState *ms)
 }
 }
 
+if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) {
+add_stats_callbacks(STATS_PROVIDER_KVM, &query_stats_cb,
+&query_stats_schemas_cb);
+}
+
 return 0;
 
 err:
@@ -3696,3 +3705,387 @@ static void kvm_type_init(void)
 }
 
 type_init(kvm_type_init);
+
+typedef struct StatsArgs {
+StatsFilter *filter;
+bool is_schema;
+union StatsResultsType {
+StatsResults *stats;
+StatsSchemaResults *schema;
+} result;
+Error **errp;
+} StatsArgs;
+
+static StatsList *add_kvmstat_entry(struct kvm_stats_desc *pdesc,
+uint64_t *stats_data,
+StatsList *stats_list,
+Error **errp)
+{
+
+StatsList *stats_entry;
+Stats *stats;
+uint64List *val_list = NULL;
+
+switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
+case KVM_STATS_TYPE_CUMULATIVE:
+case KVM_STATS_TYPE_INSTANT:
+case KVM_STATS_TYPE_PEAK:
+case KVM_STATS_TYPE_LINEAR_HIST:
+case KVM_STATS_TYPE_LOG_HIST:
+break;
+default:
+return stats_list;
+}
+
+switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
+case KVM_STATS_UNIT_NONE:
+case KVM_STATS_UNIT_BYTES:
+case KVM_STATS_UNIT_CYCLES:
+case KVM_STATS_UNIT_SECONDS:
+break;
+default:
+return stats_list;
+}
+
+switch (pdesc->flags & KVM_STATS_BASE_MASK) {
+case KVM_STATS_BASE_POW10:
+case KVM_STATS_BASE_POW2:
+break;
+default:
+return stats_list;
+}
+
+/* Alloc and populate data list */
+stats_entry = g_malloc0(sizeof(*stats_entry));
+stats = g_malloc0(sizeof(*stats));
+stats->name = g_strdup(pdesc->name);
+stats->value = g_malloc0(sizeof(*stats->value));
+
+if (pdesc->size == 1) {
+stats->value->u.scalar = *stats_data;
+stats->value->type = QTYPE_QNUM;
+} else {
+int i;
+for (i = 0; i < pdesc->size; i++) {
+uint64List *val_entry = g_malloc0(sizeof(*val_entry));
+val_entry->value = stats_data[i];
+val_entry->next = val_list;
+val_list = val_entry;
+}
+stats->value->u.list.values = val_list;
+stats->value->type = QTYPE_QDICT;
+}
+
+stats_entry->value = stats;
+stats_entry->next = stats_list;
+
+return stats_entry;
+}
+
+static StatsSchemaValueList *add_kvmschema_entry(struct kvm_stats_desc *pdesc,
+ StatsSchemaValueList *list,
+ Error **errp)
+{
+StatsSchemaValueList *schema_entry = g_malloc0(sizeof(*schema_entry));
+schema_entry->value = g_malloc0(sizeof(*schema_entry->value));
+
+switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
+case KVM_STATS_TYPE_CUMULATIVE:
+schema_entry->value->type = STATS_TYPE_CUMULATIVE;
+break;
+case KVM_STATS_TYPE_INSTANT:
+schema_entry->value->type = STATS_TYPE_INSTANT;
+break;
+case KVM_STATS_TYPE_PEAK:
+schema_entry->value->type = STATS_TYPE_PEAK;
+break;
+case KVM_STATS_TYPE_LINEAR_HIST:
+schema_entry->value->type = STATS_TYPE_LINEAR_HIST;
+schema_entry->value->bucket_size = pdesc->bucket_size;
+schema_entry->value->has_bucket_size = true;
+break;
+case KVM_STATS_TYPE_LOG_HIST:
+schema_entry->value->type = STATS_TYPE_LOG_HIST;
+break;
+default:
+goto exit;
+}
+
+switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
+case KVM_STATS_UNIT_NONE:
+schema_entry->value->unit = STATS_UNIT_NONE;
+break;
+case KVM_STATS_UNIT_

Re: [PATCH v3 1/3] qmp: Support for querying stats

2022-02-03 Thread Mark Kanda

On 2/1/2022 6:08 AM, Daniel P. Berrangé wrote:

+##
+# @StatsResults:
+#
+# Target specific results.
+#
+# Since: 7.0
+##
+{ 'union': 'StatsResults',
+  'base': { 'target': 'StatsTarget' },
+  'discriminator': 'target',
+  'data': { 'vcpu': 'VCPUStatsResults',
+'vm': 'VMStatsResults' } }

I feel we can simplify this all down somewhat, eliminating levels
of redundant nesting

{ 'struct': 'StatsResultsEntry',
   'data': { '*kvm': [ 'Stats' ] } }

{ 'struct': 'StatsResultsVCPUEntry',
   'base': 'StatsResultsEntry',
   'data': { 'path': 'str' } }

{ 'struct': 'StatsResults',
   'data': {
   '*vcpus': ['StatsResultsVCPUEntry'],
   '*vm': 'StatsResultsEntry'
   }
}




I'm happy to make this change, but I would like Paolo to comment as he had 
suggested the StatsResults layout [1].


Thanks Daniel/Paolo,
-Mark

[1] https://lore.kernel.org/all/ee0d6990-06f3-9a1b-f7d5-7c379f0e9...@redhat.com/

Re: [PATCH v3 1/3] qmp: Support for querying stats

2022-02-03 Thread Mark Kanda




On 2/3/2022 12:30 PM, Daniel P. Berrangé wrote:

On Thu, Feb 03, 2022 at 12:12:57PM -0600, Mark Kanda wrote:

Thanks Daniel,

On 2/1/2022 6:08 AM, Daniel P. Berrangé wrote:

+#
+# Since: 7.0
+##
+{ 'enum' : 'StatType',
+  'data' : [ 'cumulative', 'instant', 'peak',
+ 'linear-hist', 'log-hist', 'unknown' ] }

IMHO 'unknown' shouldn't exist at all.


I added the 'unknown' member here (and in other enums) to handle situations
where QEMU is behind KVM in terms of enumerating the various stat types,
units, etc. I feel this will be a semi-common scenario (old QEMU on a new
kernel) and with 'unknown', QEMU can at least display the raw data.

That said, I'm happy to skip 'unknown' entries if you think that's better.

Yep, I don't think we should be including 'unknown' stuff.

An application could use this, and then we add support for the
new type and the application will now break with new QEMU because
it is presented in QMP in a different way.

The same for the 'unknown' base and unit too for that matter.



Sounds good. I'll implement the other changes you suggested in v4.

Thanks/regards,
-Mark



Re: [PATCH v3 1/3] qmp: Support for querying stats

2022-02-03 Thread Mark Kanda

Thanks Daniel,

On 2/1/2022 6:08 AM, Daniel P. Berrangé wrote:

+#
+# Since: 7.0
+##
+{ 'enum' : 'StatType',
+  'data' : [ 'cumulative', 'instant', 'peak',
+ 'linear-hist', 'log-hist', 'unknown' ] }

IMHO 'unknown' shouldn't exist at all.



I added the 'unknown' member here (and in other enums) to handle situations 
where QEMU is behind KVM in terms of enumerating the various stat types, units, 
etc. I feel this will be a semi-common scenario (old QEMU on a new kernel) and 
with 'unknown', QEMU can at least display the raw data.


That said, I'm happy to skip 'unknown' entries if you think that's better.

Thanks/regards,
-Mark


[PATCH v3 1/3] qmp: Support for querying stats

2022-01-31 Thread Mark Kanda
Introduce QMP support for querying stats. Provide a framework for adding new
stats and support for the following commands:

- query-stats
Returns a list of all stats per target type (only VM and VCPU for now), with
additional options for specifying stat names, VCPU qom paths, and stat provider.

- query-stats-schemas
Returns a list of stats included in each schema type, with an option for
specifying the stat provider.

The framework provides a method to register callbacks for these QMP commands.

The first use-case will be for fd-based KVM stats (in an upcoming patch).

Examples (with fd-based KVM stats):

- Display all VM stats:

{ "execute": "query-stats", "arguments" : { "target": "vm" } }
{ "return": {
"list": [
  { "provider": "kvm",
"stats": [
  { "name": "max_mmu_page_hash_collisions", "value": 0 },
  { "name": "max_mmu_rmap_size", "value": 0 },
  { "name": "nx_lpage_splits", "value": 131 },
 ...
] }
  { "provider": "provider XYZ",
  ...
],
"target": "vm"
  }
}

- Display all VCPU stats:

{ "execute": "query-stats", "arguments" : { "target": "vcpu" } }
{ "return": {
"list": [
  { "list": [
  { "provider": "kvm",
"stats": [
  { "name": "guest_mode", "value": 0 },
  { "name": "directed_yield_successful", "value": 0  },
  { "name": "directed_yield_attempted", "value": 76 },
  ...
] }
  { "provider": "provider XYZ",
  ...
],
"path": "/machine/unattached/device[0]"
  },
  { "list": [
  { "provider": "kvm",
"stats": [
  { "name": "guest_mode", "value": 0 },
  { "name": "directed_yield_successful", "value": 0 },
  { "name": "directed_yield_attempted", "value": 51 },
  ...
  }
],
"target": "vcpu"
  }
}

- Display 'exits' and 'l1d_flush' KVM stats for VCPUs at 
'/machine/unattached/device[2]'
and '/machine/unattached/device[4]':

{ "execute": "query-stats",
  "arguments" : { "target": "vcpu",
  "fields": [ "exits", "l1d_flush" ],
  "paths": [ "/machine/unattached/device[2]",
  "/machine/unattached/device[4]" ]
  "provider": "kvm" } }

{ "return": {
"list": [
  { "list": [
  { "provider": "kvm",
"stats": [
  { "name": "l1d_flush", "value": 14690 },
  { "name": "exits", "value": 50898 }
] }
],
"path": "/machine/unattached/device[2]"
  },
  { "list": [
  { "provider": "kvm",
"stats": [
  { "name": "l1d_flush", "value": 24902 },
  { "name": "exits", "value": 74374 }
] }
 ],
"path": "/machine/unattached/device[4]"
  }
],
"target": "vcpu"
  }
}

- Query stats schemas:

{ "execute": "query-stats-schemas" }
{ "return": {
"vcpu": [
  { "provider": "kvm",
"stats": [
   { "name": "guest_mode",
 "unit": "none",
 "base": 10,
 "exponent": 0,
 "type": "instant" },
   { "name": "directed_yield_successful",
 "unit": "none",
 "base": 10,
 "exponent": 0,
 "type": "cumulative" },
 ...
"provider": "provider XYZ",
...
   "vm": [
  { "provider": "kvm",
"stats": [
   { "name": "max_mmu_page_hash_collisions",
 "unit": "none",
 "base": 10,
 "exponent": 0,

[PATCH v3 3/3] kvm: Support for querying fd-based stats

2022-01-31 Thread Mark Kanda
Add support for querying fd-based KVM stats - as introduced by Linux kernel
commit:

cb082bfab59a ("KVM: stats: Add fd-based API to read binary stats data")

Signed-off-by: Mark Kanda 
---
 accel/kvm/kvm-all.c | 308 
 qapi/misc.json  |   2 +-
 2 files changed, 309 insertions(+), 1 deletion(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 0e66ebb497..e43e1f1c1c 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -47,6 +47,8 @@
 #include "kvm-cpus.h"
 
 #include "hw/boards.h"
+#include "qapi/qapi-commands-misc.h"
+#include "monitor/stats.h"
 
 /* This check must be after config-host.h is included */
 #ifdef CONFIG_EVENTFD
@@ -2309,6 +2311,9 @@ bool kvm_dirty_ring_enabled(void)
 return kvm_state->kvm_dirty_ring_size ? true : false;
 }
 
+static void query_stats_cb(StatsResults *, StatsFilter *, Error **);
+static void query_stats_schemas_cb(StatsSchemaResult *, Error **);
+
 static int kvm_init(MachineState *ms)
 {
 MachineClass *mc = MACHINE_GET_CLASS(ms);
@@ -2637,6 +2642,11 @@ static int kvm_init(MachineState *ms)
 }
 }
 
+if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) {
+add_stats_callbacks(STATS_PROVIDER_KVM, &query_stats_cb,
+&query_stats_schemas_cb);
+}
+
 return 0;
 
 err:
@@ -3696,3 +3706,301 @@ static void kvm_type_init(void)
 }
 
 type_init(kvm_type_init);
+
+typedef struct StatsArgs {
+StatsFilter *filter;
+bool is_schema;
+union {
+StatsResults *stats;
+StatsSchemaResult *schema;
+} result;
+Error **errp;
+} StatsArgs;
+
+static StatsList *add_kvmstat_entry(struct kvm_stats_desc *pdesc,
+uint64_t *stats_data,
+StatsList *stats_list,
+Error **errp)
+{
+StatsList *stats_entry = g_malloc0(sizeof(*stats_entry));
+Stats *stats = g_malloc0(sizeof(*stats));
+uint64List *val_list = NULL;
+int i;
+
+stats->name = g_strdup(pdesc->name);
+stats->value = g_malloc0(sizeof(*stats->value));
+
+/* Alloc and populate data list */
+if (pdesc->size == 1) {
+stats->value->u.scalar = *stats_data;
+stats->value->type = QTYPE_QNUM;
+} else {
+for (i = 0; i < pdesc->size; i++) {
+uint64List *val_entry = g_malloc0(sizeof(*val_entry));
+val_entry->value = stats_data[i];
+val_entry->next = val_list;
+val_list = val_entry;
+}
+stats->value->u.list.list = val_list;
+stats->value->type = QTYPE_QDICT;
+}
+
+stats_entry->value = stats;
+stats_entry->next = stats_list;
+
+return stats_entry;
+}
+
+static StatsSchemaValueList *add_kvmschema_entry(struct kvm_stats_desc *pdesc,
+ StatsSchemaValueList *list,
+ Error **errp)
+{
+StatsSchemaValueList *schema_entry;
+
+schema_entry = g_malloc0(sizeof(*schema_entry));
+schema_entry->value = g_malloc0(sizeof(*schema_entry->value));
+schema_entry->value->name = g_strdup(pdesc->name);
+
+switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
+case KVM_STATS_TYPE_CUMULATIVE:
+schema_entry->value->type = STAT_TYPE_CUMULATIVE;
+break;
+case KVM_STATS_TYPE_INSTANT:
+schema_entry->value->type = STAT_TYPE_INSTANT;
+break;
+case KVM_STATS_TYPE_PEAK:
+schema_entry->value->type = STAT_TYPE_PEAK;
+break;
+case KVM_STATS_TYPE_LINEAR_HIST:
+schema_entry->value->type = STAT_TYPE_LINEAR_HIST;
+break;
+case KVM_STATS_TYPE_LOG_HIST:
+schema_entry->value->type = STAT_TYPE_LOG_HIST;
+break;
+default:
+schema_entry->value->type = STAT_TYPE_UNKNOWN;
+}
+
+switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
+case KVM_STATS_UNIT_NONE:
+schema_entry->value->unit = STAT_UNIT_NONE;
+break;
+case KVM_STATS_UNIT_BYTES:
+schema_entry->value->unit = STAT_UNIT_BYTES;
+break;
+case KVM_STATS_UNIT_CYCLES:
+schema_entry->value->unit = STAT_UNIT_CYCLES;
+break;
+case KVM_STATS_UNIT_SECONDS:
+schema_entry->value->unit = STAT_UNIT_SECONDS;
+break;
+default:
+schema_entry->value->unit = STAT_UNIT_UNKNOWN;
+}
+
+switch (pdesc->flags & KVM_STATS_BASE_MASK) {
+case KVM_STATS_BASE_POW10:
+schema_entry->value->base = STAT_BASE_POW10;
+break;
+case KVM_STATS_BASE_POW2:
+schema_entry->value->base = STAT_BASE_POW2;
+break;
+default:
+schema_entry->value->base = STAT_BASE_UNKNOWN;
+}
+
+schema_entry->val

[PATCH v3 2/3] hmp: Support for querying stats

2022-01-31 Thread Mark Kanda
Leverage the QMP support for querying stats. The interface supports the same
arguments as the QMP interface. Wildcard char (*) is accepted for names and
stats target.

Examples (with fd-based KVM stats):

- Display all stats
(qemu) info stats
vm
  provider: kvm
max_mmu_page_hash_collisions (peak): 0
max_mmu_rmap_size (peak): 0
...
vcpu (qom path: /machine/unattached/device[0])
  provider: kvm
guest_mode (instant): 0
directed_yield_successful (cumulative): 0
...

(qemu) info stats-schemas
vm
  provider: kvm
max_mmu_page_hash_collisions (peak)
max_mmu_rmap_size (peak)
...
vcpu
  provider: kvm
guest_mode (instant)
directed_yield_successful (cumulative)

- Display 'halt_wait_ns' and 'exits' for vCPUs with qom paths
/machine/unattached/device[2] and /machine/unattached/device[4]:

(qemu) info stats exits,halt_wait_ns /machine/unattached/device[2],
/machine/unattached/device[4]

vcpu (qom path: /machine/unattached/device[2])
  provider: kvm
exits (cumulative): 52369
halt_wait_ns (cumulative nanoseconds): 416092704390
vcpu (qom path: /machine/unattached/device[4])
  provider: kvm
exits (cumulative): 52550
halt_wait_ns (cumulative nanoseconds): 419637402657

- Display all VM stats for provider KVM:

(qemu) info stats * vm kvm
vm
  provider: kvm
max_mmu_page_hash_collisions (peak): 0
max_mmu_rmap_size (peak): 0
nx_lpage_splits (instant): 51
...

Signed-off-by: Mark Kanda 
---
 hmp-commands-info.hx  |  28 
 include/monitor/hmp.h |   2 +
 monitor/hmp-cmds.c| 288 ++
 3 files changed, 318 insertions(+)

diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index e90f20a107..7365a8e002 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -879,3 +879,31 @@ SRST
   ``info sgx``
 Show intel SGX information.
 ERST
+
+{
+.name   = "stats",
+.args_type  = "names:s?,paths:s?,provider:s?",
+.params = "[names] [paths] [provider]",
+.help   = "show statistics; optional comma separated names, "
+ "vcpu qom paths, and provider",
+.cmd= hmp_info_stats,
+},
+
+SRST
+  ``stats``
+Show stats
+ERST
+
+{
+.name   = "stats-schemas",
+.args_type  = "provider:s?",
+.params = "[provider]",
+.help   = "show statistics schema for each provider",
+.cmd= hmp_info_stats_schemas,
+},
+
+SRST
+  ``stats-schemas``
+Show stats per schema
+ERST
+
diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
index 96d014826a..a748511105 100644
--- a/include/monitor/hmp.h
+++ b/include/monitor/hmp.h
@@ -133,5 +133,7 @@ void hmp_info_dirty_rate(Monitor *mon, const QDict *qdict);
 void hmp_calc_dirty_rate(Monitor *mon, const QDict *qdict);
 void hmp_human_readable_text_helper(Monitor *mon,
 HumanReadableText *(*qmp_handler)(Error **));
+void hmp_info_stats(Monitor *mon, const QDict *qdict);
+void hmp_info_stats_schemas(Monitor *mon, const QDict *qdict);
 
 #endif
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index 8c384dc1b2..19b2a585d6 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -2178,3 +2178,291 @@ void hmp_info_memory_size_summary(Monitor *mon, const QDict *qdict)
 }
 hmp_handle_error(mon, err);
 }
+
+static void print_stats_schema_value(Monitor *mon, StatsSchemaValue *value)
+{
+monitor_printf(mon, "%s (%s", value->name, StatType_str(value->type));
+
+if (value->unit == STAT_UNIT_SECONDS &&
+value->exponent >= -9 && value->exponent <= 0 &&
+value->exponent % 3 == 0 && value->base == STAT_BASE_POW10) {
+
+const char *si_prefix[] = { "", "milli", "micro", "nano" };
+monitor_printf(mon, " %s", si_prefix[value->exponent / -3]);
+
+} else if (value->unit == STAT_UNIT_BYTES &&
+value->exponent >= 0 && value->exponent <= 40 &&
+value->exponent % 10 == 0 && value->base == STAT_BASE_POW2) {
+
+const char *si_prefix[] = {
+"", "kilo", "mega", "giga", "tera" };
+monitor_printf(mon, " %s", si_prefix[value->exponent / 10]);
+
+} else if (value->exponent) {
+/* Print the base and exponent as "x ^" */
+monitor_printf(mon, " x %s^%d", StatBase_str(value->base),
+   value->exponent);
+}
+
+/* Don't print "none" unit type */
+monitor_printf(mon, "%s)", value->unit == STAT_UNIT_NONE ?
+   "" : StatUnit_str(value->unit));
+}
+
+static StatsSchemaValueList *fi

[PATCH v3 0/3] Support fd-based KVM stats

2022-01-31 Thread Mark Kanda
v3:
- various QMP API enhancements from v2 review [1] [Daniel, Paolo, Igor]
- general cleanup

v2: [Paolo]
- generalize the interface
- add support for querying stat schema and instances
- add additional HMP semantic processing for a few exponent/unit
   combinations (related to seconds and bytes)

This patchset adds QEMU support for querying fd-based KVM stats. The
kernel support was introduced by:

cb082bfab59a ("KVM: stats: Add fd-based API to read binary stats data")

[1] https://lore.kernel.org/all/2029195153.11815-1-mark.ka...@oracle.com/

Mark Kanda (3):
  qmp: Support for querying stats
  hmp: Support for querying stats
  kvm: Support for querying fd-based stats

 accel/kvm/kvm-all.c | 308 
 hmp-commands-info.hx|  28 
 include/monitor/hmp.h   |   2 +
 include/monitor/stats.h |  36 +
 monitor/hmp-cmds.c  | 288 +
 monitor/qmp-cmds.c  | 183 
 qapi/misc.json  | 253 +
 7 files changed, 1098 insertions(+)
 create mode 100644 include/monitor/stats.h

-- 
2.27.0




[PATCH 2/4] softmmu/cpus: Free cpu->halt_cond in cpu_remove_sync()

2022-01-26 Thread Mark Kanda
vCPU hotunplug related leak reported by Valgrind:

==377357== 56 bytes in 1 blocks are definitely lost in loss record 5,017 of 8,471
==377357==    at 0x4C3ADBB: calloc (vg_replace_malloc.c:1117)
==377357==    by 0x65C14CD: g_malloc0 (in /usr/lib64/libglib-2.0.so.0.5600.4)
==377357==    by 0x8B5AE0: kvm_start_vcpu_thread (kvm-accel-ops.c:69)
==377357==    by 0x7817AF: qemu_init_vcpu (cpus.c:634)
==377357==    by 0x7185A3: x86_cpu_realizefn (cpu.c:6447)
==377357==    by 0x8E46B7: device_set_realized (qdev.c:531)
==377357==    by 0x8EE36F: property_set_bool (object.c:2268)
==377357==    by 0x8EC3C5: object_property_set (object.c:1403)
==377357==    by 0x8F075D: object_property_set_qobject (qom-qobject.c:28)
==377357==    by 0x8EC72C: object_property_set_bool (object.c:1472)
==377357==    by 0x8E3F7F: qdev_realize (qdev.c:333)
==377357==    by 0x43F3A2: qdev_device_add_from_qdict (qdev-monitor.c:711)

Signed-off-by: Mark Kanda 
---
 softmmu/cpus.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index 1d8380d4aa..edaa36f6dc 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -604,6 +604,7 @@ void cpu_remove_sync(CPUState *cpu)
 qemu_thread_join(cpu->thread);
 qemu_mutex_lock_iothread();
 g_free(cpu->thread);
+g_free(cpu->halt_cond);
 }
 
 void cpus_register_accel(const AccelOpsClass *ops)
-- 
2.27.0




[PATCH 1/4] softmmu/cpus: Free cpu->thread in cpu_remove_sync()

2022-01-26 Thread Mark Kanda
vCPU hotunplug related leak reported by Valgrind:

==377357== 8 bytes in 1 blocks are definitely lost in loss record 1,029 of 8,471
==377357==    at 0x4C3ADBB: calloc (vg_replace_malloc.c:1117)
==377357==    by 0x65C14CD: g_malloc0 (in /usr/lib64/libglib-2.0.so.0.5600.4)
==377357==    by 0x8B5AC8: kvm_start_vcpu_thread (kvm-accel-ops.c:68)
==377357==    by 0x7817AF: qemu_init_vcpu (cpus.c:634)
==377357==    by 0x7185A3: x86_cpu_realizefn (cpu.c:6447)
==377357==    by 0x8E46B7: device_set_realized (qdev.c:531)
==377357==    by 0x8EE36F: property_set_bool (object.c:2268)
==377357==    by 0x8EC3C5: object_property_set (object.c:1403)
==377357==    by 0x8F075D: object_property_set_qobject (qom-qobject.c:28)
==377357==    by 0x8EC72C: object_property_set_bool (object.c:1472)
==377357==    by 0x8E3F7F: qdev_realize (qdev.c:333)
==377357==    by 0x43F3A2: qdev_device_add_from_qdict (qdev-monitor.c:711)

Signed-off-by: Mark Kanda 
---
 softmmu/cpus.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index 23bca46b07..1d8380d4aa 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -603,6 +603,7 @@ void cpu_remove_sync(CPUState *cpu)
 qemu_mutex_unlock_iothread();
 qemu_thread_join(cpu->thread);
 qemu_mutex_lock_iothread();
+g_free(cpu->thread);
 }
 
 void cpus_register_accel(const AccelOpsClass *ops)
-- 
2.27.0




[PATCH 0/4] vCPU hotunplug related memory leaks

2022-01-26 Thread Mark Kanda
This series addresses a few vCPU hotunplug related leaks (found with Valgrind).

Mark Kanda (4):
  softmmu/cpus: Free cpu->thread in cpu_remove_sync()
  softmmu/cpus: Free cpu->halt_cond in cpu_remove_sync()
  cpu: Free cpu->cpu_ases in cpu_exec_unrealizefn()
  i386/cpu: Free env->xsave_buf in x86_cpu_unrealizefn()

 cpu.c | 1 +
 softmmu/cpus.c| 2 ++
 target/i386/cpu.c | 2 ++
 3 files changed, 5 insertions(+)

-- 
2.27.0




[PATCH 3/4] cpu: Free cpu->cpu_ases in cpu_exec_unrealizefn()

2022-01-26 Thread Mark Kanda
vCPU hotunplug related leak reported by Valgrind:

==377357== 216 bytes in 1 blocks are definitely lost in loss record 7,042 of 8,471
==377357==    at 0x4C3ADBB: calloc (vg_replace_malloc.c:1117)
==377357==    by 0x65C14CD: g_malloc0 (in /usr/lib64/libglib-2.0.so.0.5600.4)
==377357==    by 0x78552E: cpu_address_space_init (physmem.c:750)
==377357==    by 0x78175B: qemu_init_vcpu (cpus.c:629)
==377357==    by 0x7185A3: x86_cpu_realizefn (cpu.c:6447)
==377357==    by 0x8E46B7: device_set_realized (qdev.c:531)
==377357==    by 0x8EE36F: property_set_bool (object.c:2268)
==377357==    by 0x8EC3C5: object_property_set (object.c:1403)
==377357==    by 0x8F075D: object_property_set_qobject (qom-qobject.c:28)
==377357==    by 0x8EC72C: object_property_set_bool (object.c:1472)
==377357==    by 0x8E3F7F: qdev_realize (qdev.c:333)
==377357==    by 0x43F3A2: qdev_device_add_from_qdict (qdev-monitor.c:711)

Signed-off-by: Mark Kanda 
---
 cpu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpu.c b/cpu.c
index 016bf06a1a..d5c730164a 100644
--- a/cpu.c
+++ b/cpu.c
@@ -170,6 +170,7 @@ void cpu_exec_unrealizefn(CPUState *cpu)
 if (tcg_enabled()) {
 tcg_exec_unrealizefn(cpu);
 }
+g_free(cpu->cpu_ases);
 
 cpu_list_remove(cpu);
 }
-- 
2.27.0




[PATCH 4/4] i386/cpu: Free env->xsave_buf in x86_cpu_unrealizefn()

2022-01-26 Thread Mark Kanda
vCPU hotunplug related leak reported by Valgrind:

==377357== 4,096 bytes in 1 blocks are definitely lost in loss record 8,354 of 8,471
==377357==    at 0x4C3B15F: memalign (vg_replace_malloc.c:1265)
==377357==    by 0x4C3B288: posix_memalign (vg_replace_malloc.c:1429)
==377357==    by 0xAA4773: qemu_try_memalign (oslib-posix.c:222)
==377357==    by 0xAA47E5: qemu_memalign (oslib-posix.c:238)
==377357==    by 0x6C403D: kvm_arch_init_vcpu (kvm.c:1986)
==377357==    by 0x8AEB01: kvm_init_vcpu (kvm-all.c:516)
==377357==    by 0x8B59EA: kvm_vcpu_thread_fn (kvm-accel-ops.c:40)
==377357==    by 0xAA72F0: qemu_thread_start (qemu-thread-posix.c:556)
==377357==    by 0x8EE8159: start_thread (in /usr/lib64/libpthread-2.28.so)
==377357==    by 0x91FCDD2: clone (in /usr/lib64/libc-2.28.so)

Signed-off-by: Mark Kanda 
---
 target/i386/cpu.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index aa9e636800..33405d245d 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -6487,6 +6487,7 @@ static void x86_cpu_unrealizefn(DeviceState *dev)
 {
 X86CPU *cpu = X86_CPU(dev);
 X86CPUClass *xcc = X86_CPU_GET_CLASS(dev);
+CPUX86State *env = &cpu->env;
 
 #ifndef CONFIG_USER_ONLY
 cpu_remove_sync(CPU(dev));
@@ -6499,6 +6500,7 @@ static void x86_cpu_unrealizefn(DeviceState *dev)
 }
 
 xcc->parent_unrealize(dev);
+g_free(env->xsave_buf);
 }
 
 typedef struct BitProperty {
-- 
2.27.0




Re: [PATCH v2 1/3] qmp: Support for querying stats

2022-01-18 Thread Mark Kanda

On 1/18/2022 6:52 AM, Daniel P. Berrangé wrote:

On Tue, Jan 18, 2022 at 01:26:32PM +0100, Paolo Bonzini wrote:

On 1/17/22 16:17, Mark Kanda wrote:

I agree except that I think this and StatsResults should be unions,
even if it means running multiple query-stats commands.

IIUC, making StatsResults a union implies the filter is a required
argument (currently it is optional - omitting it dumps all VM and VCPU
stats). Just to confirm - we want the filter to be required?

Yeah, I think at least the "kind" (vcpu, vm, perhaps in the future block or
net) should be mandatory.  If the caller doesn't know of a "kind", chances
are it won't be able to understand what object the stats refer to, for
example the vcpu "id" here:

{ 'union': 'StatsResults',
'base': { 'target': 'StatsTarget', stats: ['StatsResultsEntry'] },
'discriminator': 'target',
'data': { 'vcpu': ['id': 'int'] }
}

(which is another different between Daniel's proposal and mine; his just
placed all vcpus into an array with no explicit id, if I understand
correctly).

An explicit ID isn't strictly required, since the caller can assume
the results are ordered on CPU ID, so even if they gave a request
for a sparse subset of CPUs, the results can be interpreted.  None
the less having a vCPU id included is more friendly, so worth
having.


Regards,
Daniel

OK. Thank you Daniel and Paolo. I'll implement these changes for v3.

Best regards,
-Mark



Re: [PATCH v2 1/3] qmp: Support for querying stats

2022-01-17 Thread Mark Kanda

On 1/17/2022 6:05 AM, Paolo Bonzini wrote:

On 12/7/21 19:42, Daniel P. Berrangé wrote:

Now onto the values being reported. AFAICT from the kernel
docs, for all the types of data it currently reports
(cumulative, instant, peak, none), there is only ever going
to be a single value. I assume the ability to report multiple
values is future proofing for a later requirement.


Yes, in fact histogram values have since been added.


Second, for a given named statistic, AFAICT, the data type,
unit, base and exponent are all fixed. I don't see a reason
for us to be reporting that information every time we call
'query-stats'. Just report the name + value(s).  Apps that
want a specific named stat won't care about the dimensions,
because they'll already know what the value means.


I agree on this.


The 'name' and 'type' are used for filtering, I presume. Only allowing
a single value for each feels pretty inflexible. I'd say we want to
allow multiple requests at a time for efficiency.

Bearing in mind my other suggestions above, I'd think we should have
something more like

  { 'enum': 'StatsProvider',
    'data': ["kvm", "qemu", "tcg", ],
  }

  { 'struct': 'StatsRequest',
    'data': {
   'provider': 'StatsProvider',
   // If omitted, report everything for this provider
   '*fields': [ 'str' ]


I think provider should be optional as well.  See below.


    }
  }

  { 'struct': 'StatsVCPURequest',
    'base': 'StatsRequest',
    'data': {
  // To request subset of vCPUs e.g.
  //  "cpu_set": "0-3"
  // Could use ['int'] instead if desired
  '*cpu_set': str,


Yes, ['int'] is preferrable.


    },
  }

  { 'struct': 'StatsFilter',
    'data': {
  // If omitted means don't report that group of data
  '*vcpu': 'StatsVCPURequest',
  '*vm': 'StatsRequest',
    },
  }


I agree except that I think this and StatsResults should be unions, even if it 
means running multiple query-stats commands. 


IIUC, making StatsResults a union implies the filter is a required argument 
(currently it is optional - omitting it dumps all VM and VCPU stats). Just to 
confirm - we want the filter to be required?


Thanks/regards,
-Mark

There should also be an enum ['vcpu', 'vm'] that acts as the discriminator for 
both StatsFilter and StatsResults:


 { 'enum': 'StatsTarget',
   'data': [ 'vcpu', 'vm' ] }

 { 'union': 'StatsFilter',
   'base': { 'target': 'StatsTarget', '*providers': ['StatsProvider'] },
   'discriminator': 'target',
   'data': { 'vcpu': ['*cpu-set': ['int']] }
}

 { 'union': 'StatsResults',
   'base': { 'target': 'StatsTarget', stats: ['StatsResultsEntry'] },
   'discriminator': 'target',
   'data': { 'vcpu': ['id': 'int'] }
}

Alternatively, the providers should simply be keys in the dictionary

Paolo



  { 'alternate': 'StatsValue',
    'data': { 'scalar': 'int64',
  'list': [ 'int64 ' ] }
  }

  { 'struct': 'StatsResultsEntry',
    'data': {
  'provider': 'StatsProvider',
  'stats': [ 'StatsValue' ]
    }
  }

  { 'struct': 'StatsResults':
    'data': {
  '*vcpu': [ [ 'StatsResultsEntry' ] ],
  '*vm': [ 'StatsResultsEntry' ]
    }
  }

  { 'command': 'query-stats',
    'data': { 'filter': '*StatsFilter' },
    'returns': 'StatsResults' }
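
As a concrete illustration of the StatsValue alternate in the sketch above
(the histogram name and all numbers here are made up), the two arms would
look like this on the wire:

/* scalar, e.g. a cumulative counter:
 *     { "name": "exits", "value": 74291 }
 * list, e.g. the buckets of a histogram stat:
 *     { "name": "some_hist", "value": [ 12, 3, 0, 7 ] }
 */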





Re: [PATCH v2 0/3] Support fd-based KVM stats

2022-01-16 Thread Mark Kanda

(+ Daniel and Markus)

Thank you Paolo,

I'm in the midst of implementing various API changes as requested by Daniel [1] 
and was planning to send out v3 this week. Could you please take a look at his 
response and comment on the proposal? Or, perhaps I should publish v3 (based on 
Daniel's proposal) and you could review that? Please let me know your preference.


Thanks/regards,
-Mark

[1] https://lore.kernel.org/qemu-devel/ya+rlex1dju%2f1...@redhat.com/ 
<https://lore.kernel.org/qemu-devel/ya+rlex1dju%2f1...@redhat.com/>


On 1/15/2022 10:22 AM, Paolo Bonzini wrote:

On 11/19/21 20:51, Mark Kanda wrote:

v2: [Paolo]
- generalize the interface
- add support for querying stat schema and instances
- add additional HMP semantic processing for a few exponent/unit
   combinations (related to seconds and bytes)

This patchset adds QEMU support for querying fd-based KVM stats. The
kernel support was introduced by:

cb082bfab59a ("KVM: stats: Add fd-based API to read binary stats data")


Hi Mark,

sorry for the late review.  Fortunately there's very little that I'd change.

In particular:

* please change the callbacks to accept a NULL name and type, instead of
having the "bool"/"const char *" pair.  You could probably benefit from
adding a function to cutils.c like


    bool qemu_match_string(const char *value, const char *request) {
    return !request || g_str_equal(value, request);
    }

* please pass a single const struct to add_stats_callbacks, using GList so 
that the struct can be const.


Putting both together it would be something like:

typedef struct StatsCallbacks {
    char *name;
    StatsList *(*get_values)(StatsList *list, const char *name,
   const char *type, Error **errp);
    StatsSchemaList *(*get_schemas)(StatsSchemaList *list,
    const char *name, Error **errp);
    StatsInstanceList *(*get_instances)(StatsInstanceList *list,
    Error **errp);
} StatsCallbacks;
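
A sketch of what registration would then look like under this suggestion
(illustration only, untested; the kvm_get_* names are placeholders):

static StatsList *kvm_get_values(StatsList *list, const char *name,
                                 const char *type, Error **errp);
static StatsSchemaList *kvm_get_schemas(StatsSchemaList *list,
                                        const char *name, Error **errp);
static StatsInstanceList *kvm_get_instances(StatsInstanceList *list,
                                            Error **errp);

static const StatsCallbacks kvm_stats_callbacks = {
    .name          = (char *)"kvm",
    .get_values    = kvm_get_values,
    .get_schemas   = kvm_get_schemas,
    .get_instances = kvm_get_instances,
};

/* appended to a GList internally, so the struct itself can stay const */
add_stats_callbacks(&kvm_stats_callbacks);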

Finally, please put everything in a new header include/monitor/stats.h, so
that we can document everything and place it in docs/devel.  I can take care
of that though.


Thanks,

Paolo



Mark Kanda (3):
   qmp: Support for querying stats
   hmp: Support for querying stats
   kvm: Support for querying fd-based stats

  accel/kvm/kvm-all.c   | 399 ++
  hmp-commands-info.hx  |  40 
  include/monitor/hmp.h |   3 +
  include/monitor/monitor.h |  27 +++
  monitor/hmp-cmds.c    | 125 
  monitor/qmp-cmds.c    |  71 +++
  qapi/misc.json    | 142 ++
  7 files changed, 807 insertions(+)





[PATCH v2 0/3] Support fd-based KVM stats

2021-11-19 Thread Mark Kanda
v2: [Paolo]
- generalize the interface
- add support for querying stat schema and instances
- add additional HMP semantic processing for a few exponent/unit
  combinations (related to seconds and bytes)

This patchset adds QEMU support for querying fd-based KVM stats. The
kernel support was introduced by:

cb082bfab59a ("KVM: stats: Add fd-based API to read binary stats data")

Mark Kanda (3):
  qmp: Support for querying stats
  hmp: Support for querying stats
  kvm: Support for querying fd-based stats

 accel/kvm/kvm-all.c   | 399 ++
 hmp-commands-info.hx  |  40 
 include/monitor/hmp.h |   3 +
 include/monitor/monitor.h |  27 +++
 monitor/hmp-cmds.c| 125 
 monitor/qmp-cmds.c|  71 +++
 qapi/misc.json| 142 ++
 7 files changed, 807 insertions(+)

-- 
2.26.2




[PATCH v2 3/3] kvm: Support for querying fd-based stats

2021-11-19 Thread Mark Kanda
Add support for querying fd-based KVM stats - as introduced by Linux
kernel commit:

cb082bfab59a ("KVM: stats: Add fd-based API to read binary stats data")

Signed-off-by: Mark Kanda 
---
 accel/kvm/kvm-all.c | 399 
 qapi/misc.json  |   2 +-
 2 files changed, 400 insertions(+), 1 deletion(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index eecd8031cf..10e8b8ed5c 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -47,6 +47,8 @@
 #include "kvm-cpus.h"
 
 #include "hw/boards.h"
+#include "qapi/qapi-commands-misc.h"
+#include "monitor/monitor.h"
 
 /* This check must be after config-host.h is included */
 #ifdef CONFIG_EVENTFD
@@ -2303,6 +2305,15 @@ bool kvm_dirty_ring_enabled(void)
 return kvm_state->kvm_dirty_ring_size ? true : false;
 }
 
+static StatsList *query_stats_cb(StatsList *, bool, const char *, bool,
+ const char *, Error **);
+
+static StatsSchemaList *query_stats_schemas_cb(StatsSchemaList *, bool,
+   const char *, Error **);
+
+static StatsInstanceList *query_stats_instances_cb(StatsInstanceList *,
+   Error **);
+
 static int kvm_init(MachineState *ms)
 {
 MachineClass *mc = MACHINE_GET_CLASS(ms);
@@ -2612,6 +2623,11 @@ static int kvm_init(MachineState *ms)
 }
 }
 
+if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) {
+add_stats_callbacks("kvm", _stats_cb, _stats_schemas_cb,
+_stats_instances_cb);
+}
+
 return 0;
 
 err:
@@ -3667,3 +3683,386 @@ static void kvm_type_init(void)
 }
 
 type_init(kvm_type_init);
+
+typedef struct StatsArgs {
+void *kvm_stat;
+char *name;
+bool query_schema;
+Error **errp;
+} StatsArgs;
+
+static StatDataList *add_kvmstat_entry(struct kvm_stats_desc *pdesc,
+   uint64_t *stats_data,
+   StatDataList *data_list,
+   Error **errp)
+{
+StatDataList *data_entry;
+uint64List *val_list = NULL;
+int i;
+
+data_entry = g_malloc0(sizeof(*data_entry));
+data_entry->value = g_malloc0(sizeof(*data_entry->value));
+data_entry->value->name = g_strdup(pdesc->name);
+
+/* Convert flags to type, unit and base (QAPI auto-generated enums) */
+switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
+case KVM_STATS_TYPE_CUMULATIVE:
+data_entry->value->type = STAT_TYPE_CUMULATIVE;
+break;
+case KVM_STATS_TYPE_INSTANT:
+data_entry->value->type = STAT_TYPE_INSTANT;
+break;
+case KVM_STATS_TYPE_PEAK:
+data_entry->value->type = STAT_TYPE_PEAK;
+break;
+default:
+/* Unknown type - skip */
+goto exit;
+}
+
+switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
+case KVM_STATS_UNIT_NONE:
+data_entry->value->unit = STAT_UNIT_NONE;
+break;
+case KVM_STATS_UNIT_BYTES:
+data_entry->value->unit = STAT_UNIT_BYTES;
+break;
+case KVM_STATS_UNIT_CYCLES:
+data_entry->value->unit = STAT_UNIT_CYCLES;
+break;
+case KVM_STATS_UNIT_SECONDS:
+data_entry->value->unit = STAT_UNIT_SECONDS;
+break;
+default:
+/* Unknown unit - skip */
+goto exit;
+}
+
+switch (pdesc->flags & KVM_STATS_BASE_MASK) {
+case KVM_STATS_BASE_POW10:
+data_entry->value->base = 10;
+break;
+case  KVM_STATS_BASE_POW2:
+data_entry->value->base = 2;
+break;
+default:
+/* Unknown base - skip */
+goto exit;
+}
+
+data_entry->value->exponent = pdesc->exponent;
+
+/* Alloc and populate data list */
+for (i = 0; i < pdesc->size; i++) {
+uint64List *val_entry = g_malloc0(sizeof(*val_entry));
+val_entry->value = stats_data[i];
+val_entry->next = val_list;
+val_list = val_entry;
+}
+data_entry->value->val = val_list;
+data_entry->next = data_list;
+data_list = data_entry;
+
+return data_list;
+
+exit:
+g_free(data_entry->value->name);
+g_free(data_entry->value);
+g_free(data_entry);
+
+return data_list;
+}
+
+static StatSchemaEntryList *add_kvmschema_entry(struct kvm_stats_desc *pdesc,
+   StatSchemaEntryList *data_list,
+   Error **errp)
+{
+StatSchemaEntryList *data_entry;
+
+data_entry = g_malloc0(sizeof(*data_entry));
+data_entry->value = g_malloc0(sizeof(*data_entry->value));
+data_entry->value->name = g_strdup(pdesc->name);
+
+data_entry->next = data_list;
+data_list = data_entry;
+
+  

[PATCH v2 2/3] hmp: Support for querying stats

2021-11-19 Thread Mark Kanda
Leverage the QMP support for querying stats. The interface supports
the same arguments as the QMP interface.

Examples (with fd-based KVM stats):
(qemu) info stats

vcpu_1 (kvm-vcpu):
  guest_mode (instant): 0
  directed_yield_successful (cumulative): 0
...
vcpu_0 (kvm-vcpu):
  guest_mode (instant): 0
...
vm (kvm-vm):
  max_mmu_page_hash_collisions (peak): 0
  max_mmu_rmap_size (peak): 0
...

(qemu) info stats-schemas

kvm-vcpu:
  guest_mode
  directed_yield_successful
...
kvm-vm:
  max_mmu_page_hash_collisions
  max_mmu_rmap_size
...

(qemu) info stats-instances
name    type

vcpu_1  kvm-vcpu
vcpu_0  kvm-vcpu
vm  kvm-vm

Signed-off-by: Mark Kanda 
---
 hmp-commands-info.hx  |  40 ++
 include/monitor/hmp.h |   3 +
 monitor/hmp-cmds.c| 125 ++
 3 files changed, 168 insertions(+)

diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index 407a1da800..0d5f025adb 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -877,3 +877,43 @@ SRST
   ``info sgx``
 Show intel SGX information.
 ERST
+
+{
+.name   = "stats",
+.args_type  = "name:s?,type:s?",
+.params = "[name] [type]",
+.help   = "show statistics; optional name and type",
+.cmd= hmp_info_stats,
+},
+
+SRST
+  ``stats``
+Show stats
+ERST
+
+{
+.name   = "stats-schemas",
+.args_type  = "type:s?",
+.params = "[type]",
+.help   = "show stats for schema type; optional type",
+.cmd= hmp_info_stats_schemas,
+},
+
+SRST
+  ``stats-schemas``
+Show stats for schema type
+ERST
+
+
+{
+.name   = "stats-instances",
+.args_type  = "",
+.params = "",
+.help   = "show current stat instances",
+.cmd= hmp_info_stats_instances,
+},
+
+SRST
+  ``stats-instances``
+Show current stat instances
+ERST
diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
index 96d014826a..3670280a6b 100644
--- a/include/monitor/hmp.h
+++ b/include/monitor/hmp.h
@@ -133,5 +133,8 @@ void hmp_info_dirty_rate(Monitor *mon, const QDict *qdict);
 void hmp_calc_dirty_rate(Monitor *mon, const QDict *qdict);
 void hmp_human_readable_text_helper(Monitor *mon,
 HumanReadableText *(*qmp_handler)(Error **));
+void hmp_info_stats(Monitor *mon, const QDict *qdict);
+void hmp_info_stats_schemas(Monitor *mon, const QDict *qdict);
+void hmp_info_stats_instances(Monitor *mon, const QDict *qdict);
 
 #endif
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index 9c91bf93e9..9752ca6aa8 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -2165,3 +2165,128 @@ void hmp_info_memory_size_summary(Monitor *mon, const QDict *qdict)
 }
 hmp_handle_error(mon, err);
 }
+
+void hmp_info_stats(Monitor *mon, const QDict *qdict)
+{
+StatsList *stats_list, *stats_list_entry;
+Stats *stats_entry;
+StatDataList *data_entry;
+StatData *stat;
+uint64List *val;
+const char *name, *type;
+Error *err = NULL;
+
+name = qdict_get_try_str(qdict, "name");
+type = qdict_get_try_str(qdict, "type");
+
+stats_list = qmp_query_stats(name ? TRUE : FALSE, name,
+ type ? TRUE : FALSE, type, &err);
+
+if (err) {
+monitor_printf(mon, "%s\n", error_get_pretty(err));
+error_free(err);
+return;
+}
+
+for (stats_list_entry = stats_list; stats_list_entry;
+ stats_list_entry = stats_list_entry->next) {
+stats_entry = stats_list_entry->value;
+monitor_printf(mon, "\n%s (%s):\n", stats_entry->name,
+   StatSchemaType_str(stats_entry->type));
+
+for (data_entry = stats_entry->stats; data_entry;
+ data_entry = data_entry->next) {
+stat = data_entry->value;
+monitor_printf(mon, "  %s (%s):", stat->name,
+   StatType_str(stat->type));
+
+for (val = stat->val; val; val = val->next) {
+if (stat->unit == STAT_UNIT_SECONDS &&
+stat->exponent >= -9 && stat->exponent <= 0 &&
+stat->exponent % 3 == 0 && stat->base == 10) {
+const char *si_prefix[] = { "", "milli", "micro", "nano" };
+monitor_printf(mon, " %lu %s", val->value,
+   si_prefix[stat->exponent / -3]);
+} else if (
+stat->unit == STAT_UNIT_BYTES &&
+stat->exponent >= 0 &&a

[PATCH v2 1/3] qmp: Support for querying stats

2021-11-19 Thread Mark Kanda
Introduce QMP support for querying stats. Provide a framework for
adding new stats and support for the following commands:

- query-stats
Returns a list of all stats, with options for specifying a stat
name and schema type. A schema type is the set of stats associated
with a given component (e.g. vm or vcpu).

- query-stats-schemas
Returns a list of stats included in each schema type, with an
option for specifying the schema name.

- query-stats-instances
Returns a list of stat instances and their associated schema type.

The framework provides a method to register callbacks for these QMP
commands.

The first use-case will be for fd-based KVM stats (in an upcoming
patch).

Examples (with fd-based KVM stats):

{ "execute": "query-stats" }
{ "return": [
{ "name": "vcpu_1",
  "type": "kvm-vcpu",
  "stats": [
{ "name": "guest_mode",
  "unit": "none",
  "base": 10,
  "val": [ 0 ],
  "exponent": 0,
  "type": "instant" },
{ "name": "directed_yield_successful",
  "unit": "none",
  "base": 10,
  "val": [ 0 ],
  "exponent": 0,
  "type": "cumulative" },
...
},
{ "name": "vcpu_0",
  "type": "kvm-vcpu",
  "stats": ...
...
 },
{ "name": "vm",
  "type": "kvm-vm",
  "stats": [
{ "name": "max_mmu_page_hash_collisions",
  "unit": "none",
  "base": 10,
  "val": [ 0 ],
  "exponent": 0,
  "type": "peak" },
  ...

{ "execute": "query-stats-schemas" }
{ "return": [
{ "type": "kvm-vcpu",
  "stats": [
{ "name": "guest_mode" },
{ "name": "directed_yield_successful" },
...
},
{ "type": "kvm-vm",
  "stats": [
{ "name": "max_mmu_page_hash_collisions" },
{ "name": "max_mmu_rmap_size" },
...

{ "execute": "query-stats-instances" }
{ "return": [
{ "name": "vcpu_1",
  "type": "kvm-vcpu" },
{ "name": "vcpu_0",
  "type": "kvm-vcpu" },
{ "name": "vm",
  "type": "kvm-vm" } ]
}

Signed-off-by: Mark Kanda 
---
 include/monitor/monitor.h |  27 
 monitor/qmp-cmds.c|  71 +++
 qapi/misc.json| 142 ++
 3 files changed, 240 insertions(+)

diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
index 12d395d62d..14d3432ade 100644
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -56,4 +56,31 @@ void monitor_register_hmp(const char *name, bool info,
 void monitor_register_hmp_info_hrt(const char *name,
 HumanReadableText *(*handler)(Error **errp));
 
+/*
+ * Add qmp stats callbacks to the stats_callbacks list.
+ *
+ * @name: name of stats callbacks
+ * @stats_fn: routine to query stats - with options for name and type:
+ *StatsList *(*stats_fn)(StatsList *list_tail, bool has_name,
+ *const char *name, bool has_type, const char *type, Error **errp)
+ *
+ * @schema_fn: routine to query stat schemas - with an option for type:
+ *StatsSchemaList *(*schemas_fn)(StatsSchemaList *list_tail, bool has_type,
+ *   const char *type, Error **errp)
+ *
+ * @instance_fn: routine to query stat instances:
+ * StatsInstanceList *(*instances_fn)(StatsInstanceList *list_tail,
+ *Error **errp)
+ */
+void add_stats_callbacks(const char *name,
+ StatsList *(*stats_fn)(StatsList *,
+bool, const char *,
+bool, const char *,
+Error **),
+ StatsSchemaList *(*schemas_fn)(StatsSchemaList *,
+bool, const char *,
+Error **),
+ StatsInstanceList *(*instances_fn)(StatsInstanceList *,
+Error **));
+
 #endif /* MONITOR_H */
diff --git a/monitor/qmp-cmds.c b/monitor/qmp-cmds.c
index 343353e27a..c7bdff1e1c 100644
--- a/monitor/qmp-cmds.c

[PATCH 1/2] qmp: Support fd-based KVM stats query

2021-10-19 Thread Mark Kanda
Introduce a QMP command 'query-kvmstats' to query KVM for vm and vcpu
fd-based statistics. The kernel support is provided by commit:

cb082bfab59a ("KVM: stats: Add fd-based API to read binary stats data")

The interface supports an optional 'filter' argument to specify a
particular stat to query.
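
For reference, a minimal standalone sketch of walking the binary stats
file descriptor (obtained via ioctl(vm_fd, KVM_GET_STATS_FD, NULL)),
assuming the header/descriptor layout from the 5.13-era <linux/kvm.h>;
error handling omitted:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <linux/kvm.h>

static void dump_kvm_stats(int stats_fd)
{
    struct kvm_stats_header hdr;

    pread(stats_fd, &hdr, sizeof(hdr), 0);

    /* each descriptor is followed by a fixed-size name field */
    size_t desc_sz = sizeof(struct kvm_stats_desc) + hdr.name_size;
    char *descs = malloc(desc_sz * hdr.num_desc);
    pread(stats_fd, descs, desc_sz * hdr.num_desc, hdr.desc_offset);

    for (uint32_t i = 0; i < hdr.num_desc; i++) {
        struct kvm_stats_desc *d =
            (struct kvm_stats_desc *)(descs + i * desc_sz);
        uint64_t val;

        /* d->size is the number of u64 values; only the first is read here */
        pread(stats_fd, &val, sizeof(val), hdr.data_offset + d->offset);
        printf("%s = %llu\n", d->name, (unsigned long long)val);
    }
    free(descs);
}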

Examples:
{ "execute": "query-kvmstats" }

{ "return": [
{ "name": "vm",
  "stats": [
{ "name": "max_mmu_page_hash_collisions",
  "unit": "none",
  "base": 10,
  "val": [ 0 ],
  "exponent": 0,
  "type": "peak" },
{ "name": "nx_lpage_splits",
  "unit": "none",
  "base": 10,
  "val": [ 120 ],
  "exponent": 0,
  "type": "instant" },
...
} ] },
{ "name": "vcpu_0",
  "stats": [
{ "name": "req_event",
  "unit": "none",
  "base": 10,
  "val": [ 500 ],
  "exponent": 0,
  "type": "cumulative" },
...

{ "execute": "query-kvmstats", "arguments" : { "filter" : "req_event" } }

{ "return": [
{ "name": "vm",
  "stats": [] },
{ "name": "vcpu_0",
  "stats": [
{ "name": "req_event",
  "unit": "none",
  "base": 10,
  "val": [ 500 ],
  "exponent": 0,
  "type": "cumulative" }
] },
{ "name": "vcpu_1",
  "stats": [
{ "name": "req_event",
  "unit": "none",
  "base": 10,
  "val": [ 61 ],
  "exponent": 0,
  "type": "cumulative" }
] } ] }

Signed-off-by: Mark Kanda 
---
 accel/kvm/kvm-all.c | 246 
 qapi/misc.json  |  73 +
 2 files changed, 319 insertions(+)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index db8d83b137..597d0c7a09 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -47,6 +47,7 @@
 #include "kvm-cpus.h"
 
 #include "hw/boards.h"
+#include "qapi/qapi-commands-misc.h"
 
 /* This check must be after config-host.h is included */
 #ifdef CONFIG_EVENTFD
@@ -3406,6 +3407,251 @@ int kvm_on_sigbus(int code, void *addr)
 #endif
 }
 
+typedef struct KvmStatsArgs {
+KvmStats *kvm_stat; /* QAPI auto-generated struct */
+char *filter;
+Error **errp;
+} KvmStatsArgs;
+
+static KvmStatDataList *add_kvmstat_entry(struct kvm_stats_desc *pdesc,
+  uint64_t *stats_data,
+  KvmStatDataList *data_list,
+  Error **errp)
+{
+KvmStatDataList *data_entry;
+uint64List *val_list = NULL;
+Error *local_err = NULL;
+int i;
+
+data_entry = g_malloc0(sizeof(*data_entry));
+data_entry->value = g_malloc0(sizeof(*data_entry->value));
+data_entry->value->name = g_strdup(pdesc->name);
+
+/* Convert flags to type, unit and base (QAPI auto-generated enums) */
+switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
+case KVM_STATS_TYPE_CUMULATIVE:
+data_entry->value->type = KVM_STAT_TYPE_CUMULATIVE;
+break;
+case KVM_STATS_TYPE_INSTANT:
+data_entry->value->type = KVM_STAT_TYPE_INSTANT;
+break;
+case KVM_STATS_TYPE_PEAK:
+data_entry->value->type = KVM_STAT_TYPE_PEAK;
+break;
+default:
+error_setg(&local_err, "KVM stats: invalid type %u",
+   (pdesc->flags & KVM_STATS_TYPE_MASK)
+   >> KVM_STATS_TYPE_SHIFT);
+goto exit;
+}
+
+switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
+case KVM_STATS_UNIT_NONE:
+data_entry->value->unit = KVM_STAT_UNIT_NONE;
+break;
+case KVM_STATS_UNIT_BYTES:
+data_entry->value->unit = KVM_STAT_UNIT_BYTES;
+break;
+case KVM_STATS_UNIT_CYCLES:
+data_entry->value->unit = KVM_STAT_UNIT_CYCLES;
+break;
+case KVM_STATS_UNIT_SECONDS:
+data_entry->value->unit = KVM_STAT_UNIT_SECONDS;
+break;
+default:
+error_setg(&local_err, "KVM stats: invalid unit %u",
+   (pdesc->flags & KVM_STATS_UNIT_MASK)
+   >> KVM_STATS_UNIT_SHIFT);
+goto exit;
+}
+
+switch (pdesc->

[PATCH 2/2] hmp: Support fd-based KVM stats query

2021-10-19 Thread Mark Kanda
Leverage the QMP support for fd-based KVM stats.

The interface supports an optional 'filter' argument to specify a
particular stat to query. Base and exponent are displayed in human
readable format.
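
The formatting rule itself is simple; a minimal standalone sketch (not
QEMU code) of the human-readable output used below:

#include <inttypes.h>
#include <stdio.h>

/* Print a stat as "val*base^exp unit", eliding "*base^exp" when exp == 0. */
static void print_stat(const char *name, uint64_t val, int base, int exp,
                       const char *unit)
{
    if (exp) {
        printf("  %s: %" PRIu64 "*%d^%d %s\n", name, val, base, exp, unit);
    } else {
        printf("  %s: %" PRIu64 " %s\n", name, val, unit);
    }
}

int main(void)
{
    print_stat("halt_poll_fail_ns", 20, 10, -9, "seconds"); /* 20*10^-9 */
    print_stat("req_event", 538, 10, 0, "");                /* 538 */
    return 0;
}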

Examples:

(qemu) info kvmstats

vm:
  max_mmu_page_hash_collisions (peak): 0
  nx_lpage_splits (instant): 114
  lpages (instant): 193
  mmu_unsync (instant): 0
  mmu_cache_miss (cumulative): 293
  mmu_recycled (cumulative): 0
  mmu_flooded (cumulative): 0
  mmu_pde_zapped (cumulative): 0
  mmu_pte_write (cumulative): 0
  mmu_shadow_zapped (cumulative): 178
  remote_tlb_flush (cumulative): 63

vcpu_0:
  req_event (cumulative): 538
  nmi_injections (cumulative): 0
...

(qemu) info kvmstats halt_poll_fail_ns

vm:

vcpu_0:
  halt_poll_fail_ns (cumulative): 20*10^-9 seconds

vcpu_1:
  halt_poll_fail_ns (cumulative): 30*10^-9 seconds

Signed-off-by: Mark Kanda 
---
 hmp-commands-info.hx  | 13 +++
 include/monitor/hmp.h |  1 +
 monitor/hmp-cmds.c| 52 +++
 3 files changed, 66 insertions(+)

diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index 4c966e8a6b..ef5bca01d9 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -335,6 +335,19 @@ SRST
 Show KVM information.
 ERST
 
+{
+.name   = "kvmstats",
+.args_type  = "filter:s?",
+.params = "filter",
+.help   = "show KVM statistics; optional filter for stat name",
+.cmd= hmp_info_kvmstats,
+},
+
+SRST
+  ``info kvmstats``
+Show KVM statistics.
+ERST
+
 {
 .name   = "numa",
 .args_type  = "",
diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
index 6bc27639e0..20be8f8586 100644
--- a/include/monitor/hmp.h
+++ b/include/monitor/hmp.h
@@ -21,6 +21,7 @@ void hmp_handle_error(Monitor *mon, Error *err);
 void hmp_info_name(Monitor *mon, const QDict *qdict);
 void hmp_info_version(Monitor *mon, const QDict *qdict);
 void hmp_info_kvm(Monitor *mon, const QDict *qdict);
+void hmp_info_kvmstats(Monitor *mon, const QDict *qdict);
 void hmp_info_status(Monitor *mon, const QDict *qdict);
 void hmp_info_uuid(Monitor *mon, const QDict *qdict);
 void hmp_info_chardev(Monitor *mon, const QDict *qdict);
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index bcaa41350e..24a545a66b 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -134,6 +134,58 @@ void hmp_info_kvm(Monitor *mon, const QDict *qdict)
 qapi_free_KvmInfo(info);
 }
 
+void hmp_info_kvmstats(Monitor *mon, const QDict *qdict)
+{
+KvmStatsList *stats_list, *stats_list_entry;
+KvmStats *stats_entry;
+KvmStatDataList *data_entry;
+KvmStatData *kvm_stat;
+uint64List *val;
+const char *filter;
+Error *err = NULL;
+
+filter = qdict_get_try_str(qdict, "filter");
+if (filter) {
+stats_list = qmp_query_kvmstats(TRUE, filter, &err);
+} else {
+stats_list = qmp_query_kvmstats(FALSE, NULL, &err);
+}
+
+if (err) {
+monitor_printf(mon, "%s\n", error_get_pretty(err));
+error_free(err);
+return;
+}
+
+for (stats_list_entry = stats_list; stats_list_entry;
+ stats_list_entry = stats_list_entry->next) {
+stats_entry = stats_list_entry->value;
+monitor_printf(mon, "\n%s:\n", stats_entry->name);
+
+for (data_entry = stats_entry->stats; data_entry;
+ data_entry = data_entry->next) {
+kvm_stat = data_entry->value;
+monitor_printf(mon, "  %s (%s):", kvm_stat->name,
+   KvmStatType_str(kvm_stat->type));
+
+for (val = kvm_stat->val; val; val = val->next) {
+if (kvm_stat->exponent) {
+/* Print the base and exponent as "value*base^exponent" */
+monitor_printf(mon, " %lu*%d^%d", val->value,
+   kvm_stat->base, kvm_stat->exponent);
+} else {
+monitor_printf(mon, " %lu", val->value);
+}
+}
+
+/* Don't print "none" unit type */
+monitor_printf(mon, " %s\n", kvm_stat->unit == KVM_STAT_UNIT_NONE ?
+   "" : KvmStatUnit_str(kvm_stat->unit));
+}
+}
+qapi_free_KvmStatsList(stats_list);
+}
+
 void hmp_info_status(Monitor *mon, const QDict *qdict)
 {
 StatusInfo *info;
-- 
2.26.2




[PATCH 0/2] Support fd-based KVM stats

2021-10-19 Thread Mark Kanda
This patchset adds QEMU support for querying fd-based KVM stats. The kernel
support is provided by:

cb082bfab59a ("KVM: stats: Add fd-based API to read binary stats data")

Patch 1 adds QMP support; patch 2 adds HMP support.

Mark Kanda (2):
  qmp: Support fd-based KVM stats query
  hmp: Support fd-based KVM stats query

 accel/kvm/kvm-all.c   | 246 ++
 hmp-commands-info.hx  |  13 +++
 include/monitor/hmp.h |   1 +
 monitor/hmp-cmds.c|  52 +
 qapi/misc.json|  73 +
 5 files changed, 385 insertions(+)

-- 
2.26.2




Re: [PATCH] migration/dirtyrate: simplify includes in dirtyrate.c

2020-10-30 Thread Mark Kanda

On 10/29/2020 10:58 PM, Chuan Zheng wrote:

Remove the redundant blank line left by commit 662770af7c6e8c, and take
this opportunity to remove redundant includes in dirtyrate.c.

Signed-off-by: Chuan Zheng 


Reviewed-by: Mark Kanda 


---
  migration/dirtyrate.c | 5 -
  1 file changed, 5 deletions(-)

diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
index 8f728d2..ccb9814 100644
--- a/migration/dirtyrate.c
+++ b/migration/dirtyrate.c
@@ -11,17 +11,12 @@
   */
  
  #include "qemu/osdep.h"

-
  #include <zlib.h>
  #include "qapi/error.h"
  #include "cpu.h"
-#include "qemu/config-file.h"
-#include "exec/memory.h"
  #include "exec/ramblock.h"
-#include "exec/target_page.h"
  #include "qemu/rcu_queue.h"
  #include "qapi/qapi-commands-migration.h"
-#include "migration.h"
  #include "ram.h"
  #include "trace.h"
  #include "dirtyrate.h"





Re: [Qemu-devel] [PATCH-for-4.2 v2] Only enable the halt poll control MSR if it is supported by the host

2019-08-28 Thread Mark Kanda

Gentle ping - I would like to confirm this patch is acceptable.

Thanks/regards,
-Mark

On 7/17/2019 9:38 AM, Mark Kanda wrote:

The halt poll control MSR should only be enabled on hosts which
support it.

Fixes: ("kvm: i386: halt poll control MSR support")

Signed-off-by: Mark Kanda 

---
v2: Remove unnecessary hunks which break migration with older hosts (Paolo)
---
  target/i386/cpu.c | 1 -
  1 file changed, 1 deletion(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index a8bafdb8b9..543bc25f64 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -2838,7 +2838,6 @@ static PropValue kvm_default_props[] = {
  { "kvm-asyncpf", "on" },
  { "kvm-steal-time", "on" },
  { "kvm-pv-eoi", "on" },
-{ "kvm-poll-control", "on" },
  { "kvmclock-stable-bit", "on" },
  { "x2apic", "on" },
  { "acpi", "off" },





[Qemu-devel] [PATCH-for-4.2 v2] Only enable the halt poll control MSR if it is supported by the host

2019-07-17 Thread Mark Kanda
The halt poll control MSR should only be enabled on hosts which
support it.

Fixes: ("kvm: i386: halt poll control MSR support")

Signed-off-by: Mark Kanda 

---
v2: Remove unnecessary hunks which break migration with older hosts (Paolo)
---
 target/i386/cpu.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index a8bafdb8b9..543bc25f64 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -2838,7 +2838,6 @@ static PropValue kvm_default_props[] = {
 { "kvm-asyncpf", "on" },
 { "kvm-steal-time", "on" },
 { "kvm-pv-eoi", "on" },
-{ "kvm-poll-control", "on" },
 { "kvmclock-stable-bit", "on" },
 { "x2apic", "on" },
 { "acpi", "off" },
-- 
2.21.0




[Qemu-devel] [PATCH-for-4.2 1/1] Only enable the halt poll control MSR if it is supported by the host

2019-07-16 Thread Mark Kanda
The halt poll control MSR should only be enabled on hosts which
support it.

Fixes: ("kvm: i386: halt poll control MSR support")

Signed-off-by: Mark Kanda 
---
 target/i386/cpu.c | 8 +++-
 target/i386/kvm.c | 2 --
 target/i386/machine.c | 1 -
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index a8bafdb8b9..dacbf7a9fe 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -2838,7 +2838,6 @@ static PropValue kvm_default_props[] = {
 { "kvm-asyncpf", "on" },
 { "kvm-steal-time", "on" },
 { "kvm-pv-eoi", "on" },
-{ "kvm-poll-control", "on" },
 { "kvmclock-stable-bit", "on" },
 { "x2apic", "on" },
 { "acpi", "off" },
@@ -5109,6 +5108,13 @@ static void x86_cpu_expand_features(X86CPU *cpu, Error 
**errp)
 env->cpuid_xlevel2 = env->cpuid_min_xlevel2;
 }
 
+/* Enable the halt poll control MSR if it is supported by the host */
+if (x86_cpu_get_supported_feature_word(FEAT_KVM, cpu->migratable) &
+(1 << KVM_FEATURE_POLL_CONTROL)) {
+env->features[FEAT_KVM] |= 1 << KVM_FEATURE_POLL_CONTROL;
+env->poll_control_msr = 1;
+}
+
 out:
 if (local_err != NULL) {
 error_propagate(errp, local_err);
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index cb22684139..81dd5d2c1b 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1796,8 +1796,6 @@ void kvm_arch_reset_vcpu(X86CPU *cpu)
 
 hyperv_x86_synic_reset(cpu);
 }
-/* enabled by default */
-env->poll_control_msr = 1;
 }
 
 void kvm_arch_do_init_vcpu(X86CPU *cpu)
diff --git a/target/i386/machine.c b/target/i386/machine.c
index 20077a8a5a..9d6095b264 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -394,7 +394,6 @@ static bool steal_time_msr_needed(void *opaque)
 return cpu->env.steal_time_msr != 0;
 }
 
-/* Poll control MSR enabled by default */
 static bool poll_control_msr_needed(void *opaque)
 {
 X86CPU *cpu = opaque;
-- 
2.21.0




[Qemu-devel] [PATCH-for-4.2 0/1] Only enable the halt poll control MSR if it is supported by the host

2019-07-16 Thread Mark Kanda
This patch addresses an issue with the 'queued, but not yet applied' halt
polling MSR patch. With that patch, halt polling is enabled by default,
which causes issues on hosts that don't support halt polling. The fix is
to only enable halt polling if it is supported by the host.
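
(Guests that want the feature can still opt in explicitly, e.g. with
-cpu host,kvm-poll-control=on.)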

Mark Kanda (1):
  Only enable the halt poll control MSR if it is supported by the host

 target/i386/cpu.c | 8 +++-
 target/i386/kvm.c | 2 --
 target/i386/machine.c | 1 -
 3 files changed, 7 insertions(+), 4 deletions(-)

-- 
2.21.0




Re: [Qemu-devel] [patch QEMU] kvm: i386: halt poll control MSR support

2019-07-16 Thread Mark Kanda

On 7/16/2019 4:15 PM, Paolo Bonzini wrote:

On 16/07/19 23:09, Paolo Bonzini wrote:

As such, I think we should only enable halt polling if it is supported
on the host - see the attached patch.

...thoughts?

No, it should not be enabled by default at all, at least not until we
can require kernel 5.2.  My mistake, sorry.  Can you post a patch?


Doh, nevermind.  This is not included in 4.1 so there's time to fix it.
  Pfew. :)



:)

I'll post a patch regardless.

Thanks/regards,
-Mark



Re: [Qemu-devel] [patch QEMU] kvm: i386: halt poll control MSR support

2019-07-16 Thread Mark Kanda
itectural_pmu_version > 0) {
  if (has_architectural_pmu_version > 1) {
  /* Stop the counter.  */
@@ -2443,6 +2450,9 @@ static int kvm_get_msrs(X86CPU *cpu)
  if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
  kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0);
  }
+if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
+kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1);
+}
  if (has_architectural_pmu_version > 0) {
  if (has_architectural_pmu_version > 1) {
  kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
@@ -2677,6 +2687,10 @@ static int kvm_get_msrs(X86CPU *cpu)
  case MSR_KVM_STEAL_TIME:
  env->steal_time_msr = msrs[i].data;
  break;
+case MSR_KVM_POLL_CONTROL: {
+env->poll_control_msr = msrs[i].data;
+break;
+}
  case MSR_CORE_PERF_FIXED_CTR_CTRL:
  env->msr_fixed_ctr_ctrl = msrs[i].data;
  break;
diff --git a/target/i386/machine.c b/target/i386/machine.c
index 225b5d4..1c23e5e 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -323,6 +323,14 @@ static bool steal_time_msr_needed(void *opaque)
  return cpu->env.steal_time_msr != 0;
  }
  
+/* Poll control MSR enabled by default */
+static bool poll_control_msr_needed(void *opaque)
+{
+X86CPU *cpu = opaque;
+
+return cpu->env.poll_control_msr != 1;
+}
+
  static const VMStateDescription vmstate_steal_time_msr = {
  .name = "cpu/steal_time_msr",
  .version_id = 1,
@@ -356,6 +364,17 @@ static const VMStateDescription vmstate_pv_eoi_msr = {
  }
  };
  
+static const VMStateDescription vmstate_poll_control_msr = {
+.name = "cpu/poll_control_msr",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = poll_control_msr_needed,
+.fields = (VMStateField[]) {
+VMSTATE_UINT64(env.poll_control_msr, X86CPU),
+VMSTATE_END_OF_LIST()
+}
+};
+
  static bool fpop_ip_dp_needed(void *opaque)
  {
  X86CPU *cpu = opaque;
@@ -1062,6 +1081,7 @@ VMStateDescription vmstate_x86_cpu = {
   &vmstate_async_pf_msr,
   &vmstate_pv_eoi_msr,
   &vmstate_steal_time_msr,
+&vmstate_poll_control_msr,
  _fpop_ip_dp,
  _msr_tsc_adjust,
  _msr_tscdeadline,



Queued, thanks.  Sorry for missing it until now.

Paolo


From ac83cb52be3fbe226645e780500adb767f9bcfd2 Mon Sep 17 00:00:00 2001
From: Mark Kanda 
Date: Tue, 16 Jul 2019 14:46:11 -0500
Subject: [PATCH QEMU] Only enable the halt poll control MSR if it is supported
 by the host

The halt poll control MSR should only be enabled on hosts which
support it.

Fixes: ("kvm: i386: halt poll control MSR support")

Signed-off-by: Mark Kanda 
---
 target/i386/cpu.c | 8 +++-
 target/i386/kvm.c | 2 --
 target/i386/machine.c | 1 -
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index a8bafdb8b9..dacbf7a9fe 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -2838,7 +2838,6 @@ static PropValue kvm_default_props[] = {
 { "kvm-asyncpf", "on" },
 { "kvm-steal-time", "on" },
 { "kvm-pv-eoi", "on" },
-{ "kvm-poll-control", "on" },
 { "kvmclock-stable-bit", "on" },
 { "x2apic", "on" },
 { "acpi", "off" },
@@ -5109,6 +5108,13 @@ static void x86_cpu_expand_features(X86CPU *cpu, Error 
**errp)
 env->cpuid_xlevel2 = env->cpuid_min_xlevel2;
 }
 
+/* Enable the halt poll control MSR if it is supported by the host */
+if (x86_cpu_get_supported_feature_word(FEAT_KVM, cpu->migratable) &
+(1 << KVM_FEATURE_POLL_CONTROL)) {
+env->features[FEAT_KVM] |= 1 << KVM_FEATURE_POLL_CONTROL;
+env->poll_control_msr = 1;
+}
+
 out:
 if (local_err != NULL) {
 error_propagate(errp, local_err);
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index cb22684139..81dd5d2c1b 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1796,8 +1796,6 @@ void kvm_arch_reset_vcpu(X86CPU *cpu)
 
 hyperv_x86_synic_reset(cpu);
 }
-/* enabled by default */
-env->poll_control_msr = 1;
 }
 
 void kvm_arch_do_init_vcpu(X86CPU *cpu)
diff --git a/target/i386/machine.c b/target/i386/machine.c
index 20077a8a5a..9d6095b264 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -394,7 +394,6 @@ static bool steal_time_msr_needed(void *opaque)
 return cpu->env.steal_time_msr != 0;
 }
 
-/* Poll control MSR enabled by default */
 static bool poll_control_msr_needed(void *opaque)
 {
 X86CPU *cpu = opaque;
-- 
2.21.0



[Qemu-devel] [PATCH] e1000: Never increment the RX undersize count register

2019-04-04 Thread Mark Kanda
From: Chris Kenna 

In situations where e1000 receives an undersized Ethernet frame,
QEMU increments the emulated "Receive Undersize Count (RUC)"
register when padding the frame.

This is incorrect because this is an expected scenario (e.g. with
VLAN tag stripping) and not an error. As such, QEMU should not
increment the emulated RUC.

Fixes: 3b2743017749 ("e1000: Implementing various counters")

Reviewed-by: Mark Kanda 
Reviewed-by: Bhavesh Davda 
Signed-off-by: Chris Kenna 
---
 hw/net/e1000.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hw/net/e1000.c b/hw/net/e1000.c
index 9b39bcc..121452d 100644
--- a/hw/net/e1000.c
+++ b/hw/net/e1000.c
@@ -901,7 +901,6 @@ e1000_receive_iov(NetClientState *nc, const struct iovec 
*iov, int iovcnt)
 if (size < sizeof(min_buf)) {
 iov_to_buf(iov, iovcnt, 0, min_buf, size);
 memset(_buf[size], 0, sizeof(min_buf) - size);
-e1000x_inc_reg_if_not_full(s->mac_reg, RUC);
 min_iov.iov_base = filter_buf = min_buf;
 min_iov.iov_len = size = sizeof(min_buf);
 iovcnt = 1;
-- 
1.8.3.1




Re: [Qemu-devel] Nested SVM only enabled (by default) in pc-i440fx-2.1?

2019-02-27 Thread Mark Kanda

On 2/27/2019 4:05 PM, Mark Kanda wrote:

Hi all,

I noticed nested SVM is enabled only in pc-i440fx-2.1 (default is 
disabled); this was added when 2.1 was the latest:


 75d373ef97 ("target-i386: Disable SVM by default in KVM mode")

However, this change was not carried forward to newer machine types. Is 
this an oversight? Is there a reason we (still) need SVM disabled by 
default?


To clarify, it's my understanding the above patch was added to disable 
nested SVM in pc-i440fx-2.2 (and future) machine types due to stability 
issues (while keeping it enabled in pc-i440fx-2.1 and previous machine 
types).


I'm wondering if nested SVM is now mature enough to remove this 
restriction and enable it by default (at least on recent machine types).


Thanks/regards,
-Mark



[Qemu-devel] Nested SVM only enabled (by default) in pc-i440fx-2.1?

2019-02-27 Thread Mark Kanda

Hi all,

I noticed nested SVM is enabled only in pc-i440fx-2.1 (default is 
disabled); this was added when 2.1 was the latest:


75d373ef97 ("target-i386: Disable SVM by default in KVM mode")

However, this change was not carried forward to newer machine types. Is 
this an oversight? Is there a reason we (still) need SVM disabled by 
default?


Default setting - target/i386/cpu.c:

static PropValue kvm_default_props[] = {
{ "kvmclock", "on" },
...
{ "monitor", "off" },
{ "svm", "off" },
{ NULL, NULL },
};

2.1 override - hw/i386/pc_piix.c:

static void pc_compat_2_1_fn(MachineState *machine)
{
pc_compat_2_2_fn(machine);
x86_cpu_change_kvm_default("svm", NULL);
}
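
For reference, individual guests can still override the default on the
command line, e.g. with -cpu host,+svm (assuming host support).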

Thanks and best regards,
-Mark



Re: [Qemu-devel] [PATCH for-3.1] nvme: fix out-of-bounds access to the CMB

2018-11-19 Thread Mark Kanda
For CVE-2018-16847, I just noticed Kevin pulled in Li's previous fix (as 
opposed to this one). Was this done in error?


Thanks,

-Mark

On 11/16/2018 3:31 AM, Paolo Bonzini wrote:

Because the CMB BAR has a min_access_size of 2, if you read the last
byte it will try to memcpy *2* bytes from n->cmbuf, causing an off-by-one
error.  This is CVE-2018-16847.

Another way to fix this might be to register the CMB as a RAM memory
region, which would also be more efficient.  However, that might be a
change for big-endian machines; I didn't think this through and I don't
know how real hardware works.  Add a basic testcase for the CMB in case
somebody does this change later on.
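
The failure mode is easy to see in isolation; a standalone sketch (not
the QEMU code) of a read handler whose minimum access size widens the
request:

#include <stdint.h>
#include <string.h>

/* With impl.min_access_size = 2, a one-byte guest read of the last CMB
 * byte is widened to width == 2, so this copies cmbuf[cmb_size - 1]
 * *and* cmbuf[cmb_size] - one byte past the end of the buffer. */
static uint64_t cmb_read(const uint8_t *cmbuf, size_t addr, unsigned width)
{
    uint64_t val = 0;
    memcpy(&val, &cmbuf[addr], width);
    return val;
}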

Cc: Keith Busch 
Cc: qemu-bl...@nongnu.org
Cc: Li Qiang 
Signed-off-by: Paolo Bonzini 
---
  hw/block/nvme.c|  2 +-
  tests/Makefile.include |  2 +-
  tests/nvme-test.c  | 58 +++---
  3 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 09d7c90259..5d92794ef7 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1192,7 +1192,7 @@ static const MemoryRegionOps nvme_cmb_ops = {
  .write = nvme_cmb_write,
  .endianness = DEVICE_LITTLE_ENDIAN,
  .impl = {
-.min_access_size = 2,
+.min_access_size = 1,
  .max_access_size = 8,
  },
  };
diff --git a/tests/Makefile.include b/tests/Makefile.include
index 613242bc6e..fb0b449c02 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -730,7 +730,7 @@ tests/test-hmp$(EXESUF): tests/test-hmp.o
  tests/machine-none-test$(EXESUF): tests/machine-none-test.o
  tests/drive_del-test$(EXESUF): tests/drive_del-test.o $(libqos-virtio-obj-y)
  tests/qdev-monitor-test$(EXESUF): tests/qdev-monitor-test.o $(libqos-pc-obj-y)
-tests/nvme-test$(EXESUF): tests/nvme-test.o
+tests/nvme-test$(EXESUF): tests/nvme-test.o $(libqos-pc-obj-y)
  tests/pvpanic-test$(EXESUF): tests/pvpanic-test.o
  tests/i82801b11-test$(EXESUF): tests/i82801b11-test.o
  tests/ac97-test$(EXESUF): tests/ac97-test.o
diff --git a/tests/nvme-test.c b/tests/nvme-test.c
index 7674a446e4..2abb3b6d19 100644
--- a/tests/nvme-test.c
+++ b/tests/nvme-test.c
@@ -8,11 +8,64 @@
   */
  
  #include "qemu/osdep.h"

+#include "qemu/units.h"
  #include "libqtest.h"
+#include "libqos/libqos-pc.h"
+
+static QOSState *qnvme_start(const char *extra_opts)
+{
+QOSState *qs;
+const char *arch = qtest_get_arch();
+const char *cmd = "-drive id=drv0,if=none,file=null-co://,format=raw "
+  "-device nvme,addr=0x4.0,serial=foo,drive=drv0 %s";
+
+if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) {
+qs = qtest_pc_boot(cmd, extra_opts ? : "");
+global_qtest = qs->qts;
+return qs;
+}
+
+g_printerr("nvme tests are only available on x86\n");
+exit(EXIT_FAILURE);
+}
+
+static void qnvme_stop(QOSState *qs)
+{
+qtest_shutdown(qs);
+}
  
-/* Tests only initialization so far. TODO: Replace with functional tests */

  static void nop(void)
  {
+QOSState *qs;
+
+qs = qnvme_start(NULL);
+qnvme_stop(qs);
+}
+
+static void nvmetest_cmb_test(void)
+{
+const int cmb_bar_size = 2 * MiB;
+QOSState *qs;
+QPCIDevice *pdev;
+QPCIBar bar;
+
+qs = qnvme_start("-global nvme.cmb_size_mb=2");
+pdev = qpci_device_find(qs->pcibus, QPCI_DEVFN(4,0));
+g_assert(pdev != NULL);
+
+qpci_device_enable(pdev);
+bar = qpci_iomap(pdev, 2, NULL);
+
+qpci_io_writel(pdev, bar, 0, 0xccbbaa99);
+g_assert_cmpint(qpci_io_readb(pdev, bar, 0), ==, 0x99);
+g_assert_cmpint(qpci_io_readw(pdev, bar, 0), ==, 0xaa99);
+
+/* Test partially out-of-bounds accesses.  */
+qpci_io_writel(pdev, bar, cmb_bar_size - 1, 0x44332211);
+g_assert_cmpint(qpci_io_readb(pdev, bar, cmb_bar_size - 1), ==, 0x11);
+g_assert_cmpint(qpci_io_readw(pdev, bar, cmb_bar_size - 1), !=, 0x2211);
+g_assert_cmpint(qpci_io_readl(pdev, bar, cmb_bar_size - 1), !=, 0x44332211);
+qnvme_stop(qs);
  }
  
  int main(int argc, char **argv)

@@ -21,9 +74,8 @@ int main(int argc, char **argv)
  
  g_test_init(, , NULL);

  qtest_add_func("/nvme/nop", nop);
+qtest_add_func("/nvme/cmb_test", nvmetest_cmb_test);
  
-qtest_start("-drive id=drv0,if=none,file=null-co://,format=raw "
-"-device nvme,drive=drv0,serial=foo");
  ret = g_test_run();
  
  qtest_end();






Re: [Qemu-devel] [PATCH] lsi53c895a: check message length value

2018-10-26 Thread Mark Kanda

On 10/26/2018 1:37 PM, P J P wrote:

+-- On Fri, 26 Oct 2018, Mark Kanda wrote --+
| Deja vu requested that we include the following text in the commit message:
|
|   Discovered by Deja vu Security. Reported by Oracle.
|
| Would that be acceptable?

Generally an email-id is used/preferred in the commit log message. We could
use above for acknowledgement and avoid Reported-by in the commit log message
if that suits Deja vu team.

Please let me know your/their preference.



Yes, please use that acknowledgement text in lieu of a 'Reported-by' line.

Thanks,

-Mark



Re: [Qemu-devel] [PATCH] lsi53c895a: check message length value

2018-10-26 Thread Mark Kanda




On 10/26/2018 4:25 AM, P J P wrote:

+-- On Thu, 25 Oct 2018, Ameya More wrote --+
| While Mark and I reported this issue to you, it was actually discovered by
| Dejvau Security and they should receive credit for reporting this issue.
| http://www.dejavusecurity.com

I see; Would it be possible to share email-id of the original reporter to
include in the commit log message?


Deja vu requested that we include the following text in the commit message:

Discovered by Deja vu Security. Reported by Oracle.

Would that be acceptable?

Thanks,

-Mark



Re: [Qemu-devel] [PATCH 1/1] kvm: add call to qemu_add_opts() for -overcommit option

2018-08-15 Thread Mark Kanda




On 8/15/2018 12:57 PM, prasad.singamse...@oracle.com wrote:

From: Prasad Singamsetty 

The qemu command fails to process the -overcommit option. Add the missing
call to qemu_add_opts() in vl.c.
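
For reference, with the fix the option parses as documented in
qemu-options.hx, e.g.:

  qemu-system-x86_64 -overcommit mem-lock=off,cpu-pm=on ...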

Signed-off-by: Prasad Singamsetty 


Reviewed-by: Mark Kanda 


---
  vl.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/vl.c b/vl.c
index 16b913f9d5..12d27fa028 100644
--- a/vl.c
+++ b/vl.c
@@ -2987,6 +2987,7 @@ int main(int argc, char **argv, char **envp)
  qemu_add_opts(_object_opts);
  qemu_add_opts(_tpmdev_opts);
  qemu_add_opts(_realtime_opts);
+qemu_add_opts(_overcommit_opts);
  qemu_add_opts(_msg_opts);
  qemu_add_opts(_name_opts);
  qemu_add_opts(_numa_opts);





Re: [Qemu-devel] [PULL 1/1] vga: fix region calculation

2018-03-13 Thread Mark Kanda

On 3/12/2018 5:59 AM, Gerd Hoffmann wrote:

Typically the scanline length and the line offset are identical.  But
in case they are not our calculation for region_end is incorrect.  Using
line_offset is fine for all scanlines, except the last one where we have
to use the actual scanline length.

Fixes: CVE-2018-7550


This doesn't appear to be the correct CVE number.

https://nvd.nist.gov/vuln/detail/CVE-2018-7550

..please confirm..

Thanks,

-Mark


Reported-by: Ross Lagerwall 
Signed-off-by: Gerd Hoffmann 
Reviewed-by: Prasad J Pandit 
Tested-by: Ross Lagerwall 
Message-id: 20180309143704.13420-1-kra...@redhat.com
---
  hw/display/vga.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/hw/display/vga.c b/hw/display/vga.c
index 28f298b342..72181330b8 100644
--- a/hw/display/vga.c
+++ b/hw/display/vga.c
@@ -1483,6 +1483,8 @@ static void vga_draw_graphic(VGACommonState *s, int 
full_update)
  
  region_start = (s->start_addr * 4);

  region_end = region_start + (ram_addr_t)s->line_offset * height;
+region_end += width * s->get_bpp(s) / 8; /* scanline length */
+region_end -= s->line_offset;
  if (region_end > s->vbe_size) {
  /* wraps around (can happen with cirrus vbe modes) */
  region_start = 0;





Re: [Qemu-devel] [Qemu-block] [PATCH] virtio-blk: check for NULL BlockDriverState

2018-01-29 Thread Mark Kanda



On 1/29/2018 9:41 AM, Kevin Wolf wrote:

Am 24.01.2018 um 12:31 hat Stefan Hajnoczi geschrieben:

On Mon, Jan 22, 2018 at 09:01:49AM -0600, Mark Kanda wrote:

Add a BlockDriverState NULL check to virtio_blk_handle_request()
to prevent a segfault if the drive is forcibly removed using HMP
'drive_del' (without performing a hotplug 'device_del' first).

Signed-off-by: Mark Kanda <mark.ka...@oracle.com>
Reviewed-by: Karl Heubaum <karl.heub...@oracle.com>
Reviewed-by: Ameya More <ameya.m...@oracle.com>
---
  hw/block/virtio-blk.c | 7 +++
  1 file changed, 7 insertions(+)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index b1532e4..76ddbbf 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -507,6 +507,13 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, 
MultiReqBuffer *mrb)
  return -1;
  }
  
+/* If the drive was forcibly removed (e.g. HMP 'drive_del'), the block
+ * driver state may be NULL and there is nothing left to do. */
+if (!blk_bs(req->dev->blk)) {


Adding Markus Armbruster to check my understanding of drive_del:

1. If id is a node name (e.g. created via blockdev-add) then attempting
to remove the root node produces the "Node %s is in use" error.  In
that case this patch isn't needed.

2. If id is a BlockBackend (e.g. created via -drive) then removing the
root node is allowed.  The BlockBackend stays in place but blk->root
becomes NULL, hence this patch is needed.

Markus: What are the valid use cases for #2?  If blk->bs becomes NULL I
would think a lot more code beyond virtio-blk can segfault.


blk->root = NULL is completely normal, it is what happens with removable
media when the drive is empty.

The problem, which was first reported during the 2.10 RC phase and was
worked around in IDE code then, is that Paolo's commit 99723548561 added
unconditional bdrv_inc/dec_in_flight() calls. I am pretty sure that any
segfaults that Mark is seeing have the same cause.


That's correct. The segfault I encountered was the bdrv_inc_in_flight() 
call in blk_aio_prwv().


Thanks,

-Mark


We do need an in-flight counter even for those requests so that
blk_drain() works correctly, so just making the calls condition wouldn't
be right. However, this needs to become a separate counter in
BlockBackend, and the drain functions must be changed to make use of it.

I did post rough patches back then, but they weren't quite ready, and
since then they have fallen through the cracks.

Kevin





[Qemu-devel] [PATCH] virtio-blk: check for NULL BlockDriverState

2018-01-22 Thread Mark Kanda
Add a BlockDriverState NULL check to virtio_blk_handle_request()
to prevent a segfault if the drive is forcibly removed using HMP
'drive_del' (without performing a hotplug 'device_del' first).
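
A hypothetical reproducer (the drive id is invented for illustration):
start a guest with -drive id=drive0,... -device
virtio-blk-pci,drive=drive0, run 'drive_del drive0' at the HMP prompt,
then trigger guest I/O; without this check, the request path dereferences
the NULL BlockDriverState.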

Signed-off-by: Mark Kanda <mark.ka...@oracle.com>
Reviewed-by: Karl Heubaum <karl.heub...@oracle.com>
Reviewed-by: Ameya More <ameya.m...@oracle.com>
---
 hw/block/virtio-blk.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index b1532e4..76ddbbf 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -507,6 +507,13 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, 
MultiReqBuffer *mrb)
 return -1;
 }
 
+/* If the drive was forcibly removed (e.g. HMP 'drive_del'), the block
+ * driver state may be NULL and there is nothing left to do. */
+if (!blk_bs(req->dev->blk)) {
+virtio_error(vdev, "virtio-blk BlockDriverState is NULL");
+return -1;
+}
+
 /* We always touch the last byte, so just see how big in_iov is.  */
 req->in_len = iov_size(in_iov, in_num);
 req->in = (void *)in_iov[in_num - 1].iov_base
-- 
1.8.3.1




Re: [Qemu-devel] [PATCH v2 0/2] virtio-blk: miscellaneous changes

2017-12-11 Thread Mark Kanda



On 12/11/2017 4:30 AM, Stefan Hajnoczi wrote:

Hi Mark,
Please resend as a top level email thread so the continuous integration
and patch management tools will detect your patch series.


Apologies. I've just resent the series.

Thanks,

-Mark



[Qemu-devel] [PATCH v2 0/2] virtio-blk: miscellaneous changes

2017-12-11 Thread Mark Kanda
v2: add check for maximum queue size [Stefan]

This series is for two minor virtio-blk changes. The first patch
makes the virtio-blk queue size user configurable. The second patch
rejects logical block size > physical block configurations (similar
to a recent change in virtio-scsi).

Mark Kanda (2):
  virtio-blk: make queue size configurable
  virtio-blk: reject configs with logical block size > physical block
size

 hw/block/virtio-blk.c  | 17 -
 include/hw/virtio/virtio-blk.h |  1 +
 2 files changed, 17 insertions(+), 1 deletion(-)

-- 
1.8.3.1




[Qemu-devel] [PATCH v2 1/2] virtio-blk: make queue size configurable

2017-12-11 Thread Mark Kanda
Depending on the configuration, it can be beneficial to adjust the virtio-blk
queue size to something other than the current default of 128. Add a new
property to make the queue size configurable.
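
For example, to double the default (the drive id is invented for
illustration):

  -device virtio-blk-pci,drive=drive0,queue-size=256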

Signed-off-by: Mark Kanda <mark.ka...@oracle.com>
Reviewed-by: Karl Heubaum <karl.heub...@oracle.com>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
Reviewed-by: Ameya More <ameya.m...@oracle.com>
---
 hw/block/virtio-blk.c  | 10 +-
 include/hw/virtio/virtio-blk.h |  1 +
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 05d1440..fb59f57 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -928,6 +928,13 @@ static void virtio_blk_device_realize(DeviceState *dev, 
Error **errp)
 error_setg(errp, "num-queues property must be larger than 0");
 return;
 }
+if (!is_power_of_2(conf->queue_size) ||
+conf->queue_size > VIRTQUEUE_MAX_SIZE) {
+error_setg(errp, "invalid queue-size property (%" PRIu16 "), "
+   "must be a power of 2 (max %d)",
+   conf->queue_size, VIRTQUEUE_MAX_SIZE);
+return;
+}
 
 blkconf_serial(&conf->conf, &conf->serial);
 blkconf_apply_backend_options(&conf->conf,
@@ -953,7 +960,7 @@ static void virtio_blk_device_realize(DeviceState *dev, 
Error **errp)
 s->sector_mask = (s->conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;
 
 for (i = 0; i < conf->num_queues; i++) {
-virtio_add_queue(vdev, 128, virtio_blk_handle_output);
+virtio_add_queue(vdev, conf->queue_size, virtio_blk_handle_output);
 }
 virtio_blk_data_plane_create(vdev, conf, &s->dataplane, &err);
 if (err != NULL) {
@@ -1012,6 +1019,7 @@ static Property virtio_blk_properties[] = {
 DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
 true),
 DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1),
+DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128),
 DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD,
  IOThread *),
 DEFINE_PROP_END_OF_LIST(),
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index d3c8a6f..5117431 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -39,6 +39,7 @@ struct VirtIOBlkConf
 uint32_t config_wce;
 uint32_t request_merging;
 uint16_t num_queues;
+uint16_t queue_size;
 };
 
 struct VirtIOBlockDataPlane;
-- 
1.8.3.1




[Qemu-devel] [PATCH v2 2/2] virtio-blk: reject configs with logical block size > physical block size

2017-12-11 Thread Mark Kanda
virtio-blk logical block size should never be larger than physical block
size because it doesn't make sense to have such configurations. QEMU doesn't
have a way to effectively express this condition; the best it can do is
report the physical block exponent as 0 - indicating the logical block size
equals the physical block size.

This is identical to commit 3da023b5827543ee4c022986ea2ad9d1274410b2
but applied to virtio-blk (instead of virtio-scsi).

Signed-off-by: Mark Kanda <mark.ka...@oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
Reviewed-by: Ameya More <ameya.m...@oracle.com>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefa...@redhat.com>
---
 hw/block/virtio-blk.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index fb59f57..36c7608 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -952,6 +952,13 @@ static void virtio_blk_device_realize(DeviceState *dev, 
Error **errp)
 }
 blkconf_blocksizes(&conf->conf);
 
+if (conf->conf.logical_block_size >
+conf->conf.physical_block_size) {
+error_setg(errp,
+   "logical_block_size > physical_block_size not supported");
+return;
+}
+
 virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
 sizeof(struct virtio_blk_config));
 
-- 
1.8.3.1




[Qemu-devel] [PATCH v2 1/2] virtio-blk: make queue size configurable

2017-12-08 Thread Mark Kanda
Depending on the configuration, it can be beneficial to adjust the virtio-blk
queue size to something other than the current default of 128. Add a new
property to make the queue size configurable.

Signed-off-by: Mark Kanda <mark.ka...@oracle.com>
Reviewed-by: Karl Heubaum <karl.heub...@oracle.com>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
Reviewed-by: Ameya More <ameya.m...@oracle.com>
---
 hw/block/virtio-blk.c  | 10 +-
 include/hw/virtio/virtio-blk.h |  1 +
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 05d1440..fb59f57 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -928,6 +928,13 @@ static void virtio_blk_device_realize(DeviceState *dev, 
Error **errp)
 error_setg(errp, "num-queues property must be larger than 0");
 return;
 }
+if (!is_power_of_2(conf->queue_size) ||
+conf->queue_size > VIRTQUEUE_MAX_SIZE) {
+error_setg(errp, "invalid queue-size property (%" PRIu16 "), "
+   "must be a power of 2 (max %d)",
+   conf->queue_size, VIRTQUEUE_MAX_SIZE);
+return;
+}
 
 blkconf_serial(&conf->conf, &conf->serial);
 blkconf_apply_backend_options(&conf->conf,
@@ -953,7 +960,7 @@ static void virtio_blk_device_realize(DeviceState *dev, 
Error **errp)
 s->sector_mask = (s->conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;
 
 for (i = 0; i < conf->num_queues; i++) {
-virtio_add_queue(vdev, 128, virtio_blk_handle_output);
+virtio_add_queue(vdev, conf->queue_size, virtio_blk_handle_output);
 }
 virtio_blk_data_plane_create(vdev, conf, &s->dataplane, &err);
 if (err != NULL) {
@@ -1012,6 +1019,7 @@ static Property virtio_blk_properties[] = {
 DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
 true),
 DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1),
+DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128),
 DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD,
  IOThread *),
 DEFINE_PROP_END_OF_LIST(),
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index d3c8a6f..5117431 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -39,6 +39,7 @@ struct VirtIOBlkConf
 uint32_t config_wce;
 uint32_t request_merging;
 uint16_t num_queues;
+uint16_t queue_size;
 };
 
 struct VirtIOBlockDataPlane;
-- 
1.8.3.1




[Qemu-devel] [PATCH v2 2/2] virtio-blk: reject configs with logical block size > physical block size

2017-12-08 Thread Mark Kanda
virtio-blk logical block size should never be larger than physical block
size because it doesn't make sense to have such configurations. QEMU doesn't
have a way to effectively express this condition; the best it can do is
report the physical block exponent as 0 - indicating the logical block size
equals the physical block size.

This is identical to commit 3da023b5827543ee4c022986ea2ad9d1274410b2
but applied to virtio-blk (instead of virtio-scsi).

Signed-off-by: Mark Kanda <mark.ka...@oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
Reviewed-by: Ameya More <ameya.m...@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefa...@redhat.com>
---
 hw/block/virtio-blk.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index fb59f57..36c7608 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -952,6 +952,13 @@ static void virtio_blk_device_realize(DeviceState *dev, 
Error **errp)
 }
 blkconf_blocksizes(&conf->conf);
 
+if (conf->conf.logical_block_size >
+conf->conf.physical_block_size) {
+error_setg(errp,
+   "logical_block_size > physical_block_size not supported");
+return;
+}
+
 virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
 sizeof(struct virtio_blk_config));
 
-- 
1.8.3.1




[Qemu-devel] [PATCH v2 0/2] virtio-blk: miscellaneous changes

2017-12-08 Thread Mark Kanda
v2: add check for maximum queue size [Stefan]

This series is for two minor virtio-blk changes. The first patch
makes the virtio-blk queue size user configurable. The second patch
rejects logical block size > physical block configurations (similar
to a recent change in virtio-scsi).

Mark Kanda (2):
  virtio-blk: make queue size configurable
  virtio-blk: reject configs with logical block size > physical block
size

 hw/block/virtio-blk.c  | 17 -
 include/hw/virtio/virtio-blk.h |  1 +
 2 files changed, 17 insertions(+), 1 deletion(-)

-- 
1.8.3.1




[Qemu-devel] [PATCH 2/2] virtio-blk: reject configs with logical block size > physical block size

2017-12-06 Thread Mark Kanda
virtio-blk logical block size should never be larger than physical block
size because it doesn't make sense to have such configurations. QEMU doesn't
have a way to effectively express this condition; the best it can do is
report the physical block exponent as 0 - indicating the logical block size
equals the physical block size.

This is identical to commit 3da023b5827543ee4c022986ea2ad9d1274410b2
but applied to virtio-blk (instead of virtio-scsi).

Signed-off-by: Mark Kanda <mark.ka...@oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
Reviewed-by: Ameya More <ameya.m...@oracle.com>
---
 hw/block/virtio-blk.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 002c56f..acfca78 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -949,6 +949,13 @@ static void virtio_blk_device_realize(DeviceState *dev, 
Error **errp)
 }
 blkconf_blocksizes(&conf->conf);
 
+if (conf->conf.logical_block_size >
+conf->conf.physical_block_size) {
+error_setg(errp,
+   "logical_block_size > physical_block_size not supported");
+return;
+}
+
 virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
 sizeof(struct virtio_blk_config));
 
-- 
1.8.3.1




[Qemu-devel] [PATCH 0/2] virtio-blk: miscellaneous changes

2017-12-06 Thread Mark Kanda
This series is for two minor virtio-blk changes. The first patch
makes the virtio-blk queue size user configurable. The second patch
rejects logical block size > physical block configurations (similar
to a recent change in virtio-scsi).

Mark Kanda (2):
  virtio-blk: make queue size configurable
  virtio-blk: reject configs with logical block size > physical block
size

 hw/block/virtio-blk.c  | 14 +-
 include/hw/virtio/virtio-blk.h |  1 +
 2 files changed, 14 insertions(+), 1 deletion(-)

-- 
1.8.3.1




[Qemu-devel] [PATCH 1/2] virtio-blk: make queue size configurable

2017-12-06 Thread Mark Kanda
Depending on the configuration, it can be beneficial to adjust the virtio-blk
queue size to something other than the current default of 128. Add a new
property to make the queue size configurable.

Signed-off-by: Mark Kanda <mark.ka...@oracle.com>
Reviewed-by: Karl Heubaum <karl.heub...@oracle.com>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
Reviewed-by: Ameya More <ameya.m...@oracle.com>
---
 hw/block/virtio-blk.c  | 7 ++-
 include/hw/virtio/virtio-blk.h | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 05d1440..002c56f 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -928,6 +928,10 @@ static void virtio_blk_device_realize(DeviceState *dev, 
Error **errp)
 error_setg(errp, "num-queues property must be larger than 0");
 return;
 }
+if (!is_power_of_2(conf->queue_size)) {
+error_setg(errp, "queue-size property must be a power of 2");
+return;
+}
 
 blkconf_serial(&conf->conf, &conf->serial);
 blkconf_apply_backend_options(&conf->conf,
@@ -953,7 +957,7 @@ static void virtio_blk_device_realize(DeviceState *dev, 
Error **errp)
 s->sector_mask = (s->conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;
 
 for (i = 0; i < conf->num_queues; i++) {
-virtio_add_queue(vdev, 128, virtio_blk_handle_output);
+virtio_add_queue(vdev, conf->queue_size, virtio_blk_handle_output);
 }
 virtio_blk_data_plane_create(vdev, conf, &s->dataplane, &err);
 if (err != NULL) {
@@ -1012,6 +1016,7 @@ static Property virtio_blk_properties[] = {
 DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
 true),
 DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1),
+DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128),
 DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD,
  IOThread *),
 DEFINE_PROP_END_OF_LIST(),
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index d3c8a6f..5117431 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -39,6 +39,7 @@ struct VirtIOBlkConf
 uint32_t config_wce;
 uint32_t request_merging;
 uint16_t num_queues;
+uint16_t queue_size;
 };
 
 struct VirtIOBlockDataPlane;
-- 
1.8.3.1




[Qemu-devel] [PATCH] Reject virtio-scsi configurations with logical block size > physical block size

2017-10-16 Thread Mark Kanda
With virtio-scsi, logical block size should never be larger than
physical block size. From an ATA/SCSI perspective, it makes no sense
to have the logical block size greater than the physical block size,
and it cannot even be effectively expressed in the command set. The
whole point of adding the physical block size to the ATA/SCSI command
set was to communicate a desire for a larger block size (than logical),
while maintaining backwards compatibility with legacy 512 byte block
size.

This was found by setting logical_block_size > physical_block_size in
the QEMU command line, and discovering that it confuses Windows VMs -
fsutil reports both physical and logical block sizes are equal to the
logical size.

Example QEMU option:

  -device scsi-hd,drive=drive-scsi0,id=disk1,bus=scsi.0,
  physical_block_size=512,logical_block_size=4096

Windows Server 2012 R2 VM:

  C:\Users\Administrator>fsutil fsinfo ntfsinfo F:
  ...
  Bytes Per Sector  :   4096
  Bytes Per Physical Sector :   4096
  Bytes Per Cluster :   4096
  Bytes Per FileRecord Segment: 4096
  ...

Signed-off-by: Mark Kanda <mark.ka...@oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
---
 hw/scsi/scsi-disk.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 6e841fb..2a4f8c5 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -2329,6 +2329,14 @@ static void scsi_realize(SCSIDevice *dev, Error **errp)
 
  blkconf_serial(&s->qdev.conf, &s->serial);
  blkconf_blocksizes(&s->qdev.conf);
+
+if (s->qdev.conf.logical_block_size >
+s->qdev.conf.physical_block_size) {
+error_setg(errp,
+   "logical_block_size > physical_block_size not supported");
+return;
+}
+
 if (dev->type == TYPE_DISK) {
  blkconf_geometry(&dev->conf, NULL, 65535, 255, 255, &err);
 if (err) {
-- 
1.8.3.1