Re: [PATCH v2 1/7] migration/multifd: Add new migration option zero-page-detection.

2024-02-21 Thread Elena Ufimtseva
On Fri, Feb 16, 2024 at 2:41 PM Hao Xiang  wrote:

> This new parameter controls where the zero page checking is running.
> 1. If this parameter is set to 'legacy', zero page checking is
> done in the migration main thread.
> 2. If this parameter is set to 'none', zero page checking is disabled.
>
>
Hello Hao

Few questions and comments.

First the commit message states that the parameter control where the
checking is done, but it also controls
if sending of zero pages is done by multifd threads or not.



> Signed-off-by: Hao Xiang 
> ---
>  hw/core/qdev-properties-system.c| 10 ++
>  include/hw/qdev-properties-system.h |  4 
>  migration/migration-hmp-cmds.c  |  9 +
>  migration/options.c | 21 
>  migration/options.h |  1 +
>  migration/ram.c |  4 
>  qapi/migration.json | 30 ++---
>  7 files changed, 76 insertions(+), 3 deletions(-)
>
> diff --git a/hw/core/qdev-properties-system.c
> b/hw/core/qdev-properties-system.c
> index 1a396521d5..63843f18b5 100644
> --- a/hw/core/qdev-properties-system.c
> +++ b/hw/core/qdev-properties-system.c
> @@ -679,6 +679,16 @@ const PropertyInfo qdev_prop_mig_mode = {
>  .set_default_value = qdev_propinfo_set_default_value_enum,
>  };
>
> +const PropertyInfo qdev_prop_zero_page_detection = {
> +.name = "ZeroPageDetection",
> +.description = "zero_page_detection values, "
> +   "multifd,legacy,none",
> +.enum_table = &ZeroPageDetection_lookup,
> +.get = qdev_propinfo_get_enum,
> +.set = qdev_propinfo_set_enum,
> +.set_default_value = qdev_propinfo_set_default_value_enum,
> +};
> +
>  /* --- Reserved Region --- */
>
>  /*
> diff --git a/include/hw/qdev-properties-system.h
> b/include/hw/qdev-properties-system.h
> index 06c359c190..839b170235 100644
> --- a/include/hw/qdev-properties-system.h
> +++ b/include/hw/qdev-properties-system.h
> @@ -8,6 +8,7 @@ extern const PropertyInfo qdev_prop_macaddr;
>  extern const PropertyInfo qdev_prop_reserved_region;
>  extern const PropertyInfo qdev_prop_multifd_compression;
>  extern const PropertyInfo qdev_prop_mig_mode;
> +extern const PropertyInfo qdev_prop_zero_page_detection;
>  extern const PropertyInfo qdev_prop_losttickpolicy;
>  extern const PropertyInfo qdev_prop_blockdev_on_error;
>  extern const PropertyInfo qdev_prop_bios_chs_trans;
> @@ -47,6 +48,9 @@ extern const PropertyInfo
> qdev_prop_iothread_vq_mapping_list;
>  #define DEFINE_PROP_MIG_MODE(_n, _s, _f, _d) \
>  DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_mig_mode, \
> MigMode)
> +#define DEFINE_PROP_ZERO_PAGE_DETECTION(_n, _s, _f, _d) \
> +DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_zero_page_detection, \
> +   ZeroPageDetection)
>  #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \
>  DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_losttickpolicy, \
>  LostTickPolicy)
> diff --git a/migration/migration-hmp-cmds.c
> b/migration/migration-hmp-cmds.c
> index 99b49df5dd..7e96ae6ffd 100644
> --- a/migration/migration-hmp-cmds.c
> +++ b/migration/migration-hmp-cmds.c
> @@ -344,6 +344,11 @@ void hmp_info_migrate_parameters(Monitor *mon, const
> QDict *qdict)
>  monitor_printf(mon, "%s: %s\n",
>
>  MigrationParameter_str(MIGRATION_PARAMETER_MULTIFD_COMPRESSION),
>  MultiFDCompression_str(params->multifd_compression));
> +assert(params->has_zero_page_detection);
>

What is the reason to have assert here?


> +monitor_printf(mon, "%s: %s\n",
> +
> MigrationParameter_str(MIGRATION_PARAMETER_ZERO_PAGE_DETECTION),
> +qapi_enum_lookup(&ZeroPageDetection_lookup,
> +params->zero_page_detection));
>  monitor_printf(mon, "%s: %" PRIu64 " bytes\n",
>  MigrationParameter_str(MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE),
>  params->xbzrle_cache_size);
> @@ -634,6 +639,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const
> QDict *qdict)
>  p->has_multifd_zstd_level = true;
>  visit_type_uint8(v, param, &p->multifd_zstd_level, &err);
>  break;
> +case MIGRATION_PARAMETER_ZERO_PAGE_DETECTION:
> +p->has_zero_page_detection = true;
> +visit_type_ZeroPageDetection(v, param, &p->zero_page_detection,
> &err);
> +break;
>  case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE:
>  p->has_xbzrle_cache_size = true;
>  if (!visit_type_size(v, param, &cache_size, &err)) {
> diff --git a/migration/options.c b/migration/options.c
> index 3e3e0b93b4..3c603391b0 100644
> --- a/migration/options.c
> +++ b/migration/options.c
> @@ -179,6 +179,9 @@ Property migration_properties[] = {
>  DEFINE_PROP_MIG_MODE("mode", MigrationState,
>parameters.mode,
>MIG_MODE_NORMAL),
> +DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState,

Re: [PATCH v2 2/7] migration/multifd: Support for zero pages transmission in multifd format.

2024-02-21 Thread Elena Ufimtseva
On Fri, Feb 16, 2024 at 2:41 PM Hao Xiang  wrote:

> This change adds zero page counters and updates multifd send/receive
> tracing format to track the newly added counters.
>
> Signed-off-by: Hao Xiang 
> ---
>  migration/multifd.c| 43 ++
>  migration/multifd.h| 21 -
>  migration/ram.c|  1 -
>  migration/trace-events |  8 
>  4 files changed, 59 insertions(+), 14 deletions(-)
>
> diff --git a/migration/multifd.c b/migration/multifd.c
> index adfe8c9a0a..a33dba40d9 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -236,6 +236,8 @@ static void multifd_pages_reset(MultiFDPages_t *pages)
>   * overwritten later when reused.
>   */
>  pages->num = 0;
> +pages->normal_num = 0;
> +pages->zero_num = 0;
>  pages->block = NULL;
>  }
>

> @@ -309,6 +311,8 @@ static MultiFDPages_t *multifd_pages_init(uint32_t n)
>
>  pages->allocated = n;
>  pages->offset = g_new0(ram_addr_t, n);
> +pages->normal = g_new0(ram_addr_t, n);
> +pages->zero = g_new0(ram_addr_t, n);
>
>
 return pages;
>  }
> @@ -319,6 +323,10 @@ static void multifd_pages_clear(MultiFDPages_t *pages)
>  pages->allocated = 0;
>  g_free(pages->offset);
>  pages->offset = NULL;
> +g_free(pages->normal);
> +pages->normal = NULL;
> +g_free(pages->zero);
> +pages->zero = NULL;
>  g_free(pages);
>  }
>
> @@ -332,6 +340,7 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
>  packet->flags = cpu_to_be32(p->flags);
>  packet->pages_alloc = cpu_to_be32(p->pages->allocated);
>  packet->normal_pages = cpu_to_be32(pages->num);
> +packet->zero_pages = cpu_to_be32(pages->zero_num);
>  packet->next_packet_size = cpu_to_be32(p->next_packet_size);
>
>  packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num);
> @@ -350,9 +359,10 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
>
>  p->packets_sent++;
>  p->total_normal_pages += pages->num;
> +p->total_zero_pages += pages->zero_num;
>
> -trace_multifd_send(p->id, packet_num, pages->num, p->flags,
> -   p->next_packet_size);
> +trace_multifd_send(p->id, packet_num, pages->num, pages->zero_num,
> +   p->flags, p->next_packet_size);
>  }
>
>  static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
> @@ -393,20 +403,29 @@ static int
> multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
>  p->normal_num = be32_to_cpu(packet->normal_pages);
>  if (p->normal_num > packet->pages_alloc) {
>  error_setg(errp, "multifd: received packet "
> -   "with %u pages and expected maximum pages are %u",
> +   "with %u normal pages and expected maximum pages are
> %u",
> p->normal_num, packet->pages_alloc) ;
>  return -1;
>  }
>
> +p->zero_num = be32_to_cpu(packet->zero_pages);
> +if (p->zero_num > packet->pages_alloc - p->normal_num) {
> +error_setg(errp, "multifd: received packet "
> +   "with %u zero pages and expected maximum zero pages
> are %u",
> +   p->zero_num, packet->pages_alloc - p->normal_num) ;
> +return -1;
> +}


You could probably combine this check with normal_num against pages_alloc.

> +
>  p->next_packet_size = be32_to_cpu(packet->next_packet_size);
>  p->packet_num = be64_to_cpu(packet->packet_num);
>  p->packets_recved++;
>  p->total_normal_pages += p->normal_num;
> +p->total_zero_pages += p->zero_num;
>
> -trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->flags,
> -   p->next_packet_size);
> +trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->zero_num,
> +   p->flags, p->next_packet_size);
>
> -if (p->normal_num == 0) {
> +if (p->normal_num == 0 && p->zero_num == 0) {
>  return 0;
>  }
>
> @@ -823,6 +842,8 @@ static void *multifd_send_thread(void *opaque)
>
>  stat64_add(&mig_stats.multifd_bytes,
> p->next_packet_size + p->packet_len);
> +stat64_add(&mig_stats.normal_pages, pages->num);
>

That seems wrong. pages->num is the number of pages total in the packet.
But next patch changes it, so I suggest or change it here and not in 3/7.

+stat64_add(&mig_stats.zero_pages, pages->zero_num);
>
>  multifd_pages_reset(p->pages);
>  p->next_packet_size = 0;
> @@ -866,7 +887,8 @@ out:
>
>  rcu_unregister_thread();
>  migration_threads_remove(thread);
> -trace_multifd_send_thread_end(p->id, p->packets_sent,
> p->total_normal_pages);
> +trace_multifd_send_thread_end(p->id, p->packets_sent,
> p->total_normal_pages,
> +  p->total_zero_pages);
>
>  return NULL;
>  }
> @@ -1132,6 +1154,8 @@ static void
> multifd_recv_cleanup_channel(MultiFDRecvParams *p)
>  p-

Re: [PATCH v2 3/7] migration/multifd: Zero page transmission on the multifd thread.

2024-02-21 Thread Elena Ufimtseva
On Fri, Feb 16, 2024 at 2:42 PM Hao Xiang  wrote:

> 1. Implements the zero page detection and handling on the multifd
> threads for non-compression, zlib and zstd compression backends.
> 2. Added a new value 'multifd' in ZeroPageDetection enumeration.
> 3. Add proper asserts to ensure pages->normal are used for normal pages
> in all scenarios.
>
> Signed-off-by: Hao Xiang 
> ---
>  migration/meson.build |  1 +
>  migration/multifd-zero-page.c | 59 +++
>  migration/multifd-zlib.c  | 26 ---
>  migration/multifd-zstd.c  | 25 ---
>  migration/multifd.c   | 50 +++--
>  migration/multifd.h   |  7 +
>  qapi/migration.json   |  4 ++-
>  7 files changed, 151 insertions(+), 21 deletions(-)
>  create mode 100644 migration/multifd-zero-page.c
>
> diff --git a/migration/meson.build b/migration/meson.build
> index 92b1cc4297..1eeb915ff6 100644
> --- a/migration/meson.build
> +++ b/migration/meson.build
> @@ -22,6 +22,7 @@ system_ss.add(files(
>'migration.c',
>'multifd.c',
>'multifd-zlib.c',
> +  'multifd-zero-page.c',
>'ram-compress.c',
>'options.c',
>'postcopy-ram.c',
> diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
> new file mode 100644
> index 00..f0cd8e2c53
> --- /dev/null
> +++ b/migration/multifd-zero-page.c
> @@ -0,0 +1,59 @@
> +/*
> + * Multifd zero page detection implementation.
> + *
> + * Copyright (c) 2024 Bytedance Inc
> + *
> + * Authors:
> + *  Hao Xiang 
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/cutils.h"
> +#include "exec/ramblock.h"
> +#include "migration.h"
> +#include "multifd.h"
> +#include "options.h"
> +#include "ram.h"
> +
> +void multifd_zero_page_check_send(MultiFDSendParams *p)
> +{
> +/*
> + * QEMU older than 9.0 don't understand zero page
> + * on multifd channel. This switch is required to
> + * maintain backward compatibility.
> + */
> +bool use_multifd_zero_page =
> +(migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD);
> +MultiFDPages_t *pages = p->pages;
> +RAMBlock *rb = pages->block;
> +
> +assert(pages->num != 0);
>

Not needed, the check is done right before calling send_prepare.


> +assert(pages->normal_num == 0);
> +assert(pages->zero_num == 0);
>

Why these asserts are needed?

> +
>
+for (int i = 0; i < pages->num; i++) {
> +uint64_t offset = pages->offset[i];
> +if (use_multifd_zero_page &&
> +buffer_is_zero(rb->host + offset, p->page_size)) {
> +pages->zero[pages->zero_num] = offset;
> +pages->zero_num++;
> +ram_release_page(rb->idstr, offset);
> +} else {
> +pages->normal[pages->normal_num] = offset;
> +pages->normal_num++;
> +}
> +}
> +}
> +
> +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
> +{
> +for (int i = 0; i < p->zero_num; i++) {
> +void *page = p->host + p->zero[i];
> +if (!buffer_is_zero(page, p->page_size)) {
> +memset(page, 0, p->page_size);
> +}
> +}
> +}
> diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> index 012e3bdea1..cdfe0fa70e 100644
> --- a/migration/multifd-zlib.c
> +++ b/migration/multifd-zlib.c
> @@ -123,13 +123,20 @@ static int zlib_send_prepare(MultiFDSendParams *p,
> Error **errp)
>  int ret;
>  uint32_t i;
>
> +multifd_zero_page_check_send(p);
> +
> +if (!pages->normal_num) {
> +p->next_packet_size = 0;
> +goto out;
> +}
> +
>  multifd_send_prepare_header(p);
>
> -for (i = 0; i < pages->num; i++) {
> +for (i = 0; i < pages->normal_num; i++) {
>  uint32_t available = z->zbuff_len - out_size;
>  int flush = Z_NO_FLUSH;
>
> -if (i == pages->num - 1) {
> +if (i == pages->normal_num - 1) {
>  flush = Z_SYNC_FLUSH;
>  }
>
> @@ -138,7 +145,7 @@ static int zlib_send_prepare(MultiFDSendParams *p,
> Error **errp)
>   * with compression. zlib does not guarantee that this is safe,
>   * therefore copy the page before calling deflate().
>   */
> -memcpy(z->buf, p->pages->block->host + pages->offset[i],
> p->page_size);
> +memcpy(z->buf, p->pages->block->host + pages->normal[i],
> p->page_size);
>  zs->avail_in = p->page_size;
>  zs->next_in = z->buf;
>
> @@ -172,10 +179,10 @@ static int zlib_send_prepare(MultiFDSendParams *p,
> Error **errp)
>  p->iov[p->iovs_num].iov_len = out_size;
>  p->iovs_num++;
>  p->next_packet_size = out_size;
> -p->flags |= MULTIFD_FLAG_ZLIB;
>
> +out:
> +p->flags |= MULTIFD_FLAG_ZLIB;
>  multifd_send_fill_packet(p);
> -
>
Spurious?

 return 0;
>  }
>
> @@ -261,6 +268,14 @@ s

Re: [PATCH v2 4/7] migration/multifd: Enable zero page checking from multifd threads.

2024-02-21 Thread Elena Ufimtseva
On Fri, Feb 16, 2024 at 2:42 PM Hao Xiang  wrote:

> This change adds a dedicated handler for
> MigrationOps::ram_save_target_page in
> multifd live migration. Now zero page checking can be done in the multifd
> threads
> and this becomes the default configuration. We still provide backward
> compatibility
> where zero page checking is done from the migration main thread.
>
> Signed-off-by: Hao Xiang 
> ---
>  migration/multifd.c |  1 +
>  migration/options.c |  2 +-
>  migration/ram.c | 53 ++---
>  3 files changed, 42 insertions(+), 14 deletions(-)
>
> diff --git a/migration/multifd.c b/migration/multifd.c
> index fbb40ea10b..ef5dad1019 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -13,6 +13,7 @@
>  #include "qemu/osdep.h"
>  #include "qemu/cutils.h"
>  #include "qemu/rcu.h"
> +#include "qemu/cutils.h"
>  #include "exec/target_page.h"
>  #include "sysemu/sysemu.h"
>  #include "exec/ramblock.h"
> diff --git a/migration/options.c b/migration/options.c
> index 3c603391b0..3c79b6ccd4 100644
> --- a/migration/options.c
> +++ b/migration/options.c
> @@ -181,7 +181,7 @@ Property migration_properties[] = {
>MIG_MODE_NORMAL),
>  DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState,
> parameters.zero_page_detection,
> -   ZERO_PAGE_DETECTION_LEGACY),
> +   ZERO_PAGE_DETECTION_MULTIFD),
>
>  /* Migration capabilities */
>  DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
> diff --git a/migration/ram.c b/migration/ram.c
> index 5ece9f042e..b088c5a98c 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -1123,10 +1123,6 @@ static int save_zero_page(RAMState *rs,
> PageSearchStatus *pss,
>  QEMUFile *file = pss->pss_channel;
>  int len = 0;
>
> -if (migrate_zero_page_detection() != ZERO_PAGE_DETECTION_LEGACY) {
> -return 0;
> -}
> -
>  if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) {
>  return 0;
>  }
> @@ -1256,6 +1252,10 @@ static int ram_save_page(RAMState *rs,
> PageSearchStatus *pss)
>
>  static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset)
>  {
> +assert(migrate_multifd());
>
We only call ram_save_multifd_page() if:
 if (migrate_multifd()) {
migration_ops->ram_save_target_page = ram_save_target_page_multifd;
So this assert is not needed.

+assert(!migrate_compress());
>
+assert(!migration_in_postcopy());
>
These two are redundant and done before we call in here.

+
>  if (!multifd_queue_page(block, offset)) {
>  return -1;
>  }
> @@ -2046,7 +2046,6 @@ static bool save_compress_page(RAMState *rs,
> PageSearchStatus *pss,
>   */
>  static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus
> *pss)
>  {
> -RAMBlock *block = pss->block;
>  ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
>  int res;
>
> @@ -2062,17 +2061,40 @@ static int ram_save_target_page_legacy(RAMState
> *rs, PageSearchStatus *pss)
>  return 1;
>  }
>
> +return ram_save_page(rs, pss);
> +}
> +
> +/**
> + * ram_save_target_page_multifd: save one target page
> + *
> + * Returns the number of pages written
> + *
> + * @rs: current RAM state
> + * @pss: data about the page we want to send
> + */
> +static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus
> *pss)
> +{
> +RAMBlock *block = pss->block;
> +ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
> +
> +/* Multifd is not compatible with old compression. */
> +assert(!migrate_compress());
>
Do we need to check this for every page?


> +/* Multifd is not compabible with postcopy. */
> +assert(!migration_in_postcopy());
> +
>  /*
> - * Do not use multifd in postcopy as one whole host page should be
> - * placed.  Meanwhile postcopy requires atomic update of pages, so
> even
> - * if host page size == guest page size the dest guest during run may
> - * still see partially copied pages which is data corruption.
> + * Backward compatibility support. While using multifd live
> + * migration, we still need to handle zero page checking on the
> + * migration main thread.
>   */
> -if (migrate_multifd() && !migration_in_postcopy()) {
> -return ram_save_multifd_page(block, offset);
> +if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) {
> +if (save_zero_page(rs, pss, offset)) {
> +return 1;
> +}
>  }
>
> -return ram_save_page(rs, pss);
> +return ram_save_multifd_page(block, offset);
>  }
>
>  /* Should be called before sending a host page */
> @@ -2984,7 +3006,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>  }
>
>  migration_ops = g_malloc0(sizeof(MigrationOps));
> -migration_ops->ram_save_target_page = ram_save_target_page_legacy;
> +
> +if (migrate_multifd()) {
> +migratio

Re: [PATCH v3 3/6] util/bufferiszero: remove AVX512 variant

2024-02-06 Thread Elena Ufimtseva
Hello Alexander

On Tue, Feb 6, 2024 at 12:50 PM Alexander Monakov 
wrote:

> Thanks to early checks in the inline buffer_is_zero wrapper, the SIMD
> routines are invoked much more rarely in normal use when most buffers
> are non-zero. This makes use of AVX512 unprofitable, as it incurs extra
> frequency and voltage transition periods during which the CPU operates
> at reduced performance, as described in
> https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html


I would like to point out that the frequency scaling is not currently an
issue on AMD Zen4 Genoa CPUs, for example.
And microcode architecture description here:
https://www.amd.com/system/files/documents/4th-gen-epyc-processor-architecture-white-paper.pdf
Although, the cpu frequency downscaling mentioned in the above document is
only in relation to floating point operations.
But from other online discussions I gather that the data path for the
integer registers in Zen4 is also 256 bits and it allows to avoid
frequency downscaling for FP and heavy instructions.
And looking at the optimizations for AVX2 in your other patch, would
unrolling the loop for AVX512 ops benefit from the speedup taken that the
data path has the same width?
If the frequency downscaling is not observed on some of the CPUs, can
AVX512 be maintained and used selectively for some
of the CPUs?

Thank you!


>
>
> Signed-off-by: Mikhail Romanov 
> Signed-off-by: Alexander Monakov 
> ---
>  util/bufferiszero.c | 36 ++--
>  1 file changed, 2 insertions(+), 34 deletions(-)
>
> diff --git a/util/bufferiszero.c b/util/bufferiszero.c
> index 01050694a6..c037d11d04 100644
> --- a/util/bufferiszero.c
> +++ b/util/bufferiszero.c
> @@ -64,7 +64,7 @@ buffer_is_zero_len_4_plus(const void *buf, size_t len)
>  }
>  }
>
> -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) ||
> defined(__SSE2__)
> +#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
>  #include 
>
>  /* Note that each of these vectorized functions require len >= 64.  */
> @@ -128,35 +128,6 @@ buffer_zero_avx2(const void *buf, size_t len)
>  }
>  #endif /* CONFIG_AVX2_OPT */
>
> -#ifdef CONFIG_AVX512F_OPT
> -static bool __attribute__((target("avx512f")))
> -buffer_zero_avx512(const void *buf, size_t len)
> -{
> -/* Begin with an unaligned head of 64 bytes.  */
> -__m512i t = _mm512_loadu_si512(buf);
> -__m512i *p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
> -__m512i *e = (__m512i *)(((uintptr_t)buf + len) & -64);
> -
> -/* Loop over 64-byte aligned blocks of 256.  */
> -while (p <= e) {
> -__builtin_prefetch(p);
> -if (unlikely(_mm512_test_epi64_mask(t, t))) {
> -return false;
> -}
> -t = p[-4] | p[-3] | p[-2] | p[-1];
> -p += 4;
> -}
> -
> -t |= _mm512_loadu_si512(buf + len - 4 * 64);
> -t |= _mm512_loadu_si512(buf + len - 3 * 64);
> -t |= _mm512_loadu_si512(buf + len - 2 * 64);
> -t |= _mm512_loadu_si512(buf + len - 1 * 64);
> -
> -return !_mm512_test_epi64_mask(t, t);
> -
> -}
> -#endif /* CONFIG_AVX512F_OPT */
> -
>  static unsigned __attribute__((noinline))
>  select_accel_cpuinfo(unsigned info)
>  {
> @@ -165,9 +136,6 @@ select_accel_cpuinfo(unsigned info)
>  unsigned bit;
>  bool (*fn)(const void *, size_t);
>  } all[] = {
> -#ifdef CONFIG_AVX512F_OPT
> -{ CPUINFO_AVX512F, buffer_zero_avx512 },
> -#endif
>  #ifdef CONFIG_AVX2_OPT
>  { CPUINFO_AVX2,buffer_zero_avx2 },
>  #endif
> @@ -191,7 +159,7 @@ static unsigned used_accel
>  = 0;
>  #endif
>
> -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
> +#if defined(CONFIG_AVX2_OPT)
>  static void __attribute__((constructor)) init_accel(void)
>  {
>  used_accel = select_accel_cpuinfo(cpuinfo_init());
> --
> 2.32.0
>
>
>

-- 
Elena


Re: [Qemu-devel] [multiprocess RFC PATCH 36/37] multi-process: add the concept description to docs/devel/qemu-multiprocess

2019-05-07 Thread Elena Ufimtseva
On Mon, Mar 11, 2019 at 10:20:06AM +, Daniel P. Berrangé wrote:
> On Thu, Mar 07, 2019 at 03:29:41PM -0800, John G Johnson wrote:
> > 
> > 

Hi Daniel, Stefan

We have not replied in a while as we were trying to figure out
the best approach after multiple comments we have received on the
patch series.

Leaving other concerns that you, Stefan and others shared with us
out of this particular topic, we would like to get your opinion on
the following approach.

Please see below.

> > > On Mar 7, 2019, at 11:27 AM, Stefan Hajnoczi  wrote:
> > > 
> > > On Thu, Mar 07, 2019 at 02:51:20PM +, Daniel P. Berrangé wrote:
> > >> I guess one obvious answer is that the existing security mechanisms like
> > >> SELinux/ApArmor/DAC can be made to work in a more fine grained manner if
> > >> there are distinct processes. This would allow for a more useful seccomp
> > >> filter to better protect against secondary kernel exploits should QEMU
> > >> itself be exploited, if we can protect individual components.
> > > 
> > > Fine-grained sandboxing is possible in theory but tedious in practice.
> > > From what I can tell this patch series doesn't implement any sandboxing
> > > for child processes.
> > > 
> > 
> > The policies aren’t in QEMU, but in the selinux config files.
> > They would say, for example, that when the QEMU process exec()s the
> > disk emulation process, the process security context type transitions
> > to a new type.  This type would have permission to access the VM image
> > objects, whereas the QEMU process type (and any other device emulation
> > process types) cannot access them.
> 
> Note that currently all QEMU instances run by libvirt have seccomp
> policy applied that explicitly forbids any use of fork+exec as a way
> to reduce avenues of attack for an exploited QEMU.
> 
> Even in a modularized QEMU I'd be loathe to allow QEMU to have the
> fork+exec privileged, unless "QEMU" in this case was just a stub
> process that does nothing more than fork+exec the other binaries,
> while having zero attack exposed to the untrusted guest OS.

We see libvirt uses QEMU’s -sandbox option to indicate that QEMU
should use seccomp() to prohibit future use of certain system calls,
including fork() and exec().  Our idea is to enumerate the remote
processes needed via QEMU command line options, and have QEMU exec()
those processes before -sandbox is processed.
And we also will init seccomp for emulated devices processes.

> 
> > If you wanted to use DAC, you could do the something similar by
> > making the disk emulation executable setuid to a UID than can access
> > VM image files.
> > 
> > In either case, the policies and permissions are set up before
> > libvirt even runs, so it doesn’t need to be aware of them.
> 
> That's not the case bearing in mind the above point about fork+exec
> being forbidden. It would likely require libvirt to be in charge of
> spawning the various helper binaries from a trusted context.
> 
> 
> > > How to do this in practice must be clear from the beginning if
> > > fine-grained sandboxing is the main selling point.
> > > 
> > > Some details to start the discussion:
> > > 
> > > * How will fine-grained SELinux/AppArmor/DAC policies be configured for
> > >   each process?  I guess this requires root, so does libvirt need to
> > >   know about each process?
> > > 
> > 
> > The polices would apply to process security context types (or
> > UIDs in a DAC regime), so I would not expect libvirt to be aware of them.
> 
> I'm pretty skeptical that such a large modularization of QEMU can be
> done without libvirt being aware of it & needing some kind of changes
> applied.
>

We agree with that. With above proposed approach we still have to change hotplug
in some way.
If a eparate process will be spawned, libvirt will be the one doing
fork/exec of the separate processes. Or possibly launch a helper
binaries that will unify the way how an instance is being started with
multiple processes and hotplugging.

Thanks!
Elena, Jag, John.


> 
> Regards,
> Daniel
> -- 
> |: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org -o-https://fstop138.berrange.com :|
> |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|



Re: [Qemu-devel] [multiprocess RFC PATCH 36/37] multi-process: add the concept description to docs/devel/qemu-multiprocess

2019-06-12 Thread Elena Ufimtseva
On Wed, Jun 12, 2019 at 05:24:13PM +0100, Stefan Hajnoczi wrote:
> On Thu, May 30, 2019 at 01:54:35PM -0700, Elena Ufimtseva wrote:
> > On Tue, May 28, 2019 at 08:18:20AM -0700, Elena Ufimtseva wrote:
> > > On Thu, May 23, 2019 at 12:11:30PM +0100, Stefan Hajnoczi wrote:
> > > > Hi Jag and Elena,
> > > > Do you think a call would help to move discussion along more quickly?
> > > >
> > > 
> > > Hi Stefan,
> > > 
> > > We would like to join this call.
> > > And thank you inviting us!
> > > 
> > > Elena
> > > > We could use the next KVM Community Call on June 4th to discuss
> > > > remaining concerns and the next steps:
> > > > https://calendar.google.com/calendar/embed?src=dG9iMXRqcXAzN3Y4ZXZwNzRoMHE4a3BqcXNAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ
> > > >
> > > > I also hope to include other core QEMU developers.  As you know, I'm
> > > > skeptical, but it could be just me and I don't want to block you
> > > > unnecessarily if others are more enthusiastic about this approach.
> > > >
> > 
> > Hi Stefan
> > 
> > A few questions we have are about the call.
> > What is the format of the call usually? Should we provide some kind of the 
> > project outline for 5 minutes?
> > We are planning to address some of the concerns you have voiced in regards 
> > to amount of changes, usability,
> > security and performance. I assume there will be other questions as well. 
> > Is there any time limit per topic?
> > 
> > And would you mind sharing the call details with us?
> 
> Hi Elena and Jag,

Hi Stefan,

> Sorry, I was away on sick leave. 

Ah, sorry about that - we have guessed that you were away, but thought
people were mostly on vacation.

> The KVM Community Call is informal.
> The goal is to get people together in a teleconference where we can
> discuss topics much more quickly than on the mailing list.  This can
> help make progress in areas where the mailing list discussion seems to
> be making slow progress.
> 
> I would suggest starting with a status update the describes your
> current approach (without assuming the audience has familiarity).  Then
> you could touch on any issues where you'd like input from the community
> and you could take questions.
> 
> Our goal should be to get a consensus on whether disaggregated QEMU can
> be merged or not.
>

Thanks!
> Here are the calendar details (Tuesday, June 18th at 8:00 UTC):
> https://calendar.google.com/calendar/ical/tob1tjqp37v8evp74h0q8kpjqs%40group.calendar.google.com/public/basic.ics
> 
> Is this time okay for you?

Yes, this time is fine.
Do you have dial-in info for us?

Thank you!

Elena, Jag and JJ
> 
> Stefan





Re: [Qemu-devel] [multiprocess RFC PATCH 36/37] multi-process: add the concept description to docs/devel/qemu-multiprocess

2019-03-08 Thread Elena Ufimtseva
On Thu, Mar 07, 2019 at 03:16:42PM +0100, Kevin Wolf wrote:
> Am 07.03.2019 um 09:14 hat Thomas Huth geschrieben:
> > On 07/03/2019 08.22, elena.ufimts...@oracle.com wrote:
> > > From: Elena Ufimtseva 
> > > 
> > > TODO: Make relevant changes to the doc.
> > > 
> > > Signed-off-by: John G Johnson 
> > > Signed-off-by: Elena Ufimtseva 
> > > Signed-off-by: Jagannathan Raman 
> > > ---
> > >  docs/devel/qemu-multiprocess.txt | 1109 
> > > ++
> > >  1 file changed, 1109 insertions(+)
> > >  create mode 100644 docs/devel/qemu-multiprocess.txt
> > > 
> > > diff --git a/docs/devel/qemu-multiprocess.txt 
> > > b/docs/devel/qemu-multiprocess.txt
> > > new file mode 100644
> > > index 000..e29c6c8
> > > --- /dev/null
> > > +++ b/docs/devel/qemu-multiprocess.txt
> > > @@ -0,0 +1,1109 @@
> > > +/*
> > > + * Copyright 2019, Oracle and/or its affiliates. All rights reserved.
> > > + *
> > > + * Permission is hereby granted, free of charge, to any person obtaining 
> > > a copy
> > > + * of this software and associated documentation files (the "Software"), 
> > > to deal
> > > + * in the Software without restriction, including without limitation the 
> > > rights
> > > + * to use, copy, modify, merge, publish, distribute, sublicense, and/or 
> > > sell
> > > + * copies of the Software, and to permit persons to whom the Software is
> > > + * furnished to do so, subject to the following conditions:
> > > + *
> > > + * The above copyright notice and this permission notice shall be 
> > > included in
> > > + * all copies or substantial portions of the Software.
> > > + *
> > > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
> > > EXPRESS OR
> > > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
> > > MERCHANTABILITY,
> > > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 
> > > SHALL
> > > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 
> > > OTHER
> > > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
> > > ARISING FROM,
> > > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
> > > DEALINGS IN
> > > + * THE SOFTWARE.
> > > + */
> > 
> > Somehow weird to see such a big license statement talking about
> > "software", but which applies to a text file only... Not sure if it is
> > an option for you, but maybe one of the Creative Common licenses
> > (dual-licensed with the GPLv2+) would be a better fit? E.g. for the QEMU
> > website, the content is dual-licensed: https://www.qemu.org/license.html
> 

Thanks Thomas,
working on figuring this part out.

> While we're talking about licenses, the "All rights reserved." notice is
> out of place in a license header that declares that a lot of permissions
> are granted. Better to remove it to avoid any ambiguities that could
> result from the contradiction. (Applies to the whole series.)
>

Thanks Kevin,

This will be removed.

Elena

> Kevin



Re: [Qemu-devel] [multiprocess RFC PATCH 21/37] multi-process: QMP/HMP commands to add a device to the remote process

2019-03-08 Thread Elena Ufimtseva
On Thu, Mar 07, 2019 at 10:39:54AM +, Dr. David Alan Gilbert wrote:
> * elena.ufimts...@oracle.com (elena.ufimts...@oracle.com) wrote:
> > From: Jagannathan Raman 
> > 
> > Adds rdevice_add QMP & HMP commands to hotplug device to a remote device.
> > 
> > Signed-off-by: Jagannathan Raman 
> > Signed-off-by: John G Johnson 
> > Signed-off-by: Elena Ufimtseva 
> > ---
> >  hmp-commands.hx | 14 
> >  hmp.h   |  1 +
> >  hw/proxy/monitor.c  | 92 
> > +
> >  include/io/proxy-link.h |  2 ++
> >  include/monitor/qdev.h  |  4 +++
> >  monitor.c   |  5 +++
> >  qapi/misc.json  | 22 
> >  remote/remote-main.c| 48 ++
> >  8 files changed, 188 insertions(+)
> > 
> > diff --git a/hmp-commands.hx b/hmp-commands.hx
> > index fb3c8ba..7e8e8ab 100644
> > --- a/hmp-commands.hx
> > +++ b/hmp-commands.hx
> > @@ -727,6 +727,20 @@ ETEXI
> >  
> >  #if defined(CONFIG_MPQEMU)
> >  {
> > +.name   = "rdevice_add",
> > +.args_type  = "rdev_id:s,driver:s,id:s,drive:s,bus:s",
> > +.params = "rdev_id driver id drive bus",
> > +.help   = "add device to remote proc, like -rdevice on the 
> > command line",
> > +.cmd= hmp_rdevice_add,
> > +},
> > +
> > +STEXI
> > +@item rdevice_add @var{config}
> > +@findex rdevice_add
> > +Add device to remote proc.
> > +ETEXI
> > +
> > +{
> >  .name   = "remote_proc_list",
> >  .args_type  = "",
> >  .params = "",
> > diff --git a/hmp.h b/hmp.h
> > index 0940634..355a27e 100644
> > --- a/hmp.h
> > +++ b/hmp.h
> > @@ -150,5 +150,6 @@ void hmp_info_vm_generation_id(Monitor *mon, const 
> > QDict *qdict);
> >  void hmp_info_memory_size_summary(Monitor *mon, const QDict *qdict);
> >  void hmp_info_sev(Monitor *mon, const QDict *qdict);
> >  void hmp_remote_proc_list(Monitor *mon, const QDict *qdict);
> > +void hmp_rdevice_add(Monitor *mon, const QDict *qdict);
> >  
> >  #endif
> > diff --git a/hw/proxy/monitor.c b/hw/proxy/monitor.c
> > index 3005eec..2e2cda0 100644
> > --- a/hw/proxy/monitor.c
> > +++ b/hw/proxy/monitor.c
> > @@ -23,6 +23,7 @@
> >   */
> >  
> >  #include 
> > +#include 
> >  
> >  #include "qemu/osdep.h"
> >  #include "qapi/qapi-types-block-core.h"
> > @@ -33,6 +34,13 @@
> >  #include "hw/boards.h"
> >  #include "hw/i386/pc.h"
> >  #include "hw/proxy/qemu-proxy.h"
> > +#include "qapi/qapi-commands-misc.h"
> > +#include "monitor/qdev.h"
> > +#include "qapi/qmp/qdict.h"
> > +#include "qapi/qmp/qjson.h"
> > +#include "qapi/qmp/qstring.h"
> > +#include "qapi/error.h"
> > +#include "io/proxy-link.h"
> >  
> >  /*
> >   * TODO: Is there a callback where the allocated memory for QMP could be 
> > free'd
> > @@ -87,3 +95,87 @@ void hmp_remote_proc_list(Monitor *mon, const QDict 
> > *qdict)
> > pdev->remote_pid, pdev->rid, id, k->command);
> >  }
> >  }
> > +
> > +static PCIProxyDev *get_proxy_device(QDict *qdict, Error **errp)
> > +{
> > +PCMachineState *pcms = PC_MACHINE(current_machine);
> > +PCIProxyDev *pdev = NULL;
> > +const char *rdev_id;
> > +
> > +if (!qdict_haskey(qdict, "rdev_id")) {
> > +error_setg(errp, "Please specify a value for rdev_id");
> > +return NULL;
> > +}
> > +
> > +rdev_id = qdict_get_str(qdict, "rdev_id");
> > +
> > +pdev = (PCIProxyDev *)g_hash_table_lookup(pcms->remote_devs, rdev_id);
> > +if (!pdev) {
> > +error_setg(errp,
> > +   "No remote device by ID %s. Use query-remote command to 
> > get remote devices",
> > +   rdev_id);
> > +}
> > +
> > +return pdev;
> > +}
> > +
> > +static void rdevice_add_del(QDict *qdict, proc_cmd_t cmd, Error **errp)
> > +{
> > +PCMachineState *pcms = PC_MACHINE(current_machine);
> > +ProcMsg msg = {0};
> > +PCIProxyDev *pdev = NULL;
> > +const char *id;
> > +QString *json;
> >

Re: [Qemu-devel] [multiprocess RFC PATCH 00/37] Initial support of multi-process qemu

2019-03-08 Thread Elena Ufimtseva
On Thu, Mar 07, 2019 at 02:27:57PM +0100, Marc-André Lureau wrote:
> Hi
> 
> On Thu, Mar 7, 2019 at 11:46 AM Stefan Hajnoczi  wrote:
> >
> > On Wed, Mar 06, 2019 at 11:20:25PM -0800, elena.ufimts...@oracle.com wrote:
> > > From: Elena Ufimtseva 
> > >
> > > Initial support of multi-process qemu
> >
> > Hi Elena,
> > Please use the following setting when sending future patch series:
> >
> >   $ git config sendemail.thread shallow
> >
> > This way all patches are part of a single email thread (starting with
> > your PATCH 00 cover letter).  Reviewers find this more convenient so
> > that individual emails don't get separated and lost.

Thanks Stefan, will do.
> >
> 
> Please also check that there is no regression after each commit. In
> particular, the build shouldn't fail, so we can easily study, apply
> and bisect patches one by one.
> 


Yep, we try to make sure each commit builds and will automate for future
series as well, maybe with some simple tests.

Elena

> thanks
> 
> 
> -- 
> Marc-André Lureau



Re: [Qemu-devel] [multiprocess RFC PATCH 35/37] multi-process: QMP/HMP commands to resize block device on remote process

2019-03-14 Thread Elena Ufimtseva
On Thu, Mar 07, 2019 at 10:15:36AM -0600, Eric Blake wrote:
> On 3/7/19 1:22 AM, elena.ufimts...@oracle.com wrote:
> > From: Jagannathan Raman 
> > 
> > Adds rblock_resize QMP/HMP commands to resize block devices on the remote
> > process.
> > 
> > Signed-off-by: John G Johnson 
> > Signed-off-by: Jagannathan Raman 
> > Signed-off-by: Elena Ufimtseva 
> > ---
> 
> I know the discussion is questioning whether this is even the right way
> to go, but if we DO add a QMP command,
> 
> > +++ b/qapi/block-core.json
> > @@ -1260,6 +1260,31 @@
> >  'size': 'int' } }
> >  
> >  ##
> > +# @rblock_resize:
> 
> It should be named 'rblock-resize'
>

Ok. 
> > +#
> > +# Resize a block image while a guest is running, on a remote device.
> > +#
> > +# @device: the name of the device to get the image resized
> > +#
> > +# @size:  new image size in bytes
> > +#
> > +# Returns: nothing on success
> > +#  If @device is not a valid block device, DeviceNotFound
> > +#
> > +# Since: 3.0.93
> 
> and you've missed 3.0 (if you got it in THIS week, it would be Since
> 4.0; but that's unlikely, so you want Since 4.1).

Got it,

Thanks Eric!

Elena
> 
> -- 
> Eric Blake, Principal Software Engineer
> Red Hat, Inc.   +1-919-301-3226
> Virtualization:  qemu.org | libvirt.org
> 






Re: [RFC v4 PATCH 00/49] Initial support of multi-process qemu - status update

2019-12-09 Thread Elena Ufimtseva


Hi

We would like to give a short update to the community about the multi-process 
project.

Firstly, we appreciate the feedback and all productive discussions we had
at KVM 2019 forum.
As an outcome of the conference, we have switched gears and are investigating
the ways of using the muser framework in our project.

At this moment we are working on the evaluation and a first prototype
of qemu-multiprocess based on muser framework.
We first heard about it at the conference from the presentation given by
Thanos Makatos and Swapnil Ingle from Nutanix.
Their presentation is available 
https://static.sched.com/hosted_files/kvmforum2019/3b/muser.pdf
 along with github link to the source repo.
After the conversation we had with a group of people including Felipe Franciosi,
Stefan Hajnoczi, Daniel Berrangé, Konrad Wilk, Peter Maydell, John Jonson and 
few others
(apologies if some names are missing), we have gathered important answers on 
how to move
forward with qemu-multiprocess.

At this moment we are working on the first stage of the project with help of
the Nutanix developers.
The questions we have gathered so far will be addressed with muser
and Qemu developers after we finish the first stage and make sure we understand
what it will take for us to move onto the next stage.

We will also incorporate relevant review from Stefan that he provided
on the series 4 of the patchset. Thank you Stefan.

If anyone have any further suggestions or questions about the status,
please reply to this email.

Thank you

JJ, Jag & Elena

On Thu, Oct 24, 2019 at 05:08:41AM -0400, Jagannathan Raman wrote:
> Started with the presentation in October 2017 made by Marc-Andre (Red Hat)
> and Konrad Wilk (Oracle) [1], and continued by Jag's BoF at KVM Forum 2018,
> the multi-process project is now a prototype and presented in this patchset.
> John & Elena will present the status of this project in KVM Forum 2019.
> 
> This first series enables the emulation of lsi53c895a in a separate process.
> 
> We posted the Proof Of Concept patches [2] before the BoF session in 2018.
> Subsequently, we posted RFC v1 [3], RFC v2 [4] and RFC v3 [5] of this series. 
> 
> We want to present version 4 of this series, which incorporates the feedback
> we received for v3 & adds support for live migrating the remote process.
> 
> Following people contributed to this patchset:
> 
> John G Johnson 
> Jagannathan Raman 
> Elena Ufimtseva 
> Kanth Ghatraju 
> 
> For full concept writeup about QEMU disaggregation refer to
> docs/devel/qemu-multiprocess.rst. Please refer to 
> docs/qemu-multiprocess.txt for usage information.
> 
> We are planning on making the following improvements in the future:
>  - Performance improvements
>  - Libvirt support
>  - Enforcement of security policies
>  - blockdev support
> 
> We welcome all your ideas, concerns, and questions for this patchset.
> 
> Thank you!
> 
> [1]: 
> http://events17.linuxfoundation.org/sites/events/files/slides/KVM%20FORUM%20multi-process.pdf
> [1]: https://www.youtube.com/watch?v=Kq1-coHh7lg
> [2]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg566538.html
> [3]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg602285.html
> [4]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg624877.html
> [5]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg642000.html
> 
> Elena Ufimtseva (22):
>   multi-process: add a command line option for debug file
>   multi-process: introduce proxy object
>   mutli-process: build remote command line args
>   multi-process: configure remote side devices
>   multi-process: add qdev_proxy_add to create proxy devices
>   multi-process: remote: add setup_devices and setup_drive msg
> processing
>   multi-process: remote: use fd for socket from parent process
>   multi-process: remote: add create_done condition
>   multi-process: add processing of remote drive and device command line
>   multi-process: refractor vl.c code to re-use in remote
>   multi-process: add remote option
>   multi-process: add remote options parser
>   multi-process: add parse_cmdline in remote process
>   multi-process: send heartbeat messages to remote
>   multi-process: handle heartbeat messages in remote process
>   multi-process/mon: choose HMP commands based on target
>   multi-process/mig: Load VMSD in the proxy object
>   multi-process/mig: refactor runstate_check into common file
>   multi-process/mig: Synchronize runstate of remote process
>   multi-process/mig: Restore the VMSD in remote process
>   multi-process: Enable support for multiple devices in remote
>   multi-process: add configure and usage information
> 
> Jagannathan Raman (26):
>   multi-process: memory: alloc RAM from file at offset
>   multi-process: util: Add qemu_thre

Re: [RFC v4 PATCH 00/49] Initial support of multi-process qemu - status update

2019-12-16 Thread Elena Ufimtseva
On Fri, Dec 13, 2019 at 10:41:16AM +, Stefan Hajnoczi wrote:
> On Mon, Dec 09, 2019 at 10:47:17PM -0800, Elena Ufimtseva wrote:
> > At this moment we are working on the first stage of the project with help of
> > the Nutanix developers.
> > The questions we have gathered so far will be addressed with muser
> > and Qemu developers after we finish the first stage and make sure we 
> > understand
> > what it will take for us to move onto the next stage.
> > 
> > We will also incorporate relevant review from Stefan that he provided
> > on the series 4 of the patchset. Thank you Stefan.
> > 
> > If anyone have any further suggestions or questions about the status,
> > please reply to this email.
> 
> Hi Elena,
> At KVM Forum we discussed spending 1 or 2 weeks trying out muser.  A few
> weeks have passed and from your email it sounds like this "next stage"
> might be a lot of work.
>

Hi Stefan

Perhaps we were not too clear about our work in the previous email.
Our assumption was that the question that came from KVM Forum was
if muser can be used to achieve the same what we have now.
We should have answered clearly yes to this question.  We have not yet
discovered major road blocks.
At the moment, we are mostly engaged in learning the code and discussing
the design, plus some coding to answer the specific questions.
We understand that the best way to make a progress is to work with the
upstream community on early stages and we agree with this and will present
the proposal shortly for discussion.
 
> Is there a work-in-progress muser patch series you can post to start the
> discussion early?  That way we can avoid reviewers like myself asking
> you to make changes after you have invested a lot of time.
>

Absolutely, that is our plan. At the moment we do not have the patches
ready for the review. We have setup internally a milestone and will be
sending that early version as a tarball after we have it completed.
Would be also a meeting something that could help us to stay on the same
page?
 
> It's good that you are in touch with the muser developers (via private
> discussion?  I haven't seen much activity on #muser IRC).
>

We use IRC (I know Jag got some answers there) and github for issues
(one of which was addressed). We are hoping to get the conversation going over
the email.

JJ, Jag and Elena 
> Stefan





[RFC 1/8] ioregionfd: introduce a syscall and memory API

2022-02-08 Thread Elena Ufimtseva
Signed-off-by: Elena Ufimtseva 
---
 include/exec/memory.h |  50 +++
 include/sysemu/kvm.h  |  15 +
 linux-headers/linux/kvm.h |  25 
 accel/kvm/kvm-all.c   | 132 ++
 accel/stubs/kvm-stub.c|   1 +
 5 files changed, 223 insertions(+)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 20f1b27377..2ce7f35cc2 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -712,6 +712,7 @@ void 
ram_discard_manager_unregister_listener(RamDiscardManager *rdm,
 
 typedef struct CoalescedMemoryRange CoalescedMemoryRange;
 typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd;
+typedef struct MemoryRegionIoregionfd MemoryRegionIoregionfd;
 
 /** MemoryRegion:
  *
@@ -756,6 +757,8 @@ struct MemoryRegion {
 const char *name;
 unsigned ioeventfd_nb;
 MemoryRegionIoeventfd *ioeventfds;
+unsigned ioregionfd_nb;
+MemoryRegionIoregionfd *ioregionfds;
 RamDiscardManager *rdm; /* Only for RAM */
 };
 
@@ -974,6 +977,38 @@ struct MemoryListener {
  */
 void (*eventfd_del)(MemoryListener *listener, MemoryRegionSection *section,
 bool match_data, uint64_t data, EventNotifier *e);
+/**
+ * @ioregionfd_add:
+ *
+ * Called during an address space update transaction,
+ * for a section of the address space that has had a new ioregionfd
+ * registration since the last transaction.
+ *
+ * @listener: The #MemoryListener.
+ * @section: The new #MemoryRegionSection.
+ * @data: The @data parameter for the new ioregionfd.
+ * @fd: The file descriptor parameter for the new ioregionfd.
+ */
+void (*ioregionfd_add)(MemoryListener *listener,
+   MemoryRegionSection *section,
+   uint64_t data, int fd);
+
+/**
+ * @ioregionfd_del:
+ *
+ * Called during an address space update transaction,
+ * for a section of the address space that has dropped an ioregionfd
+ * registration since the last transaction.
+ *
+ * @listener: The #MemoryListener.
+ * @section: The new #MemoryRegionSection.
+ * @data: The @data parameter for the dropped ioregionfd.
+ * @fd: The file descriptor parameter for the dropped ioregionfd.
+ */
+void (*ioregionfd_del)(MemoryListener *listener,
+   MemoryRegionSection *section,
+   uint64_t data, int fd);
+
 
 /**
  * @coalesced_io_add:
@@ -1041,6 +1076,8 @@ struct AddressSpace {
 
 int ioeventfd_nb;
 struct MemoryRegionIoeventfd *ioeventfds;
+int ioregionfd_nb;
+struct MemoryRegionIoregionfd *ioregionfds;
 QTAILQ_HEAD(, MemoryListener) listeners;
 QTAILQ_ENTRY(AddressSpace) address_spaces_link;
 };
@@ -2175,6 +2212,19 @@ void memory_region_del_eventfd(MemoryRegion *mr,
uint64_t data,
EventNotifier *e);
 
+void memory_region_add_ioregionfd(MemoryRegion *mr,
+  hwaddr addr,
+  unsigned size,
+  uint64_t data,
+  int fd,
+  bool pio);
+
+void memory_region_del_ioregionfd(MemoryRegion *mr,
+  hwaddr addr,
+  unsigned size,
+  uint64_t data,
+  int fd);
+
 /**
  * memory_region_add_subregion: Add a subregion to a container.
  *
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 7b22aeb6ae..fea77b5185 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -46,6 +46,7 @@ extern bool kvm_readonly_mem_allowed;
 extern bool kvm_direct_msi_allowed;
 extern bool kvm_ioeventfd_any_length_allowed;
 extern bool kvm_msi_use_devid;
+extern bool kvm_ioregionfds_allowed;
 
 #define kvm_enabled()   (kvm_allowed)
 /**
@@ -167,6 +168,15 @@ extern bool kvm_msi_use_devid;
  */
 #define kvm_msi_devid_required() (kvm_msi_use_devid)
 
+/**
+ * kvm_ioregionfds_enabled:
+ *
+ * Returns: true if we can use ioregionfd to receive the MMIO/PIO
+ * dispatches from KVM (ie the kernel supports ioregionfd and we are running
+ * with a configuration where it is meaningful to use them).
+ */
+#define kvm_ioregionfds_enabled() (kvm_ioregionfds_allowed)
+
 #else
 
 #define kvm_enabled()   (0)
@@ -184,12 +194,14 @@ extern bool kvm_msi_use_devid;
 #define kvm_direct_msi_enabled() (false)
 #define kvm_ioeventfd_any_length_enabled() (false)
 #define kvm_msi_devid_required() (false)
+#define kvm_ioregionfds_enabled (false)
 
 #endif  /* CONFIG_KVM_IS_POSSIBLE */
 
 struct kvm_run;
 struct kvm_lapic_state;
 struct kvm_irq_routing_entry;
+struct kvm_ioregion;
 
 typedef struct KVMCapabilityInfo {
 const char *name;
@@ -548,4 +560,7 @@ bool kvm_cpu_check_are_resettable(void);
 bool

[RFC 2/8] multiprocess: place RemoteObject definition in a header file

2022-02-08 Thread Elena Ufimtseva
This will be needed later. No functional changes.

Signed-off-by: Elena Ufimtseva 
---
 include/hw/remote/remote.h | 28 
 hw/remote/remote-obj.c | 16 +---
 MAINTAINERS|  1 +
 3 files changed, 30 insertions(+), 15 deletions(-)
 create mode 100644 include/hw/remote/remote.h

diff --git a/include/hw/remote/remote.h b/include/hw/remote/remote.h
new file mode 100644
index 00..a2d23178b9
--- /dev/null
+++ b/include/hw/remote/remote.h
@@ -0,0 +1,28 @@
+/*
+ * RemoteObject header.
+ *
+ * Copyright © 2018, 2022 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+#ifndef REMOTE_H
+#define REMOTE_H
+
+struct RemoteObject {
+/* private */
+Object parent;
+
+Notifier machine_done;
+
+int32_t fd;
+char *devid;
+
+QIOChannel *ioc;
+
+DeviceState *dev;
+DeviceListener listener;
+};
+
+#endif
diff --git a/hw/remote/remote-obj.c b/hw/remote/remote-obj.c
index 4f21254219..f0da696662 100644
--- a/hw/remote/remote-obj.c
+++ b/hw/remote/remote-obj.c
@@ -23,6 +23,7 @@
 #include "hw/pci/pci.h"
 #include "qemu/sockets.h"
 #include "monitor/monitor.h"
+#include "hw/remote/remote.h"
 
 #define TYPE_REMOTE_OBJECT "x-remote-object"
 OBJECT_DECLARE_TYPE(RemoteObject, RemoteObjectClass, REMOTE_OBJECT)
@@ -34,21 +35,6 @@ struct RemoteObjectClass {
 unsigned int max_devs;
 };
 
-struct RemoteObject {
-/* private */
-Object parent;
-
-Notifier machine_done;
-
-int32_t fd;
-char *devid;
-
-QIOChannel *ioc;
-
-DeviceState *dev;
-DeviceListener listener;
-};
-
 static void remote_object_set_fd(Object *obj, const char *str, Error **errp)
 {
 RemoteObject *o = REMOTE_OBJECT(obj);
diff --git a/MAINTAINERS b/MAINTAINERS
index 7543eb4d59..3c60a29760 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3453,6 +3453,7 @@ F: hw/remote/proxy-memory-listener.c
 F: include/hw/remote/proxy-memory-listener.h
 F: hw/remote/iohub.c
 F: include/hw/remote/iohub.h
+F: include/hw/remote/remote.h
 
 EBPF:
 M: Jason Wang 
-- 
2.25.1




[RFC 7/8] multiprocess: add ioregionfd memory region in proxy

2022-02-08 Thread Elena Ufimtseva
Signed-off-by: Elena Ufimtseva 
---
 include/hw/remote/proxy.h |  1 +
 hw/remote/proxy.c | 66 ---
 2 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/include/hw/remote/proxy.h b/include/hw/remote/proxy.h
index 741def71f1..9efef0b935 100644
--- a/include/hw/remote/proxy.h
+++ b/include/hw/remote/proxy.h
@@ -29,6 +29,7 @@ struct PCIProxyDev {
 PCIDevice parent_dev;
 char *fd;
 
+char *ioregfd;
 /*
  * Mutex used to protect the QIOChannel fd from
  * the concurrent access by the VCPUs since proxy
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
index bad164299d..ba1aa20d78 100644
--- a/hw/remote/proxy.c
+++ b/hw/remote/proxy.c
@@ -146,6 +146,33 @@ static void pci_proxy_dev_exit(PCIDevice *pdev)
 event_notifier_cleanup(&dev->resample);
 }
 
+static void config_get_ioregionfd_info(PCIProxyDev *pdev, uint32_t reg_num,
+   uint32_t *val, bool memory)
+{
+MPQemuMsg msg = { 0 };
+Error *local_err = NULL;
+uint64_t ret = -EINVAL;
+
+memset(&msg, 0, sizeof(MPQemuMsg));
+msg.cmd = MPQEMU_CMD_BAR_INFO;
+msg.num_fds = 0;
+msg.data.u64 = (uint64_t)reg_num & MAKE_64BIT_MASK(0, 32);
+
+msg.data.u64 |= memory ? (1ULL << 32) : 0;
+msg.size = sizeof(msg.data.u64);
+
+ret = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);
+if (local_err) {
+error_report_err(local_err);
+error_report("Error while receiving reply from remote about fd");
+}
+if (ret == UINT64_MAX) {
+error_report("Failed to request bar info for %d", reg_num);
+}
+
+*val = (uint32_t)ret;
+}
+
 static void config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val,
int len, unsigned int op)
 {
@@ -198,6 +225,7 @@ static void pci_proxy_write_config(PCIDevice *d, uint32_t 
addr, uint32_t val,
 
 static Property proxy_properties[] = {
 DEFINE_PROP_STRING("fd", PCIProxyDev, fd),
+DEFINE_PROP_STRING("ioregfd", PCIProxyDev, ioregfd),
 DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -297,7 +325,7 @@ const MemoryRegionOps proxy_mr_ops = {
 static void probe_pci_info(PCIDevice *dev, Error **errp)
 {
 PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
-uint32_t orig_val, new_val, base_class, val;
+uint32_t orig_val, new_val, base_class, val, ioregionfd_bar;
 PCIProxyDev *pdev = PCI_PROXY_DEV(dev);
 DeviceClass *dc = DEVICE_CLASS(pc);
 uint8_t type;
@@ -342,6 +370,9 @@ static void probe_pci_info(PCIDevice *dev, Error **errp)
 }
 
 for (i = 0; i < PCI_NUM_REGIONS; i++) {
+bool init_ioregionfd = false;
+int fd = -1;
+
 config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4,
MPQEMU_CMD_PCI_CFGREAD);
 new_val = 0x;
@@ -362,9 +393,36 @@ static void probe_pci_info(PCIDevice *dev, Error **errp)
 if (type == PCI_BASE_ADDRESS_SPACE_MEMORY) {
 pdev->region[i].memory = true;
 }
-memory_region_init_io(&pdev->region[i].mr, OBJECT(pdev),
-  &proxy_mr_ops, &pdev->region[i],
-  name, size);
+#ifdef CONFIG_IOREGIONFD
+/*
+ * Currently, only one fd per device is supported.
+ * TODO: Drop this limit.
+ */
+if (pdev->ioregfd) {
+fd = monitor_fd_param(monitor_cur(), pdev->ioregfd, errp);
+if (fd == -1) {
+error_prepend(errp, "Could not parse ioregionfd fd %s:",
+  pdev->ioregfd);
+}
+
+config_get_ioregionfd_info(pdev, i, &ioregionfd_bar,
+   pdev->region[i].memory);
+if (ioregionfd_bar == i) {
+init_ioregionfd = true;
+}
+}
+#endif
+if (init_ioregionfd) {
+memory_region_init_io(&pdev->region[i].mr, OBJECT(pdev),
+  NULL, &pdev->region[i],
+  name, size);
+memory_region_add_ioregionfd(&pdev->region[i].mr, 0, size, i,
+ fd, false);
+} else {
+memory_region_init_io(&pdev->region[i].mr, OBJECT(pdev),
+  &proxy_mr_ops, &pdev->region[i],
+  name, size);
+}
 pci_register_bar(dev, i, type, &pdev->region[i].mr);
 }
 }
-- 
2.25.1




[RFC 0/8] ioregionfd introduction

2022-02-08 Thread Elena Ufimtseva
This patchset is an RFC version for the ioregionfd implementation
in QEMU. The kernel patches are to be posted with some fixes as a v4.

For this implementation version 3 of the posted kernel patches was user:
https://lore.kernel.org/kvm/cover.1613828726.git.eafanas...@gmail.com/

The future version will include support for vfio/libvfio-user.
Please refer to the design discussion here proposed by Stefan:
https://lore.kernel.org/all/YXpb1f3KicZxj1oj@stefanha-x1.localdomain/T/

The vfio-user version needed some bug-fixing and it was decided to send
this for multiprocess first.

The ioregionfd is configured currently trough the command line and each
ioregionfd represent an object. This allow for easy parsing and does
not require device/remote object command line option modifications.

The following command line can be used to specify ioregionfd:

  '-object', 'x-remote-object,id=robj1,devid=lsi0,fd='+str(remote.fileno()),\
  '-object', 
'ioregionfd-object,id=ioreg2,devid=lsi0,iofd='+str(iord.fileno())+',bar=1',\
  '-object', 
'ioregionfd-object,id=ioreg3,devid=lsi0,iofd='+str(iord.fileno())+',bar=2',\


Proxy side of ioregionfd in this version uses only one file descriptor:

  '-device', 
'x-pci-proxy-dev,id=lsi0,fd='+str(proxy.fileno())+',ioregfd='+str(iowr.fileno()),
 \


This is done for RFC version and my though was that next version will
be for vfio-user, so I have not dedicated much effort to this command
line options.

The multiprocess messaging protocol was extended to support inquiries
by the proxy if device has any ioregionfds.
This RFC implements inquires by proxy about the type of BAR (ioregionfd
or not) and the type of it (memory/io).

Currently there are few limitations in this version of ioregionfd.
 - one ioregionfd per bar, only full bar size is supported;
 - one file descriptor per device for all of its ioregionfds;
 - each remote device runs fd handler for all its BARs in one IOThread;
 - proxy supports only one fd.

Some of these limitations will be dropped in the future version.
This RFC is to acquire the feedback/suggestions from the community
on the general approach.

The quick performance test was done for the remote lsi device with
ioregionfd and without for both mem BARs (1 and 2) with help
of the fio tool:

Random R/W:

 read IOPS  read BW write IOPS   write BW
no ioregionfd8893559KiB/s   890  3561KiB/s
ioregionfd   9383756KiB/s   939  3757KiB/s


Sequential Read and Sequential Write:

 Sequential readSequential write
 read IOPS  read BW write IOPS   write BW

no ioregionfd367k   1434MiB/s   76k  297MiB/s
ioregionfd   374k   1459MiB/s   77.3k302MiB/s


Please review and send your feedback.

Thank you!
Elena

Elena Ufimtseva (8):
  ioregionfd: introduce a syscall and memory API
  multiprocess: place RemoteObject definition in a header file
  ioregionfd: introduce memory API functions
  ioregionfd: Introduce IORegionDFObject type
  multiprocess: prepare ioregionfds for remote device
  multiprocess: add MPQEMU_CMD_BAR_INFO
  multiprocess: add ioregionfd memory region in proxy
  multiprocess: handle ioregionfd commands

 meson.build |  15 +-
 qapi/qom.json   |  32 ++-
 include/exec/memory.h   |  50 +
 include/hw/remote/ioregionfd.h  |  45 
 include/hw/remote/machine.h |   1 +
 include/hw/remote/mpqemu-link.h |   2 +
 include/hw/remote/proxy.h   |   1 +
 include/hw/remote/remote.h  |  31 +++
 include/sysemu/kvm.h|  15 ++
 linux-headers/ioregionfd.h  |  30 +++
 linux-headers/linux/kvm.h   |  25 +++
 accel/kvm/kvm-all.c | 132 
 accel/stubs/kvm-stub.c  |   1 +
 hw/remote/ioregionfd.c  | 361 
 hw/remote/message.c |  38 
 hw/remote/proxy.c   |  66 +-
 hw/remote/remote-obj.c  | 154 --
 softmmu/memory.c| 207 ++
 Kconfig.host|   3 +
 MAINTAINERS |   3 +
 hw/remote/Kconfig   |   4 +
 hw/remote/meson.build   |   1 +
 meson_options.txt   |   2 +
 scripts/meson-buildoptions.sh   |   3 +
 24 files changed, 1199 insertions(+), 23 deletions(-)
 create mode 100644 include/hw/remote/ioregionfd.h
 create mode 100644 include/hw/remote/remote.h
 create mode 100644 linux-headers/ioregionfd.h
 create mode 100644 hw/remote/ioregionfd.c

-- 
2.25.1




[RFC 8/8] multiprocess: handle ioregionfd commands

2022-02-08 Thread Elena Ufimtseva
Signed-off-by: Elena Ufimtseva 
---
 include/hw/remote/ioregionfd.h |   2 +
 include/hw/remote/remote.h |   2 +
 linux-headers/ioregionfd.h |  30 +
 hw/remote/ioregionfd.c | 111 +
 hw/remote/remote-obj.c |  44 +
 5 files changed, 189 insertions(+)
 create mode 100644 linux-headers/ioregionfd.h

diff --git a/include/hw/remote/ioregionfd.h b/include/hw/remote/ioregionfd.h
index 66bb459f76..8021eed6f1 100644
--- a/include/hw/remote/ioregionfd.h
+++ b/include/hw/remote/ioregionfd.h
@@ -40,4 +40,6 @@ typedef struct IORegionFDObject IORegionFDObject;
 GSList *ioregionfd_get_obj_list(void);
 IORegionFD *ioregionfd_get_by_bar(GSList *list, uint32_t bar);
 void ioregionfd_set_bar_type(GSList *list, uint32_t bar, bool memory);
+int qio_channel_ioregionfd_read(QIOChannel *ioc, gpointer opaque,
+Error **errp);
 #endif /* IOREGIONFD_H */
diff --git a/include/hw/remote/remote.h b/include/hw/remote/remote.h
index 46390c7934..53b570e1ac 100644
--- a/include/hw/remote/remote.h
+++ b/include/hw/remote/remote.h
@@ -23,6 +23,8 @@ struct RemoteObject {
 
 DeviceState *dev;
 DeviceListener listener;
+QIOChannel *ioregfd_ioc;
+AioContext *ioregfd_ctx;
 GHashTable *ioregionfd_hash;
 };
 
diff --git a/linux-headers/ioregionfd.h b/linux-headers/ioregionfd.h
new file mode 100644
index 00..58f9b5ba61
--- /dev/null
+++ b/linux-headers/ioregionfd.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: ((GPL-2.0-only WITH Linux-syscall-note) OR 
BSD-3-Clause) */
+#ifndef _UAPI_LINUX_IOREGION_H
+#define _UAPI_LINUX_IOREGION_H
+
+/* Wire protocol */
+
+struct ioregionfd_cmd {
+   __u8 cmd;
+   __u8 size_exponent : 4;
+   __u8 resp : 1;
+   __u8 padding[6];
+   __u64 user_data;
+   __u64 offset;
+   __u64 data;
+};
+
+struct ioregionfd_resp {
+   __u64 data;
+   __u8 pad[24];
+};
+
+#define IOREGIONFD_CMD_READ0
+#define IOREGIONFD_CMD_WRITE   1
+
+#define IOREGIONFD_SIZE_8BIT   0
+#define IOREGIONFD_SIZE_16BIT  1
+#define IOREGIONFD_SIZE_32BIT  2
+#define IOREGIONFD_SIZE_64BIT  3
+
+#endif
diff --git a/hw/remote/ioregionfd.c b/hw/remote/ioregionfd.c
index 1d371357c6..dd04c39e25 100644
--- a/hw/remote/ioregionfd.c
+++ b/hw/remote/ioregionfd.c
@@ -26,6 +26,7 @@
 #include "hw/pci/pci.h"
 #include "qapi/qapi-visit-qom.h"
 #include "hw/remote/remote.h"
+#include "ioregionfd.h"
 
 #define TYPE_IOREGIONFD_OBJECT "ioregionfd-object"
 OBJECT_DECLARE_TYPE(IORegionFDObject, IORegionFDObjectClass, IOREGIONFD_OBJECT)
@@ -91,6 +92,116 @@ void ioregionfd_set_bar_type(GSList *list, uint32_t bar, 
bool memory)
 }
 }
 
+int qio_channel_ioregionfd_read(QIOChannel *ioc, gpointer opaque,
+Error **errp)
+{
+struct RemoteObject *o = (struct RemoteObject *)opaque;
+struct ioregionfd_cmd cmd = {};
+struct iovec iov = {
+.iov_base = &cmd,
+.iov_len = sizeof(struct ioregionfd_cmd),
+};
+IORegionFDObject *ioregfd_obj;
+PCIDevice *pci_dev;
+hwaddr addr;
+struct ioregionfd_resp resp = {};
+int bar = 0;
+Error *local_err = NULL;
+uint64_t val = UINT64_MAX;
+AddressSpace *as;
+int ret = -EINVAL;
+
+ERRP_GUARD();
+
+if (!ioc) {
+return -EINVAL;
+}
+ret = qio_channel_readv_full(ioc, &iov, 1, NULL, 0, &local_err);
+
+if (ret == QIO_CHANNEL_ERR_BLOCK) {
+return -EINVAL;
+}
+
+if (ret <= 0) {
+/* read error or other side closed connection */
+if (local_err) {
+error_report_err(local_err);
+}
+error_setg(errp, "ioregionfd receive error");
+return -EINVAL;
+}
+
+bar = cmd.user_data;
+pci_dev = PCI_DEVICE(o->dev);
+addr = (hwaddr)(pci_get_bar_addr(pci_dev, bar) + cmd.offset);
+IORegionFDObject key = {.ioregfd = {.bar = bar} };
+ioregfd_obj = g_hash_table_lookup(o->ioregionfd_hash, &key);
+
+if (!ioregfd_obj) {
+error_setg(errp, "Could not find IORegionFDObject");
+return -EINVAL;
+}
+if (ioregfd_obj->ioregfd.memory) {
+as = &address_space_memory;
+} else {
+as = &address_space_io;
+}
+
+if (ret > 0 && pci_dev) {
+switch (cmd.cmd) {
+case IOREGIONFD_CMD_READ:
+ret = address_space_rw(as, addr, MEMTXATTRS_UNSPECIFIED,
+   (void *)&val, 1 << cmd.size_exponent,
+   false);
+if (ret != MEMTX_OK) {
+ret = -EINVAL;
+error_setg(errp, "Bad address %"PRIx64" in mem read", addr);
+val = UINT64_MAX;
+}
+
+memset(&resp, 0, sizeof(resp));
+resp.data = val;
+if (qio_channel_write_all(ioc, (char *)&a

[RFC 3/8] ioregionfd: introduce memory API functions

2022-02-08 Thread Elena Ufimtseva
Similar to ioeventfd, introduce the ioregionfd
functions to add and delete ioregionfds.

Signed-off-by: Elena Ufimtseva 
---
 softmmu/memory.c | 207 +++
 1 file changed, 207 insertions(+)

diff --git a/softmmu/memory.c b/softmmu/memory.c
index 7340e19ff5..3618c5d1cf 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -40,6 +40,7 @@ static unsigned memory_region_transaction_depth;
 static bool memory_region_update_pending;
 static bool ioeventfd_update_pending;
 unsigned int global_dirty_tracking;
+static bool ioregionfd_update_pending;
 
 static QTAILQ_HEAD(, MemoryListener) memory_listeners
 = QTAILQ_HEAD_INITIALIZER(memory_listeners);
@@ -170,6 +171,13 @@ struct MemoryRegionIoeventfd {
 EventNotifier *e;
 };
 
+struct MemoryRegionIoregionfd {
+AddrRange addr;
+uint64_t data;
+int fd;
+bool pio;
+};
+
 static bool memory_region_ioeventfd_before(MemoryRegionIoeventfd *a,
MemoryRegionIoeventfd *b)
 {
@@ -214,6 +222,33 @@ static bool 
memory_region_ioeventfd_equal(MemoryRegionIoeventfd *a,
 return false;
 }
 
+static bool memory_region_ioregionfd_before(MemoryRegionIoregionfd *a,
+   MemoryRegionIoregionfd *b)
+{
+if (int128_lt(a->addr.start, b->addr.start)) {
+return true;
+} else if (int128_gt(a->addr.start, b->addr.start)) {
+return false;
+} else if (int128_lt(a->addr.size, b->addr.size)) {
+return true;
+} else if (int128_gt(a->addr.size, b->addr.size)) {
+return false;
+}
+return false;
+}
+
+static bool memory_region_ioregionfd_equal(MemoryRegionIoregionfd *a,
+  MemoryRegionIoregionfd *b)
+{
+if (int128_eq(a->addr.start, b->addr.start) &&
+(!int128_nz(a->addr.size) || !int128_nz(b->addr.size) ||
+ (int128_eq(a->addr.size, b->addr.size) &&
+  (a->fd == b->fd
+return true;
+
+return false;
+}
+
 /* Range of memory in the global map.  Addresses are absolute. */
 struct FlatRange {
 MemoryRegion *mr;
@@ -800,6 +835,52 @@ static void address_space_add_del_ioeventfds(AddressSpace 
*as,
 }
 }
 
+static void address_space_add_del_ioregionfds(AddressSpace *as,
+  MemoryRegionIoregionfd *fds_new,
+  unsigned fds_new_nb,
+  MemoryRegionIoregionfd *fds_old,
+  unsigned fds_old_nb)
+{
+unsigned iold, inew;
+MemoryRegionIoregionfd *fd;
+MemoryRegionSection section;
+
+iold = inew = 0;
+while (iold < fds_old_nb || inew < fds_new_nb) {
+if (iold < fds_old_nb
+&& (inew == fds_new_nb
+|| memory_region_ioregionfd_before(&fds_old[iold],
+  &fds_new[inew]))) {
+fd = &fds_old[iold];
+section = (MemoryRegionSection) {
+.fv = address_space_to_flatview(as),
+.offset_within_address_space = int128_get64(fd->addr.start),
+.size = fd->addr.size,
+};
+MEMORY_LISTENER_CALL(as, ioregionfd_del, Forward, §ion,
+ fd->data, fd->fd);
+++iold;
+
+} else if (inew < fds_new_nb
+   && (iold == fds_old_nb
+   || memory_region_ioregionfd_before(&fds_new[inew],
+ &fds_old[iold]))) {
+fd = &fds_new[inew];
+section = (MemoryRegionSection) {
+.fv = address_space_to_flatview(as),
+.offset_within_address_space = int128_get64(fd->addr.start),
+.size = fd->addr.size,
+};
+MEMORY_LISTENER_CALL(as, ioregionfd_add, Reverse, §ion,
+ fd->data, fd->fd);
+++inew;
+} else {
+++iold;
+++inew;
+}
+}
+}
+
 FlatView *address_space_get_flatview(AddressSpace *as)
 {
 FlatView *view;
@@ -814,6 +895,52 @@ FlatView *address_space_get_flatview(AddressSpace *as)
 return view;
 }
 
+static void address_space_update_ioregionfds(AddressSpace *as)
+{
+FlatView *view;
+FlatRange *fr;
+unsigned ioregionfd_nb = 0;
+unsigned ioregionfd_max;
+MemoryRegionIoregionfd *ioregionfds;
+AddrRange tmp;
+unsigned i;
+
+/*
+ * It is likely that the number of ioregionfds hasn't changed much, so use
+ * the previous size as the starting value, with some headroom to avoid
+ * gratuitous reallocations.
+ */
+ioregionfd_max = QEMU_ALIGN_UP(as->ioregionfd_nb, 4);
+ioregionfds = g_new(MemoryR

[RFC 4/8] ioregionfd: Introduce IORegionDFObject type

2022-02-08 Thread Elena Ufimtseva
Signed-off-by: Elena Ufimtseva 
---
 meson.build|  15 ++-
 qapi/qom.json  |  32 +-
 include/hw/remote/ioregionfd.h |  40 +++
 hw/remote/ioregionfd.c | 196 +
 Kconfig.host   |   3 +
 MAINTAINERS|   2 +
 hw/remote/Kconfig  |   4 +
 hw/remote/meson.build  |   1 +
 meson_options.txt  |   2 +
 scripts/meson-buildoptions.sh  |   3 +
 10 files changed, 294 insertions(+), 4 deletions(-)
 create mode 100644 include/hw/remote/ioregionfd.h
 create mode 100644 hw/remote/ioregionfd.c

diff --git a/meson.build b/meson.build
index 96de1a6ef9..6483e754bd 100644
--- a/meson.build
+++ b/meson.build
@@ -258,6 +258,17 @@ if targetos != 'linux' and 
get_option('multiprocess').enabled()
 endif
 multiprocess_allowed = targetos == 'linux' and not 
get_option('multiprocess').disabled()
 
+# TODO: drop this limitation
+if not multiprocess_allowed and not get_option('ioregionfd').disabled()
+  error('To enable ioregiofd support, enable mutliprocess option.')
+endif
+ioregionfd_allowed = multiprocess_allowed and not 
get_option('ioregionfd').disabled()
+if ioregionfd_allowed
+config_host += { 'CONFIG_IOREGIONFD': 'y' }
+else
+config_host += { 'CONFIG_IOREGIONFD': 'n' }
+endif
+
 libm = cc.find_library('m', required: false)
 threads = dependency('threads')
 util = cc.find_library('util', required: false)
@@ -1837,7 +1848,8 @@ host_kconfig = \
   (have_virtfs ? ['CONFIG_VIRTFS=y'] : []) + \
   ('CONFIG_LINUX' in config_host ? ['CONFIG_LINUX=y'] : []) + \
   ('CONFIG_PVRDMA' in config_host ? ['CONFIG_PVRDMA=y'] : []) + \
-  (multiprocess_allowed ? ['CONFIG_MULTIPROCESS_ALLOWED=y'] : [])
+  (multiprocess_allowed ? ['CONFIG_MULTIPROCESS_ALLOWED=y'] : []) + \
+  (ioregionfd_allowed ? ['CONFIG_IOREGIONFD=y'] : [])
 
 ignored = [ 'TARGET_XML_FILES', 'TARGET_ABI_DIR', 'TARGET_ARCH' ]
 
@@ -3315,6 +3327,7 @@ summary_info += {'target list':   ' 
'.join(target_dirs)}
 if have_system
   summary_info += {'default devices':   get_option('default_devices')}
   summary_info += {'out of process emulation': multiprocess_allowed}
+  summary_info += {'ioregionfd support': ioregionfd_allowed}
 endif
 summary(summary_info, bool_yn: true, section: 'Targets and accelerators')
 
diff --git a/qapi/qom.json b/qapi/qom.json
index eeb5395ff3..439fb94c93 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -689,6 +689,29 @@
 'data': { 'chardev': 'str',
   '*log': 'str' } }
 
+##
+# @IORegionFDObjectProperties:
+#
+# Describes ioregionfd for the device
+#
+# @devid: the id of the device to be associated with the ioregionfd
+#
+# @iofd: File descriptor
+#
+# @bar: BAR number to use with ioregionfd
+#
+# @start: offset from the BAR start address of ioregionfd
+#
+# @size: size of the ioregionfd
+##
+# Since: 2.9
+{ 'struct': 'IORegionFDObjectProperties',
+  'data': { 'devid': 'str',
+'iofd': 'str',
+'bar': 'int',
+'*start': 'int',
+'*size':'int' } }
+
 ##
 # @RemoteObjectProperties:
 #
@@ -842,8 +865,10 @@
 'tls-creds-psk',
 'tls-creds-x509',
 'tls-cipher-suites',
-{ 'name': 'x-remote-object', 'features': [ 'unstable' ] }
-  ] }
+{ 'name': 'x-remote-object', 'features': [ 'unstable' ] },
+{ 'name' :'ioregionfd-object',
+  'if': 'CONFIG_IOREGIONFD' }
+ ] }
 
 ##
 # @ObjectOptions:
@@ -905,7 +930,8 @@
   'tls-creds-psk':  'TlsCredsPskProperties',
   'tls-creds-x509': 'TlsCredsX509Properties',
   'tls-cipher-suites':  'TlsCredsProperties',
-  'x-remote-object':'RemoteObjectProperties'
+  'x-remote-object':'RemoteObjectProperties',
+  'ioregionfd-object':  'IORegionFDObjectProperties'
   } }
 
 ##
diff --git a/include/hw/remote/ioregionfd.h b/include/hw/remote/ioregionfd.h
new file mode 100644
index 00..c8a8b32ee0
--- /dev/null
+++ b/include/hw/remote/ioregionfd.h
@@ -0,0 +1,40 @@
+/*
+ * Ioregionfd headers
+ *
+ * Copyright © 2018, 2022 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPY

[RFC 6/8] multiprocess: add MPQEMU_CMD_BAR_INFO

2022-02-08 Thread Elena Ufimtseva
This command is used to request the bar type info from
remote device.

Signed-off-by: Elena Ufimtseva 
---
 include/hw/remote/ioregionfd.h  |  2 ++
 include/hw/remote/machine.h |  1 +
 include/hw/remote/mpqemu-link.h |  2 ++
 hw/remote/ioregionfd.c  | 28 
 hw/remote/message.c | 38 +
 hw/remote/remote-obj.c  |  1 +
 6 files changed, 72 insertions(+)

diff --git a/include/hw/remote/ioregionfd.h b/include/hw/remote/ioregionfd.h
index 85a2ef2c4f..66bb459f76 100644
--- a/include/hw/remote/ioregionfd.h
+++ b/include/hw/remote/ioregionfd.h
@@ -38,4 +38,6 @@ struct IORegionFDObject {
 typedef struct IORegionFDObject IORegionFDObject;
 
 GSList *ioregionfd_get_obj_list(void);
+IORegionFD *ioregionfd_get_by_bar(GSList *list, uint32_t bar);
+void ioregionfd_set_bar_type(GSList *list, uint32_t bar, bool memory);
 #endif /* IOREGIONFD_H */
diff --git a/include/hw/remote/machine.h b/include/hw/remote/machine.h
index 2a2a33c4b2..71c53ba0d7 100644
--- a/include/hw/remote/machine.h
+++ b/include/hw/remote/machine.h
@@ -28,6 +28,7 @@ struct RemoteMachineState {
 typedef struct RemoteCommDev {
 PCIDevice *dev;
 QIOChannel *ioc;
+GSList *ioregions_list;
 } RemoteCommDev;
 
 #define TYPE_REMOTE_MACHINE "x-remote-machine"
diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h
index 4ec0915885..be546e4586 100644
--- a/include/hw/remote/mpqemu-link.h
+++ b/include/hw/remote/mpqemu-link.h
@@ -17,6 +17,7 @@
 #include "exec/hwaddr.h"
 #include "io/channel-socket.h"
 #include "hw/remote/proxy.h"
+#include "hw/remote/ioregionfd.h"
 
 #define REMOTE_MAX_FDS 8
 
@@ -41,6 +42,7 @@ typedef enum {
 MPQEMU_CMD_BAR_READ,
 MPQEMU_CMD_SET_IRQFD,
 MPQEMU_CMD_DEVICE_RESET,
+MPQEMU_CMD_BAR_INFO,
 MPQEMU_CMD_MAX,
 } MPQemuCmd;
 
diff --git a/hw/remote/ioregionfd.c b/hw/remote/ioregionfd.c
index 85ec0f7d38..1d371357c6 100644
--- a/hw/remote/ioregionfd.c
+++ b/hw/remote/ioregionfd.c
@@ -63,6 +63,34 @@ GSList *ioregionfd_get_obj_list(void)
 return list;
 }
 
+IORegionFD *ioregionfd_get_by_bar(GSList *list, uint32_t bar)
+{
+IORegionFDObject *ioregionfd;
+GSList *elem;
+
+for (elem = list; elem; elem = elem->next) {
+ioregionfd = elem->data;
+
+if (ioregionfd->ioregfd.bar == bar) {
+return &ioregionfd->ioregfd;
+}
+}
+return NULL;
+}
+
+void ioregionfd_set_bar_type(GSList *list, uint32_t bar, bool memory)
+{
+IORegionFDObject *ioregionfd;
+GSList *elem;
+
+for (elem = list; elem; elem = elem->next) {
+ioregionfd = elem->data;
+if (ioregionfd->ioregfd.bar == bar) {
+ioregionfd->ioregfd.memory = memory;
+}
+}
+}
+
 static void ioregionfd_object_init(Object *obj)
 {
 IORegionFDObjectClass *k = IOREGIONFD_OBJECT_GET_CLASS(obj);
diff --git a/hw/remote/message.c b/hw/remote/message.c
index 11d729845c..a8fb9764ba 100644
--- a/hw/remote/message.c
+++ b/hw/remote/message.c
@@ -29,6 +29,8 @@ static void process_bar_write(QIOChannel *ioc, MPQemuMsg 
*msg, Error **errp);
 static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
 static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev,
  Error **errp);
+static void process_device_get_reg_info(QIOChannel *ioc, RemoteCommDev *com,
+MPQemuMsg *msg, Error **errp);
 
 void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
 {
@@ -75,6 +77,9 @@ void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
 case MPQEMU_CMD_DEVICE_RESET:
 process_device_reset_msg(com->ioc, pci_dev, &local_err);
 break;
+case MPQEMU_CMD_BAR_INFO:
+process_device_get_reg_info(com->ioc, com, &msg, &local_err);
+break;
 default:
 error_setg(&local_err,
"Unknown command (%d) received for device %s"
@@ -91,6 +96,39 @@ void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
 }
 }
 
+static void process_device_get_reg_info(QIOChannel *ioc, RemoteCommDev *com,
+MPQemuMsg *msg, Error **errp)
+{
+ERRP_GUARD();
+uint32_t bar = (uint32_t)(msg->data.u64 & MAKE_64BIT_MASK(0, 32));
+bool memory;
+
+memory = (msg->data.u64 && MAKE_64BIT_MASK(32, 32)) == 1 ?  true : false;
+
+IORegionFD *ioregfd;
+MPQemuMsg ret = { 0 };
+
+error_report("Bar is %d, mem %s", bar, memory ? "true" : "false");
+
+memset(&ret, 0, sizeof(MPQemuMsg));
+ret.cmd = MPQEMU_CMD_RET;
+ret.size = sizeof(ret.data.u64);
+
+ioregfd = ioregionfd_get_by_bar(com->ioregions_list, bar);
+if (ioregfd) {
+ret.data.u64 = ioregfd->bar;
+if (ioregfd-

[RFC 5/8] multiprocess: prepare ioregionfds for remote device

2022-02-08 Thread Elena Ufimtseva
Signed-off-by: Elena Ufimtseva 
---
 include/hw/remote/ioregionfd.h |  1 +
 include/hw/remote/remote.h |  1 +
 hw/remote/ioregionfd.c | 26 ++
 hw/remote/remote-obj.c | 93 ++
 4 files changed, 121 insertions(+)

diff --git a/include/hw/remote/ioregionfd.h b/include/hw/remote/ioregionfd.h
index c8a8b32ee0..85a2ef2c4f 100644
--- a/include/hw/remote/ioregionfd.h
+++ b/include/hw/remote/ioregionfd.h
@@ -37,4 +37,5 @@ struct IORegionFDObject {
 
 typedef struct IORegionFDObject IORegionFDObject;
 
+GSList *ioregionfd_get_obj_list(void);
 #endif /* IOREGIONFD_H */
diff --git a/include/hw/remote/remote.h b/include/hw/remote/remote.h
index a2d23178b9..46390c7934 100644
--- a/include/hw/remote/remote.h
+++ b/include/hw/remote/remote.h
@@ -23,6 +23,7 @@ struct RemoteObject {
 
 DeviceState *dev;
 DeviceListener listener;
+GHashTable *ioregionfd_hash;
 };
 
 #endif
diff --git a/hw/remote/ioregionfd.c b/hw/remote/ioregionfd.c
index ae95f702a6..85ec0f7d38 100644
--- a/hw/remote/ioregionfd.c
+++ b/hw/remote/ioregionfd.c
@@ -37,6 +37,32 @@ struct IORegionFDObjectClass {
 unsigned int max_ioregfds;
 };
 
+static int ioregionfd_obj_list(Object *obj, void *opaque)
+{
+GSList **list = opaque;
+
+if (object_dynamic_cast(obj, TYPE_IOREGIONFD_OBJECT)) {
+*list = g_slist_append(*list, obj);
+}
+
+object_child_foreach(obj, ioregionfd_obj_list, opaque);
+return 0;
+}
+
+/*
+ * inquire ioregionfd objects and link them into the list which is
+ * returned to the caller.
+ *
+ * Caller must free the list.
+ */
+GSList *ioregionfd_get_obj_list(void)
+{
+GSList *list = NULL;
+
+object_child_foreach(object_get_root(), ioregionfd_obj_list, &list);
+return list;
+}
+
 static void ioregionfd_object_init(Object *obj)
 {
 IORegionFDObjectClass *k = IOREGIONFD_OBJECT_GET_CLASS(obj);
diff --git a/hw/remote/remote-obj.c b/hw/remote/remote-obj.c
index f0da696662..9bb61c3a2d 100644
--- a/hw/remote/remote-obj.c
+++ b/hw/remote/remote-obj.c
@@ -24,6 +24,10 @@
 #include "qemu/sockets.h"
 #include "monitor/monitor.h"
 #include "hw/remote/remote.h"
+#include "hw/remote/ioregionfd.h"
+#include "qemu/cutils.h"
+#include "qapi/qapi-visit-qom.h"
+#include "qapi/string-output-visitor.h"
 
 #define TYPE_REMOTE_OBJECT "x-remote-object"
 OBJECT_DECLARE_TYPE(RemoteObject, RemoteObjectClass, REMOTE_OBJECT)
@@ -74,6 +78,80 @@ static void remote_object_unrealize_listener(DeviceListener 
*listener,
 }
 }
 
+static GSList *ioregions_list;
+
+static unsigned int ioregionfd_bar_hash(const void *key)
+{
+const IORegionFDObject *o = key;
+
+return g_int_hash(&o->ioregfd.bar);
+}
+
+/* TODO: allow for multiple ioregionfds per BAR. */
+static gboolean ioregionfd_bar_equal(const void *a, const void *b)
+{
+const IORegionFDObject *oa = a;
+const IORegionFDObject *ob = b;
+
+error_report("BARS comparing %d %d", oa->ioregfd.bar, ob->ioregfd.bar);
+if (oa->ioregfd.bar == ob->ioregfd.bar) {
+return TRUE;
+}
+return FALSE;
+}
+
+static void ioregionfd_prepare_for_dev(RemoteObject *o, PCIDevice *dev)
+{
+IORegionFDObject *ioregfd_obj = NULL;
+GSList *obj_list, *list;
+
+list = ioregionfd_get_obj_list();
+
+o->ioregionfd_hash = g_hash_table_new(ioregionfd_bar_hash,
+   ioregionfd_bar_equal);
+
+for (obj_list = list; obj_list; obj_list = obj_list->next) {
+ioregfd_obj = obj_list->data;
+if (strcmp(ioregfd_obj->ioregfd.devid, o->devid) != 0) {
+list = g_slist_remove(list, ioregfd_obj);
+error_report("No my dev remove");
+continue;
+}
+if (!g_hash_table_add(o->ioregionfd_hash, ioregfd_obj)) {
+error_report("Cannot use more than one ioregionfd per bar");
+list = g_slist_remove(list, ioregfd_obj);
+object_unparent(OBJECT(ioregfd_obj));
+} else {
+error_report("Added to hash");
+}
+}
+
+if (!list) {
+error_report("Remote device %s will not have ioregionfds.",
+ o->devid);
+goto fatal;
+}
+
+/*
+ * Take first element in the list of ioregions and use its fd
+ * for all regions for this device.
+ * TODO: make this more flexible and allow different fd for the
+ * device.
+ */
+ioregfd_obj = list->data;
+
+/* This is default and will be changed when proxy requests region info. */
+ioregfd_obj->ioregfd.memory = true;
+
+ioregions_list = list;
+return;
+
+ fatal:
+g_slist_free(list);
+g_hash_table_destroy(o->ioregionfd_hash);
+return;
+}
+
 static void remote_object_machine_done(Notifier *notifier, void *data)
 {
 RemoteObject *o = contai

[PATCH RFC 0/2] migration: introduce strict SLA

2024-06-21 Thread Elena Ufimtseva
Hello

This RFC patchset introduces strict downtime SLA for live migration by
restricting how long switchover phase can take and aborts live migration
if this exceeded.

Various consumers of VFIO Live Migration are bound checks on how long
the switchover process lasts. Some things are not accounted for and are
unbounded, such as:
  - Time to quiesce/resume the VF
  - Time to save/resume all system state
  - How fast we can save/restore VF state

These cases lead to the final downtime being larger than what was
configured in by setting a downtime limit.
In some applications it is important to observe the requested downtime
and re-try live migration some other time if the downtime requirements
cannot be satisfied.

This patchset introduces capability to abort live migration if
the downtime exceeds a certain value specified by switchover limit
migration parameter.
When a guest stops at the source, measure the downtime and if
it exceeds a threshold we cancel the migration and resume the guest.
The destination is being notified of the source downtime and its threshold
and starts measuring downtime. Destination will cancel live migration
if downtime exceeds the swithover limit.

The migration with this capability would be used this way for example:

migrate_set_capability return-path on
migrate_set_capability switchover-abort on
migrate_set_parameter downtime-limit 300
migrate_set_parameter switchover-limit 10

The migration will be aborted if the downtime exceeds
10ms (switchover-limit) and total downtime would not
be more than 310ms.

Please send your comments and recommendations.

The patchset idea originally comes from Joao Martins
.


Elena Ufimtseva (2):
  migration: abort when switchover limit exceeded
  migration: abort on destination if switchover limit exceeded

 hw/core/machine.c  |  1 +
 include/migration/client-options.h |  1 +
 migration/migration-hmp-cmds.c | 10 
 migration/migration.c  | 41 +++
 migration/migration.h  | 20 
 migration/options.c| 56 +
 migration/options.h|  1 +
 migration/savevm.c | 81 ++
 migration/savevm.h |  2 +
 migration/trace-events |  3 ++
 qapi/migration.json| 27 --
 11 files changed, 239 insertions(+), 4 deletions(-)

-- 
2.34.1




[PATCH RFC 1/2] migration: abort when switchover limit exceeded

2024-06-21 Thread Elena Ufimtseva
Introduce capability switchover_abort and migration parameter switchover_limit
to allow for live migration abort when the source downtime exceeded by
switchover_limit.

Signed-off-by: Elena Ufimtseva 
---
 hw/core/machine.c  |  1 +
 include/migration/client-options.h |  1 +
 migration/migration-hmp-cmds.c | 10 ++
 migration/migration.c  | 39 +
 migration/migration.h  |  5 +++
 migration/options.c| 56 ++
 migration/options.h|  1 +
 migration/savevm.c | 13 +++
 qapi/migration.json| 27 +++---
 9 files changed, 149 insertions(+), 4 deletions(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 655d75c21f..9459c7adbb 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -38,6 +38,7 @@ GlobalProperty hw_compat_9_0[] = {
 {"arm-cpu", "backcompat-cntfrq", "true" },
 {"scsi-disk-base", "migrate-emulated-scsi-request", "false" },
 {"vfio-pci", "skip-vsc-check", "false" },
+{ "migration", "x-switchover-abort", "off" },
 };
 const size_t hw_compat_9_0_len = G_N_ELEMENTS(hw_compat_9_0);
 
diff --git a/include/migration/client-options.h 
b/include/migration/client-options.h
index 59f4b55cf4..0e9d17f507 100644
--- a/include/migration/client-options.h
+++ b/include/migration/client-options.h
@@ -16,6 +16,7 @@ bool migrate_background_snapshot(void);
 bool migrate_dirty_limit(void);
 bool migrate_postcopy_ram(void);
 bool migrate_switchover_ack(void);
+bool migrate_switchover_abort(void);
 
 /* parameters */
 
diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 9f0e8029e0..4dc8d0ba87 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -312,6 +312,11 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict 
*qdict)
 monitor_printf(mon, "%s: '%s'\n",
 MigrationParameter_str(MIGRATION_PARAMETER_TLS_AUTHZ),
 params->tls_authz);
+assert(params->has_switchover_limit);
+monitor_printf(mon, "%s: %" PRIu64 " ms\n",
+MigrationParameter_str(MIGRATION_PARAMETER_SWITCHOVER_LIMIT),
+params->switchover_limit);
+
 
 if (params->has_block_bitmap_mapping) {
 const BitmapMigrationNodeAliasList *bmnal;
@@ -624,6 +629,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict 
*qdict)
 p->has_mode = true;
 visit_type_MigMode(v, param, &p->mode, &err);
 break;
+case MIGRATION_PARAMETER_SWITCHOVER_LIMIT:
+p->has_switchover_limit = true;
+visit_type_size(v, param, &p->switchover_limit, &err);
+break;
+
 default:
 assert(0);
 }
diff --git a/migration/migration.c b/migration/migration.c
index e1b269624c..5cc304d2db 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -113,6 +113,7 @@ static void migration_downtime_start(MigrationState *s)
 {
 trace_vmstate_downtime_checkpoint("src-downtime-start");
 s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+s->downtime_now = s->downtime_start;
 }
 
 static void migration_downtime_end(MigrationState *s)
@@ -204,6 +205,10 @@ static int migration_stop_vm(MigrationState *s, RunState 
state)
 trace_vmstate_downtime_checkpoint("src-vm-stopped");
 trace_migration_completion_vm_stop(ret);
 
+if (migration_downtime_exceeded()) {
+migration_set_downtime_exceeded_error(s, s->to_dst_file);
+ret = -1;
+}
 return ret;
 }
 
@@ -1652,6 +1657,7 @@ int migrate_init(MigrationState *s, Error **errp)
 s->mbps = 0.0;
 s->pages_per_second = 0.0;
 s->downtime = 0;
+s->downtime_now = 0;
 s->expected_downtime = 0;
 s->setup_time = 0;
 s->start_postcopy = false;
@@ -2758,6 +2764,39 @@ static void migration_completion_failed(MigrationState 
*s,
   MIGRATION_STATUS_FAILED);
 }
 
+int64_t migration_get_current_downtime(MigrationState *s)
+{
+s->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+return s->downtime_now - s->downtime_start;
+}
+
+bool migration_downtime_exceeded(void)
+{
+MigrationState *s = migrate_get_current();
+
+if (!migrate_switchover_abort()) {
+return 0;
+}
+
+return migration_get_current_downtime(s) >= s->parameters.downtime_limit +
+s->parameters.switchover_limit;
+}
+
+int migration_set_downtime_exceeded_error(MigrationState *s, QEMUFile *f)
+{
+int64_t limit = s->parameters.downtime_limit;
+Error *errp = NULL;
+
+error_setg(&errp, "Downtime Limit of %" PRIi64" ms exceeded by %&qu

[PATCH RFC 2/2] migration: abort on destination if switchover limit exceeded

2024-06-21 Thread Elena Ufimtseva
During live migration, receive current downtime from source
and start a downtime timer. When the destination dowtime
and added source downtime exceeds downtime limit for more
than switchover limit, abort live migration on destination.

Signed-off-by: Elena Ufimtseva 
---
 migration/migration.c  |  2 ++
 migration/migration.h  | 15 ++
 migration/savevm.c | 68 ++
 migration/savevm.h |  2 ++
 migration/trace-events |  3 ++
 5 files changed, 90 insertions(+)

diff --git a/migration/migration.c b/migration/migration.c
index 5cc304d2db..64d7290997 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -240,6 +240,8 @@ void migration_object_init(void)
 current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
 
 current_incoming->exit_on_error = INMIGRATE_DEFAULT_EXIT_ON_ERROR;
+/* Downtime will start when source sends its current downtime. */
+current_incoming->downtime_start = 0;
 
 migration_object_check(current_migration, &error_fatal);
 
diff --git a/migration/migration.h b/migration/migration.h
index aa56b70795..06f4ebe214 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -230,6 +230,21 @@ struct MigrationIncomingState {
 
 /* Do exit on incoming migration failure */
 bool exit_on_error;
+
+/* Initial downtime on destination set by MIG_CMD_SEND_SRC_DOWNTIME */
+uint64_t downtime_start;
+/*
+ * Current donwtime on destination that initially set equal to source by
+ * MIG_CMD_SEND_SRC_DOWNTIME, then updated by destination itself.
+ */
+uint64_t downtime_now;
+/*
+ * Abort live migration on destination when current destination downtime
+ * exceeds the abort_limit. abort_limit is being set by
+ * MIG_CMD_SEND_SRC_DOWNTIME sent from source.
+ */
+uint64_t abort_limit;
+uint64_t src_downtime;
 };
 
 MigrationIncomingState *migration_incoming_get_current(void);
diff --git a/migration/savevm.c b/migration/savevm.c
index 031ab03915..f3b5ea98bf 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -90,6 +90,7 @@ enum qemu_vm_cmd {
 MIG_CMD_ENABLE_COLO,   /* Enable COLO */
 MIG_CMD_POSTCOPY_RESUME,   /* resume postcopy on dest */
 MIG_CMD_RECV_BITMAP,   /* Request for recved bitmap on dst */
+MIG_CMD_SEND_SRC_DOWNTIME,/* Send current downtime to dst */
 MIG_CMD_MAX
 };
 
@@ -109,6 +110,7 @@ static struct mig_cmd_args {
 [MIG_CMD_POSTCOPY_RESUME]  = { .len =  0, .name = "POSTCOPY_RESUME" },
 [MIG_CMD_PACKAGED] = { .len =  4, .name = "PACKAGED" },
 [MIG_CMD_RECV_BITMAP]  = { .len = -1, .name = "RECV_BITMAP" },
+[MIG_CMD_SEND_SRC_DOWNTIME] = { .len = -1, .name = "SEND_SRC_DOWNTIME" },
 [MIG_CMD_MAX]  = { .len = -1, .name = "MAX" },
 };
 
@@ -1218,6 +1220,18 @@ void qemu_savevm_send_recv_bitmap(QEMUFile *f, char 
*block_name)
 qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
 }
 
+void qemu_savevm_send_downtime(QEMUFile *f, int64_t abort_limit_ms,
+   int64_t source_downtime)
+{
+uint64_t tmp[2];
+tmp[0] = cpu_to_be64(abort_limit_ms);
+tmp[1] = cpu_to_be64(source_downtime);
+
+trace_qemu_savevm_send_downtime(abort_limit_ms, source_downtime);
+qemu_savevm_command_send(f, MIG_CMD_SEND_SRC_DOWNTIME,
+ 16, (uint8_t *)tmp);
+}
+
 bool qemu_savevm_state_blocked(Error **errp)
 {
 SaveStateEntry *se;
@@ -1635,6 +1649,14 @@ int qemu_savevm_state_complete_precopy(QEMUFile *f, bool 
iterable_only,
 }
 }
 
+if (migrate_switchover_abort()) {
+MigrationState *s = migrate_get_current();
+uint64_t abort_limit_ms =
+s->parameters.downtime_limit + s->parameters.switchover_limit;
+qemu_savevm_send_downtime(f, abort_limit_ms,
+  migration_get_current_downtime(s));
+}
+
 if (iterable_only) {
 goto flush;
 }
@@ -1919,6 +1941,20 @@ static int 
loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
 return 0;
 }
 
+static int loadvm_handle_src_downtime(MigrationIncomingState *mis,
+  uint16_t len)
+{
+uint64_t src_abort_limit = qemu_get_be64(mis->from_src_file);
+uint64_t src_current_downtime = qemu_get_be64(mis->from_src_file);
+
+mis->abort_limit = src_abort_limit;
+mis->src_downtime = src_current_downtime;
+mis->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+trace_loadvm_handle_src_downtime(src_abort_limit, src_current_downtime);
+return 0;
+}
+
 /* After postcopy we will be told to throw some pages away since they're
  * dirty and will have to be demand fetched.  Must happen before CPU is
  * started.
@@ -2540,6 +2576,9 @@ static int loadvm_process_command(QEMUFile *f)
 
 case MIG_CMD_ENABLE_COL

Re: Call for agenda for 2023-09-19 QEMU developers call

2023-09-18 Thread Elena Ufimtseva
Hello Juan,

Not sure if this is worth its own topic, would be it possible to hear the 
community thoughts on the live migration series review/pull progress (atomics, 
zero page multifd etc.. )? Seems like there are few outstanding relevant 
patches.


Thank you!

From: Juan Quintela 
Sent: Friday, September 15, 2023 1:45 AM
To: f4...@amsat.org ; Joao Martins 
; Juan Quintela ; 
md...@redhat.com ; fel...@nutanix.com ; 
afaer...@suse.de ; bazu...@redhat.com ; 
bbau...@redhat.com ; c...@f00f.org ; 
dustin.kirkl...@canonical.com ; 
ebl...@redhat.com ; edgar.igles...@gmail.com 
; eric.au...@redhat.com ; 
i...@theiggy.com ; jan.kis...@web.de ; 
jidong.x...@gmail.com ; jjhe...@linux.vnet.ibm.com 
; m...@linux.vnet.ibm.com 
; peter.mayd...@linaro.org ; 
richard.hender...@linaro.org ; stefa...@gmail.com 
; i...@bsdimp.com ; z@139.com 
; zwu.ker...@gmail.com ; j...@nvidia.com 
; c...@nvidia.com ; David Edmondson 
; Elena Ufimtseva ; 
Konrad Wilk ; Alessandro Di Federico ; 
a...@rev.ng ; shameerali.kolothum.th...@huawei.com 
; wei.w.w...@intel.com 
; chao.p.p...@linux.intel.com 
; qemu-devel@nongnu.org ; 
Mark Burton 
Subject: Call for agenda for 2023-09-19 QEMU developers call

Hi

If you have any topics, please add to this email.

Thanks, Juan.


Re: Call for agenda for 2023-09-19 QEMU developers call

2023-09-18 Thread Elena Ufimtseva
On Tue, Sep 19, 2023 at 02:02:49AM +0200, Juan Quintela wrote:
> Elena Ufimtseva  wrote:
> > Hello Juan,
> >
> > Not sure if this is worth its own topic, would be it possible to hear
> > the community thoughts on the live migration series review/pull
> > progress (atomics, zero page multifd etc.. )? Seems like there are few
> > outstanding relevant patches.
> 
> Hi
> 
> If everybody agrees, can we move this topic to next call?
> I am on vacation this week and the next.
> 
> I was planning time to "moderate" the call, but preparing for a call
> about my topics is going to mean a divorce O:-)
> 

Thank you Juan! Understood :)

> Later, Juan.
> 
> PD.  I have had too many problem in the recent past with several things,
>  from my test machines to disappear (and configuring new ones taking
>  forever), to very bad time with the BOTS.  I expect/hope that
>  things are gonig to get better in the near future.



[PATCH 1/4] multifd: wait for channels_ready before sending sync

2023-09-21 Thread Elena Ufimtseva
In multifd_send_sync_main we need to wait for channels_ready
before submitting sync packet as the threads may still be sending
their previous pages.
There is also no need to check for channels_ready in the loop
before the wait for sem_sync, next iteration of sending pages
or another sync will start with waiting for channels_ready
semaphore.
Changes to commit 90b3cec351996dd8ef4eb847ad38607812c5e7f5
("multifd: Fix the number of channels ready")

Signed-off-by: Elena Ufimtseva 
---
 migration/multifd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index 0f6b203877..e61e458151 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -595,6 +595,7 @@ int multifd_send_sync_main(QEMUFile *f)
 }
 }
 
+qemu_sem_wait(&multifd_send_state->channels_ready);
 /*
  * When using zero-copy, it's necessary to flush the pages before any of
  * the pages can be sent again, so we'll make sure the new version of the
@@ -630,7 +631,6 @@ int multifd_send_sync_main(QEMUFile *f)
 for (i = 0; i < migrate_multifd_channels(); i++) {
 MultiFDSendParams *p = &multifd_send_state->params[i];
 
-qemu_sem_wait(&multifd_send_state->channels_ready);
 trace_multifd_send_sync_main_wait(p->id);
 qemu_sem_wait(&p->sem_sync);
 
-- 
2.34.1




[PATCH 2/4] migration: check for rate_limit_max for RATE_LIMIT_DISABLED

2023-09-21 Thread Elena Ufimtseva
In migration rate limiting atomic operations are used
to read the rate limit variables and transferred bytes and
they are expensive. Check first if rate_limit_max is equal
to RATE_LIMIT_DISABLED and return false immediately if so.

Signed-off-by: Elena Ufimtseva 
---
 migration/migration-stats.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/migration/migration-stats.c b/migration/migration-stats.c
index 095d6d75bb..abc31483d5 100644
--- a/migration/migration-stats.c
+++ b/migration/migration-stats.c
@@ -24,14 +24,14 @@ bool migration_rate_exceeded(QEMUFile *f)
 return true;
 }
 
-uint64_t rate_limit_start = stat64_get(&mig_stats.rate_limit_start);
-uint64_t rate_limit_current = migration_transferred_bytes(f);
-uint64_t rate_limit_used = rate_limit_current - rate_limit_start;
 uint64_t rate_limit_max = stat64_get(&mig_stats.rate_limit_max);
-
 if (rate_limit_max == RATE_LIMIT_DISABLED) {
 return false;
 }
+uint64_t rate_limit_start = stat64_get(&mig_stats.rate_limit_start);
+uint64_t rate_limit_current = migration_transferred_bytes(f);
+uint64_t rate_limit_used = rate_limit_current - rate_limit_start;
+
 if (rate_limit_max > 0 && rate_limit_used > rate_limit_max) {
 return true;
 }
-- 
2.34.1




[PATCH 3/4] multifd: fix counters in multifd_send_thread

2023-09-21 Thread Elena Ufimtseva
Previous commit cbec7eb76879d419e7dbf531ee2506ec0722e825
"migration/multifd: Compute transferred bytes correctly"
removed accounting for packet_len in non-rdma
case, but the next_packet_size only accounts for pages, not for
the header packet (normal_pages * PAGE_SIZE) that is being sent
as iov[0]. The packet_len part should be added to account for
the size of MultiFDPacket and the array of the offsets.

Signed-off-by: Elena Ufimtseva 
---
 migration/multifd.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index e61e458151..3281397b18 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -714,8 +714,6 @@ static void *multifd_send_thread(void *opaque)
 if (ret != 0) {
 break;
 }
-stat64_add(&mig_stats.multifd_bytes, p->packet_len);
-stat64_add(&mig_stats.transferred, p->packet_len);
 } else {
 /* Send header using the same writev call */
 p->iov[0].iov_len = p->packet_len;
@@ -728,8 +726,10 @@ static void *multifd_send_thread(void *opaque)
 break;
 }
 
-stat64_add(&mig_stats.multifd_bytes, p->next_packet_size);
-stat64_add(&mig_stats.transferred, p->next_packet_size);
+stat64_add(&mig_stats.multifd_bytes,
+   p->next_packet_size + p->packet_len);
+stat64_add(&mig_stats.transferred,
+   p->next_packet_size + p->packet_len);
 qemu_mutex_lock(&p->mutex);
 p->pending_job--;
 qemu_mutex_unlock(&p->mutex);
-- 
2.34.1




[PATCH 0/4] multifd: various fixes

2023-09-21 Thread Elena Ufimtseva
Hello

While working and testing various live migration scenarios,
a few issues were found.

This is my first patches in live migration and I will
appreciate the suggestions from the community if these
patches could be done differently.

[PATCH 1/4] multifd: wait for channels_ready before sending sync
I am not certain about this change since it seems that
the sync flag could be the part of the packets with pages that are
being sent out currently.
But the traces show this is not always the case:
multifd_send 230.873 pid=55477 id=0x0 packet_num=0x6f4 normal=0x40 flags=0x1 
next_packet_size=0x4
multifd_send 14.718 pid=55477 id=0x1 packet_num=0x6f5 normal=0x0 flags=0x1 
next_packet_size=0x8
If the sync packet is indeed can be a standalone one, then waiting for
channels_ready before seem to be appropriate, but waisting iteration on
sync only packet.
[PATCH 4/4] is also relevant to 1/4, but fixes the over-accounting in
case of sync only packet.


Thank you in advance and looking forward for your feedback.

Elena

Elena Ufimtseva (4):
  multifd: wait for channels_ready before sending sync
  migration: check for rate_limit_max for RATE_LIMIT_DISABLED
  multifd: fix counters in multifd_send_thread
  multifd: reset next_packet_len after sending pages

 migration/migration-stats.c |  8 
 migration/multifd.c | 11 ++-
 2 files changed, 10 insertions(+), 9 deletions(-)

-- 
2.34.1




[PATCH 4/4] multifd: reset next_packet_len after sending pages

2023-09-21 Thread Elena Ufimtseva
Sometimes multifd sends just sync packet with no pages
(normal_num is 0). In this case the old value is being
preserved and being accounted for while only packet_len
is being transferred.
Reset it to 0 after sending and accounting for.

TODO: Fix the same packet ids in the stream.
with this patch, there is still an issue with the duplicated
packets ids being sent (with different number of pages/flags).
See in below multifd_send trace (before this change):
multifd_send 394.774 pid=55477 id=0x1 packet_num=0x6f0 normal=0x57 flags=0x1 
next_packet_size=0x57000
multifd_send 181.244 pid=55477 id=0x1 packet_num=0x6f0 normal=0x0 flags=0x0 
next_packet_size=0x57000

With this commit there are still duplicated packets, but since no pages
are being sent with sync flag set, next_packet_size is 0:
multifd_send 27.814 pid=18602 id=0x1 packet_num=0x574 normal=0x7b flags=0x1 
next_packet_size=0x7b000
multifd_send 136054.792 pid=18602 id=0x1 packet_num=0x574 normal=0x0 flags=0x0 
next_packet_size=0x0
If there is a suggestion how to fix this properly, I will be
glad to use it.

Signed-off-by: Elena Ufimtseva 
---
 migration/multifd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/migration/multifd.c b/migration/multifd.c
index 3281397b18..8b4e26051b 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -730,6 +730,7 @@ static void *multifd_send_thread(void *opaque)
p->next_packet_size + p->packet_len);
 stat64_add(&mig_stats.transferred,
p->next_packet_size + p->packet_len);
+p->next_packet_size = 0;
 qemu_mutex_lock(&p->mutex);
 p->pending_job--;
 qemu_mutex_unlock(&p->mutex);
-- 
2.34.1




Re: [RFC PATCH 1/3] migration/multifd: Move channels_ready semaphore

2023-09-22 Thread Elena Ufimtseva
On Fri, Sep 22, 2023 at 11:53:17AM -0300, Fabiano Rosas wrote:
> Commit d2026ee117 ("multifd: Fix the number of channels ready") moved
> the "post" of channels_ready to the start of the multifd_send_thread()
> loop and added a missing "wait" at multifd_send_sync_main(). While it
> does work, the placement of the wait goes against what the rest of the
> code does.
> 
> The sequence at multifd_send_thread() is:
> 
> qemu_sem_post(&multifd_send_state->channels_ready);
> qemu_sem_wait(&p->sem);
> 
> if (flags & MULTIFD_FLAG_SYNC) {
> qemu_sem_post(&p->sem_sync);
> }
> 
> Which means that the sending thread makes itself available
> (channels_ready) and waits for more work (sem). So the sequence in the
> migration thread should be to check if any channel is available
> (channels_ready), give it some work and set it off (sem):
> 
> qemu_sem_wait(&multifd_send_state->channels_ready);
> 
> qemu_sem_post(&p->sem);
> if (flags & MULTIFD_FLAG_SYNC) {
> qemu_sem_wait(&p->sem_sync);
> }
> 
> The reason there's no deadlock today is that the migration thread
> enqueues the SYNC packet right before the wait on channels_ready and
> we end up taking advantage of the out-of-order post to sem:
> 
> ...
> qemu_sem_post(&p->sem);
> }
> for (i = 0; i < migrate_multifd_channels(); i++) {
> MultiFDSendParams *p = &multifd_send_state->params[i];
> 
> qemu_sem_wait(&multifd_send_state->channels_ready);
> trace_multifd_send_sync_main_wait(p->id);
> qemu_sem_wait(&p->sem_sync);
>   ...
> 
> Move the channels_ready wait before the sem post to keep the sequence
> consistent. Also fix the error path to post to channels_ready and
> sem_sync in the correct order.
>

Thank you Fabiano,

Your solution is more complete. I also had in mind getting rid of
sem_sync.

With your second patch, this one could be merged with it?

> Signed-off-by: Fabiano Rosas 
> ---
>  migration/multifd.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/migration/multifd.c b/migration/multifd.c
> index a7c7a947e3..d626740f2f 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -618,6 +618,7 @@ int multifd_send_sync_main(QEMUFile *f)
>  
>  trace_multifd_send_sync_main_signal(p->id);
>  
> +qemu_sem_wait(&multifd_send_state->channels_ready);
>  qemu_mutex_lock(&p->mutex);
>  
>  if (p->quit) {
> @@ -635,7 +636,6 @@ int multifd_send_sync_main(QEMUFile *f)
>  for (i = 0; i < migrate_multifd_channels(); i++) {
>  MultiFDSendParams *p = &multifd_send_state->params[i];
>  
> -qemu_sem_wait(&multifd_send_state->channels_ready);
>  trace_multifd_send_sync_main_wait(p->id);
>  qemu_sem_wait(&p->sem_sync);
>  
> @@ -763,8 +763,8 @@ out:
>   * who pay attention to me.
>   */
>  if (ret != 0) {
> -qemu_sem_post(&p->sem_sync);
>  qemu_sem_post(&multifd_send_state->channels_ready);
> +qemu_sem_post(&p->sem_sync);

Can this thread in this error case be woken up again between
these two qemu_sem_posts?
I see in other places p->quit is set to true before it.
Or maybe it should one more patch to make these consistent 
as well.

Elena U.
>  }
>  
>  qemu_mutex_lock(&p->mutex);
> -- 
> 2.35.3
> 



Re: [PATCH 1/4] multifd: wait for channels_ready before sending sync

2023-09-22 Thread Elena Ufimtseva
On Fri, Sep 22, 2023 at 01:06:53PM -0300, Fabiano Rosas wrote:
> Elena Ufimtseva  writes:
> 
> > In multifd_send_sync_main we need to wait for channels_ready
> > before submitting sync packet as the threads may still be sending
> > their previous pages.
> > There is also no need to check for channels_ready in the loop
> > before the wait for sem_sync, next iteration of sending pages
> > or another sync will start with waiting for channels_ready
> > semaphore.
> > Changes to commit 90b3cec351996dd8ef4eb847ad38607812c5e7f5
> > ("multifd: Fix the number of channels ready")
> >
> > Signed-off-by: Elena Ufimtseva 
> > ---
> >  migration/multifd.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/migration/multifd.c b/migration/multifd.c
> > index 0f6b203877..e61e458151 100644
> > --- a/migration/multifd.c
> > +++ b/migration/multifd.c
> > @@ -595,6 +595,7 @@ int multifd_send_sync_main(QEMUFile *f)
> >  }
> >  }
> >  
> > +qemu_sem_wait(&multifd_send_state->channels_ready);
> >  /*
> >   * When using zero-copy, it's necessary to flush the pages before any 
> > of
> >   * the pages can be sent again, so we'll make sure the new version of 
> > the
> > @@ -630,7 +631,6 @@ int multifd_send_sync_main(QEMUFile *f)
> >  for (i = 0; i < migrate_multifd_channels(); i++) {
> >  MultiFDSendParams *p = &multifd_send_state->params[i];
> >  
> > -qemu_sem_wait(&multifd_send_state->channels_ready);
> >  trace_multifd_send_sync_main_wait(p->id);
> >  qemu_sem_wait(&p->sem_sync);
> 
> Please take a look at the series I just sent. Basically, I think we
> should wait on 'sem' for the number of existing channels and not just
> once per sync. Otherwise I think we'd hit the same issue this patch is
> trying to fix when we loop into the n+1 channels. I think the
> assert(!p->pending_job) in patch 3 helps prove that's more appropriate.

Thank you!

These patches make sense to me.
Agree on redundant sem_sync. Lets see what others think.

I will run some tests as well with your patches and spend
more time looking at [2/3] patch.

Elena U.

> 
> Let me know what you think.
> 
> Thanks



[PATCH v2 3/4] multifd: fix counters in multifd_send_thread

2023-10-11 Thread Elena Ufimtseva
Previous commit cbec7eb76879d419e7dbf531ee2506ec0722e825
"migration/multifd: Compute transferred bytes correctly"
removed accounting for packet_len in non-rdma
case, but the next_packet_size only accounts for pages, not for
the header packet (normal_pages * PAGE_SIZE) that is being sent
as iov[0]. The packet_len part should be added to account for
the size of MultiFDPacket and the array of the offsets.

Signed-off-by: Elena Ufimtseva 
Reviewed-by: Fabiano Rosas 
---
 migration/multifd.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index 0f6b203877..e6e0013c16 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -714,8 +714,6 @@ static void *multifd_send_thread(void *opaque)
 if (ret != 0) {
 break;
 }
-stat64_add(&mig_stats.multifd_bytes, p->packet_len);
-stat64_add(&mig_stats.transferred, p->packet_len);
 } else {
 /* Send header using the same writev call */
 p->iov[0].iov_len = p->packet_len;
@@ -728,8 +726,10 @@ static void *multifd_send_thread(void *opaque)
 break;
 }
 
-stat64_add(&mig_stats.multifd_bytes, p->next_packet_size);
-stat64_add(&mig_stats.transferred, p->next_packet_size);
+stat64_add(&mig_stats.multifd_bytes,
+   p->next_packet_size + p->packet_len);
+stat64_add(&mig_stats.transferred,
+   p->next_packet_size + p->packet_len);
 qemu_mutex_lock(&p->mutex);
 p->pending_job--;
 qemu_mutex_unlock(&p->mutex);
-- 
2.34.1




[PATCH v2 1/4] migration: check for rate_limit_max for RATE_LIMIT_DISABLED

2023-10-11 Thread Elena Ufimtseva
In migration rate limiting atomic operations are used
to read the rate limit variables and transferred bytes and
they are expensive. Check first if rate_limit_max is equal
to RATE_LIMIT_DISABLED and return false immediately if so.

Note that with this patch we will also will stop flushing
by not calling qemu_fflush() from migration_transferred_bytes()
if the migration rate is not exceeded.
This should be fine since migration thread calls in the loop
migration_update_counters from migration_rate_limit() that
calls the migration_transferred_bytes() and flushes there.

Signed-off-by: Elena Ufimtseva 
Reviewed-by: Fabiano Rosas 
Reviewed-by: Peter Xu 
---
 migration/migration-stats.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/migration/migration-stats.c b/migration/migration-stats.c
index 84e11e6dd8..4cc989d975 100644
--- a/migration/migration-stats.c
+++ b/migration/migration-stats.c
@@ -24,14 +24,15 @@ bool migration_rate_exceeded(QEMUFile *f)
 return true;
 }
 
+uint64_t rate_limit_max = migration_rate_get();
+if (rate_limit_max == RATE_LIMIT_DISABLED) {
+return false;
+}
+
 uint64_t rate_limit_start = stat64_get(&mig_stats.rate_limit_start);
 uint64_t rate_limit_current = migration_transferred_bytes(f);
 uint64_t rate_limit_used = rate_limit_current - rate_limit_start;
-uint64_t rate_limit_max = stat64_get(&mig_stats.rate_limit_max);
 
-if (rate_limit_max == RATE_LIMIT_DISABLED) {
-return false;
-}
 if (rate_limit_max > 0 && rate_limit_used > rate_limit_max) {
 return true;
 }
-- 
2.34.1




[PATCH v2 4/4] multifd: reset next_packet_len after sending pages

2023-10-11 Thread Elena Ufimtseva
Sometimes multifd sends just sync packet with no pages
(normal_num is 0). In this case the old value is being
preserved and being accounted for while only packet_len
is being transferred.
Reset it to 0 after sending and accounting for.

Signed-off-by: Elena Ufimtseva 
Reviewed-by: Fabiano Rosas 
---
 migration/multifd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/migration/multifd.c b/migration/multifd.c
index e6e0013c16..c45f5015f8 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -730,6 +730,7 @@ static void *multifd_send_thread(void *opaque)
p->next_packet_size + p->packet_len);
 stat64_add(&mig_stats.transferred,
p->next_packet_size + p->packet_len);
+p->next_packet_size = 0;
 qemu_mutex_lock(&p->mutex);
 p->pending_job--;
 qemu_mutex_unlock(&p->mutex);
-- 
2.34.1




[PATCH v2 0/4] multifd: various fixes

2023-10-11 Thread Elena Ufimtseva
Hello

While working and testing various live migration scenarios,
a few issues were found.

This is the version 2 of the changes with few non-functional
modifications minus the dropped patch.
I have dropped the patch [1/4] since the discussion with Fabiano
and his proposed changes:
https://www.mail-archive.com/qemu-devel@nongnu.org/msg995782.html

In new patchset, the [PATCH 1/4] addresses Peter's and Fabiano's
comments and Reviewed-by are added.

I added [PATCH 2/4] to add more description about the packet_len
and next_packet_size.

Patches 3,4 are unchanged, added Reviewed-by and moved discussion
of the other issues under "---".


Thank you in advance and looking forward for your feedback.
 
Elena Ufimtseva (4):
  migration: check for rate_limit_max for RATE_LIMIT_DISABLED
  multifd: document packet_len, next_packet_size
  multifd: fix counters in multifd_send_thread
  multifd: reset next_packet_len after sending pages

 migration/migration-stats.c |  9 +
 migration/multifd.c |  9 +
 migration/multifd.h | 35 ++-
 3 files changed, 40 insertions(+), 13 deletions(-)

-- 
2.34.1




[PATCH v2 2/4] multifd: document packet_len, next_packet_size

2023-10-11 Thread Elena Ufimtseva
next_packet_size name is a bit misleading, so add more comments
where its defined.
We send data in two chunks in multifd thread:
 - send the packet with normal (non-zero) guest pages offsets that are
   dirty.
   This uses the packet_len and we increment number of packets
   for this thread that are sent;
 - send the normal (non-zero) guest dirty pages themselves in iovs.
   The total size of the data pointed by all iovs for this chunk
   is next_packet_size. We do not increment the packet_num for this
   thread when sending actual pages;

When compression is enabled, next_packet_size is used to indicate
the size of the compressed buffer on source and destination.

Will be it helpful to rename it as data_size or dirty_data_size?

Signed-off-by: Elena Ufimtseva 
---
 migration/multifd.h | 35 ++-
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/migration/multifd.h b/migration/multifd.h
index a835643b48..37da9b68c2 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -45,7 +45,13 @@ typedef struct {
 uint32_t pages_alloc;
 /* non zero pages */
 uint32_t normal_pages;
-/* size of the next packet that contains pages */
+/*
+ * amount of data to be sent to the destination
+ * that is calculated as
+ *  - number of the normal guest dirty pages * page_size in non
+ *compression case;
+ *  - equals of the compressed data size to be received;
+ */
 uint32_t next_packet_size;
 uint64_t packet_num;
 uint64_t unused[4];/* Reserved for future use */
@@ -79,11 +85,18 @@ typedef struct {
 QIOChannel *c;
 /* is the yank function registered */
 bool registered_yank;
-/* packet allocated len */
+/*
+ * allocated length of a packet to be transferred.
+ * It has a size of MultiFDPacket struct plus
+ * the size of the array of guest page offsets (page_count * page_size).
+ */
 uint32_t packet_len;
 /* guest page size */
 uint32_t page_size;
-/* number of pages in a full packet */
+/*
+ * maximum number of dirty pages in a full packet calculated as
+ * MULTIFD_PACKET_SIZE / qemu_target_page_size()
+ */
 uint32_t page_count;
 /* multifd flags for sending ram */
 int write_flags;
@@ -116,7 +129,13 @@ typedef struct {
 
 /* pointer to the packet */
 MultiFDPacket_t *packet;
-/* size of the next packet that contains pages */
+/*
+ * amount of data to be sent to the destination
+ * that is calculated as
+ *  - number of the normal guest dirty pages * page_size in non
+ *compression case;
+ *  - equals of the compressed data size to be received;
+ */
 uint32_t next_packet_size;
 /* packets sent through this channel */
 uint64_t num_packets;
@@ -171,7 +190,13 @@ typedef struct {
 
 /* pointer to the packet */
 MultiFDPacket_t *packet;
-/* size of the next packet that contains pages */
+/*
+ * amount of data to be received by the destination
+ * that is calculated as
+ *  - number of the normal guest dirty pages * page_size in non
+ *compression case;
+ *  - equals of the compressed data size to be received;
+ */
 uint32_t next_packet_size;
 /* packets sent through this channel */
 uint64_t num_packets;
-- 
2.34.1




Re: [PATCH v2 00/20] Use Intel DSA accelerator to offload zero page checking in multifd live migration.

2023-11-15 Thread Elena Ufimtseva
Hello Hao,

On Mon, Nov 13, 2023 at 9:42 PM Hao Xiang  wrote:
>
> v2
> * Rebase on top of 3e01f1147a16ca566694b97eafc941d62fa1e8d8.
> * Leave Juan's changes in their original form instead of squashing them.
> * Add a new commit to refactor the multifd_send_thread function to prepare 
> for introducing the DSA offload functionality.
> * Use page count to configure multifd-packet-size option.
> * Don't use the FLAKY flag in DSA tests.
> * Test if DSA integration test is setup correctly and skip the test if
> * not.
> * Fixed broken link in the previous patch cover.
>
> * Background:
>
> I posted an RFC about DSA offloading in QEMU:
> https://patchew.org/QEMU/20230529182001.2232069-1-hao.xi...@bytedance.com/
>
> This patchset implements the DSA offloading on zero page checking in
> multifd live migration code path.
>


Do you have performance numbers with different packet sizes for DSA
and non-DSA cases?
What have you found was an optimal size for DSA offloading?

Thank you!
> * Overview:
>
> Intel Data Streaming Accelerator(DSA) is introduced in Intel's 4th generation
> Xeon server, aka Sapphire Rapids.
> https://cdrdv2-public.intel.com/671116/341204-intel-data-streaming-accelerator-spec.pdf
> https://www.intel.com/content/www/us/en/content-details/759709/intel-data-streaming-accelerator-user-guide.html
> One of the things DSA can do is to offload memory comparison workload from
> CPU to DSA accelerator hardware. This patchset implements a solution to 
> offload
> QEMU's zero page checking from CPU to DSA accelerator hardware. We gain
> two benefits from this change:
> 1. Reduces CPU usage in multifd live migration workflow across all use
> cases.
> 2. Reduces migration total time in some use cases.
>
> * Design:
>
> These are the logical steps to perform DSA offloading:
> 1. Configure DSA accelerators and create user space openable DSA work
> queues via the idxd driver.
> 2. Map DSA's work queue into a user space address space.
> 3. Fill an in-memory task descriptor to describe the memory operation.
> 4. Use dedicated CPU instruction _enqcmd to queue a task descriptor to
> the work queue.
> 5. Pull the task descriptor's completion status field until the task
> completes.
> 6. Check return status.
>
> The memory operation is now totally done by the accelerator hardware but
> the new workflow introduces overheads. The overhead is the extra cost CPU
> prepares and submits the task descriptors and the extra cost CPU pulls for
> completion. The design is around minimizing these two overheads.
>
> 1. In order to reduce the overhead on task preparation and submission,
> we use batch descriptors. A batch descriptor will contain N individual
> zero page checking tasks where the default N is 128 (default packet size
> / page size) and we can increase N by setting the packet size via a new
> migration option.
> 2. The multifd sender threads prepares and submits batch tasks to DSA
> hardware and it waits on a synchronization object for task completion.
> Whenever a DSA task is submitted, the task structure is added to a
> thread safe queue. It's safe to have multiple multifd sender threads to
> submit tasks concurrently.
> 3. Multiple DSA hardware devices can be used. During multifd initialization,
> every sender thread will be assigned a DSA device to work with. We
> use a round-robin scheme to evenly distribute the work across all used
> DSA devices.
> 4. Use a dedicated thread dsa_completion to perform busy pulling for all
> DSA task completions. The thread keeps dequeuing DSA tasks from the
> thread safe queue. The thread blocks when there is no outstanding DSA
> task. When pulling for completion of a DSA task, the thread uses CPU
> instruction _mm_pause between the iterations of a busy loop to save some
> CPU power as well as optimizing core resources for the other hypercore.
> 5. DSA accelerator can encounter errors. The most popular error is a
> page fault. We have tested using devices to handle page faults but
> performance is bad. Right now, if DSA hits a page fault, we fallback to
> use CPU to complete the rest of the work. The CPU fallback is done in
> the multifd sender thread.
> 6. Added a new migration option multifd-dsa-accel to set the DSA device
> path. If set, the multifd workflow will leverage the DSA devices for
> offloading.
> 7. Added a new migration option multifd-normal-page-ratio to make
> multifd live migration easier to test. Setting a normal page ratio will
> make live migration recognize a zero page as a normal page and send
> the entire payload over the network. If we want to send a large network
> payload and analyze throughput, this option is useful.
> 8. Added a new migration option multifd-packet-size. This can increase
> the number of pages being zero page checked and sent over the network.
> The extra synchronization between the sender threads and the dsa
> completion thread is an overhead. Using a large packet size can reduce
> that overhead.
>
> * Performance:
>
> We use two Intel 

Re: [Qemu-devel] [RFC PATCH v1 7/8] multi-process QEMU: introduce proxy object

2018-10-18 Thread Elena Ufimtseva
On Fri, Oct 12, 2018 at 07:48:34PM -0400, Jagannathan Raman wrote:
> From: Elena Ufimtseva 
> 
> Define PCI Device proxy object as a parent of TYPE_PCI_DEVICE.
> PCI Proxy Object will register PCI BARs, MemoryRegionOps to handle
> access to the BARs and forward those to the remote device.
> PCI Proxy object intercepts config space reads and writes. In case
> of pci config write it forwards it to the remote device using
> communication channel.
> 
> TODO:
> - Handle interrupt messages from the emulation program and implement
>   DMA operations.
>
And one more to add to TODO:
  - dont use malloc/free.

Elena 
> Signed-off-by: Elena Ufimtseva 
> ---
>  hw/Makefile.objs|   2 +
>  hw/qemu-proxy.c | 215 
> 
>  include/hw/qemu-proxy.h |  56 +
>  3 files changed, 273 insertions(+)
>  create mode 100644 hw/qemu-proxy.c
>  create mode 100644 include/hw/qemu-proxy.h
> 
> diff --git a/hw/Makefile.objs b/hw/Makefile.objs
> index 9c99c29..6bb2eb0 100644
> --- a/hw/Makefile.objs
> +++ b/hw/Makefile.objs
> @@ -44,3 +44,5 @@ scsi-dev-obj-y += scsi/
>  scsi-dev-obj-y += block/
>  scsi-dev-obj-y += pci/
>  scsi-dev-obj-y += nvram/
> +
> +common-obj-y += qemu-proxy.o
> diff --git a/hw/qemu-proxy.c b/hw/qemu-proxy.c
> new file mode 100644
> index 000..1712b41
> --- /dev/null
> +++ b/hw/qemu-proxy.c
> @@ -0,0 +1,215 @@
> +/*
> + * Copyright 2018, Oracle and/or its affiliates. All rights reserved.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a 
> copy
> + * of this software and associated documentation files (the "Software"), to 
> deal
> + * in the Software without restriction, including without limitation the 
> rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
> FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "qemu/osdep.h"
> +#include "qapi/error.h"
> +#include "io/proxy-link.h"
> +#include "exec/memory.h"
> +#include "exec/cpu-common.h"
> +#include "exec/address-spaces.h"
> +#include "qemu/int128.h"
> +#include "qemu/range.h"
> +#include "hw/pci/pci.h"
> +#include "qemu/option.h"
> +#include "qemu/config-file.h"
> +#include "qapi/qmp/qjson.h"
> +#include "qapi/qmp/qstring.h"
> +#include "sysemu/sysemu.h"
> +#include "hw/qemu-proxy.h"
> +
> +char command[] = "qemu-scsi-dev";
> +
> +static void pci_proxy_dev_realize(PCIDevice *dev, Error **errp);
> +
> +int config_op_send(PCIProxyDev *dev, uint32_t addr, uint32_t val, int l,
> +unsigned int op)
> +{
> +ProcMsg msg;
> +struct conf_data_msg conf_data;
> +
> +conf_data.addr = addr;
> +conf_data.val = val;
> +conf_data.l = l;
> +
> +
> +msg.data2 = (uint8_t *)malloc(sizeof(conf_data));
> +if (!msg.data2) {
> +printf("Failed to allocate memory for msg.data2\n");
> +return -ENOMEM;
> +}
> +memcpy(msg.data2, (const uint8_t *)&conf_data, sizeof(conf_data));
> +msg.size = sizeof(conf_data);
> +msg.num_fds = 0;
> +msg.cmd = op;
> +msg.bytestream = 1;
> +
> +proxy_proc_send(dev->proxy_dev.proxy_link, &msg);
> +free(msg.data2);
> +
> +return 0;
> +}
> +
> +static uint32_t pci_proxy_read_config(PCIDevice *d,
> +   uint32_t addr, int len)
> +{
> +config_op_send(PCI_PROXY_DEV(d), addr, 0, 0, CONF_READ);
> +return pci_default_read_config(d, addr, len);
> +}
> +
> +static void pci_proxy_write_config(PCIDevice *d, uint32_t addr, uint32_t val,
&

Re: [PATCH RFC 2/2] migration: abort on destination if switchover limit exceeded

2024-07-26 Thread Elena Ufimtseva
On Wed, Jun 26, 2024 at 02:41:34PM -0400, Peter Xu wrote:
> On Wed, Jun 26, 2024 at 12:04:43PM +0100, Joao Martins wrote:
> > Are you thinking in something specifically?
> 
> Not really. I don't think I have any idea on how to make it better,
> unfortunately, but we did some measurement too quite some time ago and I
> can share some below.


Hello Peter

I apologize for such a long delay with the reply.

> 
> > 
> > Many "variables" affect this from the point we decide switchover, and at the
> > worst (likely) case it means having qemu subsystems declare empirical 
> > values on
> > how long it takes to suspend/resume/transfer-state to migration expected
> > downtime prediction equation. Part of the reason that having headroom within
> > downtime-limit was a simple 'catch-all' (from our PoV) in terms of
> > maintainability while giving user something to fallback for characterizing 
> > its
> > SLA.
> 
> Yes, I think this might be a way to go, by starting with something that can
> catch all.


Possibly the title "strict SLA" is not the best choice of
words as it creates impression that the guarantees will be met.
But essentially this switchover limit is a safeguard against the unknowns
that can contribute to the downtime during the stop-copy and can be not
that easy to account for (or even impossible due to hardware
implementation or other issues).

To show what kind of statistics we see in our environments and what
are the main contributors please see below.

Example 1: host migration, default downtime set to 300:

Checkpoints analysis:

  checkpoint=src-downtime-start -> checkpoint=src-vm-stopped:   
74244 (us)
  checkpoint=src-vm-stopped -> checkpoint=src-iterable-saved:   
154493 (us)
  checkpoint=src-iterable-saved -> checkpoint=src-non-iterable-saved:   
4746 (us)
  checkpoint=src-non-iterable-saved -> checkpoint=dst-precopy-loadvm-completed: 
224981 (us)
  checkpoint=dst-precopy-loadvm-completed -> checkpoint=dst-precopy-bh-enter:   
36 (us)
  checkpoint=dst-precopy-bh-enter -> checkpoint=dst-precopy-bh-announced:   
7859 (us)
  checkpoint=dst-precopy-bh-announced -> checkpoint=dst-precopy-bh-vm-started:  
15995 (us)
  checkpoint=dst-precopy-bh-vm-started -> checkpoint=src-downtime-end:  
236 (us)

Iterable device analysis:

  Device SAVE of  ram:  0 took 151054 
(us)
  Device LOAD of  ram:  0 took 146855 
(us)
  Device SAVE of  :20:04.0:00.0:00.0/vfio:  0 took   2127 
(us)
  Device LOAD of  :20:04.0:00.0:00.0/vfio:  0 took 144202 
(us)

Non-iterable device analysis:

  Device LOAD of  :20:04.0:00.0:00.0/vfio:  0 took  67470 
(us)
  Device LOAD of :00:01.0/vga:  0 took   7527 
(us)
  Device LOAD of  :00:02.0/e1000e:  0 took   1715 
(us)
  Device LOAD of  kvm-tpr-opt:  0 took   1697 
(us)
  Device LOAD of  :00:03.0/virtio-blk:  0 took   1340 
(us)
  Device SAVE of  :00:02.0/e1000e:  0 took   1036 
(us)
  Device LOAD of :00:00.0/mch:  0 took   1035 
(us)
  Device LOAD of :20:04.0:00.0/pcie-root-port:  0 took976 
(us)
  Device LOAD of :00:1f.0/ICH9LPC:  0 took851 
(us)
  Device LOAD of   :00:1f.2/ich9_ahci:  0 took578 
(us)

(qemu) info migrate
globals:
store-global-state: on
only-migratable: off
send-configuration: on
send-section-footer: on
decompress-error-check: on
clear-bitmap-shift: 18
Migration status: completed
total time: 5927 ms
downtime: 483 ms
setup: 78 ms
transferred ram: 883709 kbytes
throughput: 1237.71 mbps
remaining ram: 0 kbytes
total ram: 33571656 kbytes
duplicate: 8192488 pages
skipped: 0 pages
normal: 201300 pages
normal bytes: 805200 kbytes
dirty sync count: 3
page size: 4 kbytes
multifd bytes: 0 kbytes
pages-per-second: 958776
precopy ram: 480464 kbytes
downtime ram: 398313 kbytes
vfio device transferred: 4496 kbytes

Example 2: different system than above,  live migration over 100Gbit/s 
connection and 2 vfio virtual functions (the guest has no workload and
vfio devices are not engaged in VM and have same amount of data to live
migrate).

Displayed outliers that are larger than 3 us.

Save:
252812@1721976657.700972:vmstate_downtime_checkpoint src-downtime-start
252812@1721976657.829180:vmstate_downtime_checkpoint src-vm-stopped
252812@1721976657.967987:vmstate_downtime_save type=iterable idstr=ram 
instance_id=0 downtime=138005
252812@1721976658.093218:vmstate_downtime_save type=iterable 
idstr=:00:02.0/vfio instance_id=0 downtime=125188
252812@1721976658.318101:vmstate_downtime_save type=iterable 
idstr=:00:03.0/vfio instance_id=0 downtime=224857
252812@1721976658.318125:vmstate_downtime_checkpoint src-iterable-saved
...

Load:
353

Re: [PATCH v5 48/50] multi-process: Validate incoming commands from Proxy

2020-02-28 Thread Elena Ufimtseva
On Thu, Feb 27, 2020 at 05:18:30PM +, Stefan Hajnoczi wrote:
> On Mon, Feb 24, 2020 at 03:55:39PM -0500, Jagannathan Raman wrote:
> > From: Elena Ufimtseva 
> > 
> > Validate the incoming commands to confirm that they would not cause any
> > errors in the remote process.
> > 
> > Signed-off-by: Elena Ufimtseva 
> > Signed-off-by: Jagannathan Raman 
> > Signed-off-by: John G Johnson 
> > ---
> >  hw/proxy/qemu-proxy.c|  6 +++-
> >  include/io/mpqemu-link.h |  2 ++
> >  io/mpqemu-link.c | 75 
> > +++-
> >  remote/remote-main.c |  4 +++
> >  4 files changed, 85 insertions(+), 2 deletions(-)
> 
> Please squash this into the patch(es) that introduced the code.
> 
> Reviewers want to see a logical sequence of patches.  Introducing
> unsafe code in an earlier patch and adding checks in a later patch makes
> it impossible to review the patches in sequence (reviewers would waste
> time pointing out bugs that end up getting fixed later).
>

Thanks Stefan, will merge that with appropriate patches.
 
> > diff --git a/remote/remote-main.c b/remote/remote-main.c
> > index 20d160e..c4aa3e0 100644
> > --- a/remote/remote-main.c
> > +++ b/remote/remote-main.c
> > @@ -435,6 +435,10 @@ static void process_msg(GIOCondition cond, 
> > MPQemuChannel *chan)
> >  if (msg->id > MAX_REMOTE_DEVICES) {
> >  error_setg(&err, "id of the device is larger than max number of "\
> >   "devices per remote process.");
> > +}
> 
> Was goto finalize_loop accidentally dropped?
Yes, thank you.

Elena





Re: [PATCH v5 50/50] multi-process: add configure and usage information

2020-02-28 Thread Elena Ufimtseva
On Thu, Feb 27, 2020 at 04:58:04PM +, Stefan Hajnoczi wrote:
> On Mon, Feb 24, 2020 at 03:55:41PM -0500, Jagannathan Raman wrote:
> > From: Elena Ufimtseva 
> > 
> > Signed-off-by: Elena Ufimtseva 
> > Signed-off-by: Jagannathan Raman 
> > Signed-off-by: John G Johnson 
> > ---
> >  docs/qemu-multiprocess.txt | 86 
> > ++
> >  1 file changed, 86 insertions(+)
> >  create mode 100644 docs/qemu-multiprocess.txt
> > 
> > diff --git a/docs/qemu-multiprocess.txt b/docs/qemu-multiprocess.txt
> > new file mode 100644
> > index 000..f156177
> > --- /dev/null
> > +++ b/docs/qemu-multiprocess.txt
> > @@ -0,0 +1,86 @@
> > +Multi-process QEMU
> > +==
> > +
> > +This document describes how to configure and use multi-process qemu.
> > +For the design document refer to docs/devel/qemu-multiprocess.
> > +
> > +1) Configuration
> > +
> > +
> > +To enable support for multi-process add --enable-mpqemu
> > +to the list of options for the "configure" script.
> > +
> > +
> > +2) Usage
> > +
> > +
> > +To start qemu with devices intended to run in a separate emulation
> > +process without libvirtd support, the following should be used on QEMU
> > +command line. As of now, we only support the emulation of lsi53c895a
> > +in a separate process
> > +
> > +* Since parts of the RAM are shared between QEMU & remote process, a
> > +  memory-backend-file is required to facilitate this, as follows:
> > +
> > +  -object memory-backend-file,id=mem,mem-path=/dev/shm/,size=4096M,share=on
> 
> memory-backend-memfd is more convenient.  It doesn't require a mem-path
> and share=on is the default.
>

Will change this. 
> > +
> > +* The devices to be emulated in the separate process are defined as
> > +  before with addition of "rid" suboption that serves as a remote group
> > +  identificator.
> > +
> > +  -device ,rid="remote process id"
> > +
> > +  For example, for non multi-process qemu:
> > +-device lsi53c895a,id=scsi0 device
> > +-device scsi-hd,drive=drive0,bus=scsi0.0,scsi-id=0
> > +-drive id=drive0,file=data-disk.img
> > +
> > +  and for multi-process qemu and no libvirt
> > +  support (i.e. QEMU forks child processes):
> > +-device lsi53c895a,id=scsi0,rid=0
> > +-device scsi-hd,drive=drive0,bus=scsi0.0,scsi-id=0,rid=0
> 
> This approach is invasive:
>  * lsi53c895a should not need to be modified with a new rid= option.
>  * QEMU should not know about the scsi-hd device or drive0.  Only the
>device emulation process needs to know about scsi-hd.
> 
> In order to cleanly separate QEMU and the device emulation process
> syntax like this is needed:
> 
>   -object remote-device,id=rid0,...
>   -device remote-pci-device,id=scsi0,remote-device=rid0
> 
> The "remote-device" object could be part of remote-pci-device, but
> keeping it separate may be useful in the future in order to support
> things like reconnection.
> 
> The generic "remote-pci-device" device handles any remote PCI device,
> not just the LSI SCSI controller.
> 
> Do you agree with this approach?
> 

We discussed these changes and they seem to be along the lines with
the future work on vfio over socket approach we will be working on later.

Could we for this experimental version have the changes you propose here
with one modification - instead of having generic remote-pci-device imply that 
that is LSI
device? And while we work towards vfio over socket this will become any remote
PCI device?

> > +* The command-line options for the remote process are added to the 
> > "command"
> > +  suboption of the newly added "-remote" option. 
> > +
> > +   -remote [socket],rid=0,exec="...",command="..."
> 
> QEMU has been using the -object TYPE syntax instead of adding new -TYPE
> command-line options.  This gives you object_add hotplug for free, for
> example.  I suggest using -object remote-device,id=,exec=,command=,
> instead of -remote.
> 

We will add these changes.
> > +
> > +  The drives to be emulated by the remote process are specified as part of
> > +  this command sub-option. The device to be used to connect to the monitor
> > +  is also specified as part of this suboption.
> > +
> > +  For example, the following option adds a drive and monitor to the remote
> > +  process:
> > +  -remote rid=0,exec="qemu-scsi-dev",command="-drive 
> 

Re: [PATCH v5 49/50] multi-process: add the concept description to docs/devel/qemu-multiprocess

2020-02-28 Thread Elena Ufimtseva
On Thu, Feb 27, 2020 at 05:11:40PM +, Stefan Hajnoczi wrote:
> On Mon, Feb 24, 2020 at 03:55:40PM -0500, Jagannathan Raman wrote:
> > From: John G Johnson 
> > 
> > Signed-off-by: John G Johnson 
> > Signed-off-by: Elena Ufimtseva 
> > Signed-off-by: Jagannathan Raman 
> > ---
> >  docs/devel/index.rst |1 +
> >  docs/devel/qemu-multiprocess.rst | 1102 
> > ++
> >  2 files changed, 1103 insertions(+)
> >  create mode 100644 docs/devel/qemu-multiprocess.rst
> > 
> > diff --git a/docs/devel/index.rst b/docs/devel/index.rst
> > index 4dc2ca8..1a95871 100644
> > --- a/docs/devel/index.rst
> > +++ b/docs/devel/index.rst
> > @@ -25,3 +25,4 @@ Contents:
> > tcg-plugins
> > bitops
> > reset
> > +   multi-process
> > diff --git a/docs/devel/qemu-multiprocess.rst 
> > b/docs/devel/qemu-multiprocess.rst
> > new file mode 100644
> > index 000..477e246
> > --- /dev/null
> > +++ b/docs/devel/qemu-multiprocess.rst
> > @@ -0,0 +1,1102 @@
> > +Disaggregating QEMU
> 
> Please revise this document and the patch series to use consistent
> terminology.  At least "qemu-multiprocess.rst", "--enable-mpqemu", and
> "disaggregated QEMU" are used to describe this feature (there are
> probably more, I have only looked at 2 patches so far).
> 
> It's confusing for someone who stumbles across one of these terms and
> then has to figure out that we're talking about the same thing when
> encountering other terms later on.
> 
> Please use a single name and use it consistently.
>


Thanks Stefan, will work on this. 
> > +===
> > +
> > +QEMU is often used as the hypervisor for virtual machines running in the
> > +Oracle cloud. Since one of the advantages of cloud computing is the
> > +ability to run many VMs from different tenants in the same cloud
> > +infrastructure, a guest that compromised its hypervisor could
> > +potentially use the hypervisor's access privileges to access data it is
> > +not authorized for.
> > +
> > +QEMU can be susceptible to security attack because it is a large,
> 
> s/attack/attacks/
> 
> > +monolithic program that provides many features to the VMs it services.
> > +Many of these feature can be configured out of QEMU, but even a reduced
> 
> s/feature/features/





Re: [PATCH v5 14/50] mutli-process: build remote command line args

2020-03-02 Thread Elena Ufimtseva
On Mon, Mar 02, 2020 at 05:47:45PM +, Daniel P. Berrangé wrote:
> On Mon, Mar 02, 2020 at 06:36:13PM +0100, Philippe Mathieu-Daudé wrote:
> > typo "multi" in patch subject.
> >
Thank Philippe, will fix.
 
> > On 2/24/20 9:55 PM, Jagannathan Raman wrote:
> > > From: Elena Ufimtseva 
> > > 
> > > Signed-off-by: Elena Ufimtseva 
> > > Signed-off-by: Jagannathan Raman 
> > > Signed-off-by: John G Johnson 
> > > ---
> > >   v4 -> v5:
> > >- Added "exec" suboption to get the executable's name
> > >- Addressed feedback about variable names
> > >- Removed redundant check for spawning a process
> > > 
> > >   hw/proxy/qemu-proxy.c | 68 
> > > +--
> > >   include/hw/proxy/qemu-proxy.h |  2 +-
> > >   2 files changed, 54 insertions(+), 16 deletions(-)
> > > 
> > > diff --git a/hw/proxy/qemu-proxy.c b/hw/proxy/qemu-proxy.c
> > > index 828bbd7..d792e86 100644
> > > --- a/hw/proxy/qemu-proxy.c
> > > +++ b/hw/proxy/qemu-proxy.c
> > > @@ -19,19 +19,50 @@
> > >   static void pci_proxy_dev_realize(PCIDevice *dev, Error **errp);
> > > +static int add_argv(char *opts_str, char **argv, int argc)
> > > +{
> > > +int max_args = 64;
> > > +
> > > +if (argc < max_args - 1) {
> > > +argv[argc++] = opts_str;
> > > +argv[argc] = 0;
> > > +} else {
> > > +return 0;
> > > +}
> > > +
> > > +return argc;
> > > +}
> > > +
> > > +static int make_argv(char *opts_str, char **argv, int argc)
> > > +{
> > > +int max_args = 64;
> > > +
> > > +char *p2 = strtok(opts_str, " ");
> > > +while (p2 && argc < max_args - 1) {
> > > +argv[argc++] = p2;
> > > +p2 = strtok(0, " ");
> > > +}
> > > +argv[argc] = 0;
> > 
> > Is there a GLib function to do that?
>

Hi Daniel

> g_shell_parse_argv() perhaps
>

Thanks for the suggestion.

>   https://developer.gnome.org/glib/stable/glib-Shell-related-Utilities.html
> 
> 
> Though my preference would be to avoid the need to do this at all, by
> not accepting a raw shell command line string in the first place.
>
Can you please clarify? Did you mean that it would be better if Qemu somehow
verifies the options and then passes it to a remote process via a message?

Thanks!

Elena
> 
> Regards,
> Daniel
> -- 
> |: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org -o-https://fstop138.berrange.com :|
> |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
> 



Re: [PATCH v5 14/50] mutli-process: build remote command line args

2020-03-04 Thread Elena Ufimtseva
On Wed, Mar 04, 2020 at 11:00:32AM +, Daniel P. Berrangé wrote:
> On Mon, Mar 02, 2020 at 02:39:37PM -0800, Elena Ufimtseva wrote:
> > On Mon, Mar 02, 2020 at 05:47:45PM +, Daniel P. Berrangé wrote:
> > > On Mon, Mar 02, 2020 at 06:36:13PM +0100, Philippe Mathieu-Daudé wrote:
> > > > typo "multi" in patch subject.
> > > >
> > Thank Philippe, will fix.
> >  
> > > > On 2/24/20 9:55 PM, Jagannathan Raman wrote:
> > > > > From: Elena Ufimtseva 
> > > > > 
> > > > > Signed-off-by: Elena Ufimtseva 
> > > > > Signed-off-by: Jagannathan Raman 
> > > > > Signed-off-by: John G Johnson 
> > > > > ---
> > > > >   v4 -> v5:
> > > > >- Added "exec" suboption to get the executable's name
> > > > >- Addressed feedback about variable names
> > > > >- Removed redundant check for spawning a process
> > > > > 
> > > > >   hw/proxy/qemu-proxy.c | 68 
> > > > > +--
> > > > >   include/hw/proxy/qemu-proxy.h |  2 +-
> > > > >   2 files changed, 54 insertions(+), 16 deletions(-)
> > > > > 
> > > > > diff --git a/hw/proxy/qemu-proxy.c b/hw/proxy/qemu-proxy.c
> > > > > index 828bbd7..d792e86 100644
> > > > > --- a/hw/proxy/qemu-proxy.c
> > > > > +++ b/hw/proxy/qemu-proxy.c
> > > > > @@ -19,19 +19,50 @@
> > > > >   static void pci_proxy_dev_realize(PCIDevice *dev, Error **errp);
> > > > > +static int add_argv(char *opts_str, char **argv, int argc)
> > > > > +{
> > > > > +int max_args = 64;
> > > > > +
> > > > > +if (argc < max_args - 1) {
> > > > > +argv[argc++] = opts_str;
> > > > > +argv[argc] = 0;
> > > > > +} else {
> > > > > +return 0;
> > > > > +}
> > > > > +
> > > > > +return argc;
> > > > > +}
> > > > > +
> > > > > +static int make_argv(char *opts_str, char **argv, int argc)
> > > > > +{
> > > > > +int max_args = 64;
> > > > > +
> > > > > +char *p2 = strtok(opts_str, " ");
> > > > > +while (p2 && argc < max_args - 1) {
> > > > > +argv[argc++] = p2;
> > > > > +p2 = strtok(0, " ");
> > > > > +}
> > > > > +argv[argc] = 0;
> > > > 
> > > > Is there a GLib function to do that?
> > >
> > 
> > Hi Daniel
> > 
> > > g_shell_parse_argv() perhaps
> > >
> > 
> > Thanks for the suggestion.
> > 
> > >   
> > > https://developer.gnome.org/glib/stable/glib-Shell-related-Utilities.html
> > > 
> > > 
> > > Though my preference would be to avoid the need to do this at all, by
> > > not accepting a raw shell command line string in the first place.
> > >
> > Can you please clarify? Did you mean that it would be better if Qemu somehow
> > verifies the options and then passes it to a remote process via a message?
> 
> I've not been able to trace the code paths back all the way, so I can't
> point to where I think needs fixing. I assuming that something, somewhere
> in this patch series should starts out with a binary name and a list of argv
> as an array of char *. ie a "char **argv".  At some point this array gets
> mashed together into a single 'char *' string where all the argv are separated
> by a space. This patch now tries to parse this and turn it back into a
> "char **argv" array.
> 
> So my key point is that we should try hard to avoid this intermediate
> shell command line string stage entirely. Always keep the argv in an array
> form, and never mash them together such that they then need parsing again.
>
Hi Daniel

Thank you for explanation.
At this point there is no intermediate stage as we grab the arguments
as a raw string from the command line option -remote:

-remote rid=8,exec=qemu-scsi-dev,command="-drive 
id=drive_image2,,file=/root/remote-process-disk.img"

So the command="" string is being later parsed into the array and remote process
gets spawned with the "char **argv".

Stefan expressed his concern that its not convenient to use due to
the double escaping commas, spaces, quotes and we do agree with that.
We were seeking an advice on what is the better approach.

Few things we discussed internally is to have the remote drive
command line options passed over by messages or using QMP.

Thank you!
Elena


> I understand this is probably more complex, because we're having to pass
> this across processes, via QemuOpts IIUC, but I still believe it is important
> to have this data kept in array format if at all practical.
> 
> Regards,
> Daniel
> -- 
> |: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org -o-https://fstop138.berrange.com :|
> |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
> 



Re: [PATCH v5 14/50] mutli-process: build remote command line args

2020-03-04 Thread Elena Ufimtseva
On Wed, Mar 04, 2020 at 04:33:57PM +, Daniel P. Berrangé wrote:
> On Wed, Mar 04, 2020 at 08:25:34AM -0800, Elena Ufimtseva wrote:
> > On Wed, Mar 04, 2020 at 11:00:32AM +, Daniel P. Berrangé wrote:
> > > On Mon, Mar 02, 2020 at 02:39:37PM -0800, Elena Ufimtseva wrote:
> > > > On Mon, Mar 02, 2020 at 05:47:45PM +, Daniel P. Berrangé wrote:
> > > > > On Mon, Mar 02, 2020 at 06:36:13PM +0100, Philippe Mathieu-Daudé 
> > > > > wrote:
> > > > > > typo "multi" in patch subject.
> > > > > >
> > > > Thank Philippe, will fix.
> > > >  
> > > > > > On 2/24/20 9:55 PM, Jagannathan Raman wrote:
> > > > > > > From: Elena Ufimtseva 
> > > > > > > 
> > > > > > > Signed-off-by: Elena Ufimtseva 
> > > > > > > Signed-off-by: Jagannathan Raman 
> > > > > > > Signed-off-by: John G Johnson 
> > > > > > > ---
> > > > > > >   v4 -> v5:
> > > > > > >- Added "exec" suboption to get the executable's name
> > > > > > >- Addressed feedback about variable names
> > > > > > >- Removed redundant check for spawning a process
> > > > > > > 
> > > > > > >   hw/proxy/qemu-proxy.c | 68 
> > > > > > > +--
> > > > > > >   include/hw/proxy/qemu-proxy.h |  2 +-
> > > > > > >   2 files changed, 54 insertions(+), 16 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/hw/proxy/qemu-proxy.c b/hw/proxy/qemu-proxy.c
> > > > > > > index 828bbd7..d792e86 100644
> > > > > > > --- a/hw/proxy/qemu-proxy.c
> > > > > > > +++ b/hw/proxy/qemu-proxy.c
> > > > > > > @@ -19,19 +19,50 @@
> > > > > > >   static void pci_proxy_dev_realize(PCIDevice *dev, Error **errp);
> > > > > > > +static int add_argv(char *opts_str, char **argv, int argc)
> > > > > > > +{
> > > > > > > +int max_args = 64;
> > > > > > > +
> > > > > > > +if (argc < max_args - 1) {
> > > > > > > +argv[argc++] = opts_str;
> > > > > > > +argv[argc] = 0;
> > > > > > > +} else {
> > > > > > > +return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +return argc;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int make_argv(char *opts_str, char **argv, int argc)
> > > > > > > +{
> > > > > > > +int max_args = 64;
> > > > > > > +
> > > > > > > +char *p2 = strtok(opts_str, " ");
> > > > > > > +while (p2 && argc < max_args - 1) {
> > > > > > > +argv[argc++] = p2;
> > > > > > > +p2 = strtok(0, " ");
> > > > > > > +}
> > > > > > > +argv[argc] = 0;
> > > > > > 
> > > > > > Is there a GLib function to do that?
> > > > >
> > > > 
> > > > Hi Daniel
> > > > 
> > > > > g_shell_parse_argv() perhaps
> > > > >
> > > > 
> > > > Thanks for the suggestion.
> > > > 
> > > > >   
> > > > > https://developer.gnome.org/glib/stable/glib-Shell-related-Utilities.html
> > > > > 
> > > > > 
> > > > > Though my preference would be to avoid the need to do this at all, by
> > > > > not accepting a raw shell command line string in the first place.
> > > > >
> > > > Can you please clarify? Did you mean that it would be better if Qemu 
> > > > somehow
> > > > verifies the options and then passes it to a remote process via a 
> > > > message?
> > > 
> > > I've not been able to trace the code paths back all the way, so I can't
> > > point to where I think needs fixing. I assuming that something, somewhere
> > > in this patch series should starts out with a binary name and a list of 
> > > argv
> > > as an array of char *. ie a "char **argv".  At some poin

Re: [PATCH v5 42/50] multi-process/mig: Send VMSD of remote to the Proxy object

2020-03-05 Thread Elena Ufimtseva
On Thu, Mar 05, 2020 at 02:39:49PM +, Dr. David Alan Gilbert wrote:
> * Jagannathan Raman (jag.ra...@oracle.com) wrote:
> > The remote process sends the VMSD to the Proxy object, on the source
> > side
> > 
> > Signed-off-by: Elena Ufimtseva 
> > Signed-off-by: John G Johnson 
> > Signed-off-by: Jagannathan Raman 
> > ---
> >  migration/savevm.c   | 27 +++
> >  migration/savevm.h   |  2 ++
> >  remote/remote-main.c | 43 +++
> >  3 files changed, 72 insertions(+)
> > 
> > diff --git a/migration/savevm.c b/migration/savevm.c
> > index 1d4220e..09af14d 100644
> > --- a/migration/savevm.c
> > +++ b/migration/savevm.c
> > @@ -2942,3 +2942,30 @@ bool vmstate_check_only_migratable(const 
> > VMStateDescription *vmsd)
> >  
> >  return !(vmsd && vmsd->unmigratable);
> >  }
> > +
> 
> Can we add something here commenting, e.g.
> /* Called by the remote process to serialise migration back to the qemu
>  * */

Will add this.
> > +int qemu_remote_savevm(QEMUFile *f)
> > +{
> > +SaveStateEntry *se;
> > +int ret;
> > +
> > +QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> > +if (!se->vmsd || !vmstate_save_needed(se->vmsd, se->opaque)) {
> > +continue;
> > +}
> > +
> > +save_section_header(f, se, QEMU_VM_SECTION_FULL);
> > +
> > +ret = vmstate_save(f, se, NULL);
> > +if (ret) {
> > +qemu_file_set_error(f, ret);
> > +return ret;
> > +}
> > +
> > +save_section_footer(f, se);
> > +}
> > +
> > +qemu_put_byte(f, QEMU_VM_EOF);
> > +qemu_fflush(f);
> 
> You have a qemu_fflush in process_start_mig_out  just after you call it
> - so you don't need both; I suggest you remove this one.
>
Ok. 
> > +return 0;
> 
> And make this return qemu_file_get_error(f);  just like
> qemu_save_device_state and then makybe some day we can merge them.
>
Will do.
> > +}
> 
> 
> > diff --git a/migration/savevm.h b/migration/savevm.h
> > index ba64a7e..0491d3a 100644
> > --- a/migration/savevm.h
> > +++ b/migration/savevm.h
> > @@ -65,4 +65,6 @@ void qemu_loadvm_state_cleanup(void);
> >  int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
> >  int qemu_load_device_state(QEMUFile *f);
> >  
> > +int qemu_remote_savevm(QEMUFile *f);
> > +
> >  #endif
> > diff --git a/remote/remote-main.c b/remote/remote-main.c
> > index 58d9905..e97eb76 100644
> > --- a/remote/remote-main.c
> > +++ b/remote/remote-main.c
> > @@ -53,6 +53,16 @@
> >  #include "qemu/log.h"
> >  #include "qemu/cutils.h"
> >  #include "remote-opts.h"
> > +#include "qapi/error.h"
> > +#include "io/channel-util.h"
> > +
> > +#include "io/channel.h"
> > +#include "io/channel-socket.h"
> > +#include "migration/qemu-file-types.h"
> > +#include "migration/savevm.h"
> > +#include "migration/qemu-file-channel.h"
> > +#include "migration/qemu-file.h"
> > +
> >  #include "monitor/monitor.h"
> >  #include "chardev/char.h"
> >  #include "sysemu/reset.h"
> > @@ -322,6 +332,36 @@ static int setup_device(MPQemuMsg *msg, Error **errp)
> >  
> >  }
> >  
> > +static void process_start_mig_out(MPQemuMsg *msg)
> > +{
> > +int wait = msg->fds[1];
> > +Error *err = NULL;
> > +QIOChannel *ioc;
> > +QEMUFile *f;
> > +
> > +ioc = qio_channel_new_fd(msg->fds[0], &err);
> > +if (err) {
> > +error_report_err(err);
> > +return;
> > +}
> > +
> > +qio_channel_set_name(QIO_CHANNEL(ioc), "remote-migration-channel");
> > +
> > +f = qemu_fopen_channel_output(ioc);
> > +
> > +bdrv_drain_all();
> > +(void)bdrv_flush_all();
> 
> Do remote process always have block code? I mean can't we have a remote
> process that's just say a NIC ?

Not always (in the future), we will account for this.

> 
> > +(void)qemu_remote_savevm(f);
> 
> It's probably bad to ignore errors here; what you could do is if there's
> an error, you shoul dprint something and then send a poison value back
> to the QEMU to let it know that you've failed.
> 

Yes, will add this.

> > +qemu_fflush(f);
> > +
> > +notify_proxy(wait, (uint64_t)qemu_ftell(f));
> > +PUT_REMOTE_WAIT(wait);
> > +
> > +qemu_fclose(f);
> > +}
> > +
> >  static void process_msg(GIOCondition cond, MPQemuChannel *chan)
> >  {
> >  MPQemuMsg *msg = NULL;
> > @@ -411,6 +451,9 @@ static void process_msg(GIOCondition cond, 
> > MPQemuChannel *chan)
> >  notify_proxy(msg->fds[0], 0);
> >  }
> >  break;
> > +case START_MIG_OUT:
> > +process_start_mig_out(msg);
> > +break;
> >  default:
> >  error_setg(&err, "Unknown command");
> >  goto finalize_loop;
> > -- 
> > 1.8.3.1
> 
> --
> Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
> 



Re: [PATCH v5 40/50] multi-process/mig: build migration module in the remote process

2020-03-05 Thread Elena Ufimtseva
On Thu, Mar 05, 2020 at 10:10:59AM +, Dr. David Alan Gilbert wrote:
> * Jag Raman (jag.ra...@oracle.com) wrote:
> > 
> > 
> > On 3/4/2020 2:52 PM, Dr. David Alan Gilbert wrote:
> > > * Jag Raman (jag.ra...@oracle.com) wrote:
> > > > 
> > > > 
> > > > On 3/4/2020 10:58 AM, Dr. David Alan Gilbert wrote:
> > > > > * Jagannathan Raman (jag.ra...@oracle.com) wrote:
> > > > > > Add Makefile support to enable migration in remote process
> > > > > > 
> > > > > > Signed-off-by: Elena Ufimtseva 
> > > > > > Signed-off-by: John G Johnson 
> > > > > > Signed-off-by: Jagannathan Raman 
> > > > > > ---
> > > > > >Makefile.objs   |  4 +++-
> > > > > >Makefile.target |  1 +
> > > > > >migration/Makefile.objs | 13 -
> > > > > >net/Makefile.objs   |  2 ++
> > > > > >softmmu/vl.c|  2 --
> > > > > >stubs/migration.c   | 49 
> > > > > > +
> > > > > >stubs/net-stub.c| 21 +
> > > > > >stubs/qapi-misc.c   |  2 ++
> > > > > >stubs/replay.c  |  8 
> > > > > >stubs/vl-stub.c | 24 
> > > > > >vl-parse.c  |  3 +++
> > > > > >11 files changed, 125 insertions(+), 4 deletions(-)
> > > > > > 
> > > > > > diff --git a/Makefile.objs b/Makefile.objs
> > > > > > index 4b5db09..65009da 100644
> > > > > > --- a/Makefile.objs
> > > > > > +++ b/Makefile.objs
> > > > > > @@ -74,6 +74,8 @@ common-obj-y += qdev-monitor.o device-hotplug.o
> > > > > >common-obj-$(CONFIG_WIN32) += os-win32.o
> > > > > >common-obj-$(CONFIG_POSIX) += os-posix.o
> > > > > > +remote-pci-obj-$(CONFIG_POSIX) += os-posix.o
> > > > > > +
> > > > > >common-obj-$(CONFIG_LINUX) += fsdev/
> > > > > >common-obj-y += accel/
> > > > > > @@ -104,11 +106,11 @@ common-obj-y += vl-parse.o
> > > > > >
> > > > > > ###
> > > > > ># qapi
> > > > > > -
> > > > > >common-obj-y += qapi/
> > > > > >endif # CONFIG_SOFTMMU
> > > > > > +remote-pci-obj-$(CONFIG_MPQEMU) += net/
> > > > > >remote-pci-obj-$(CONFIG_MPQEMU) += qapi/
> > > > > >remote-pci-obj-$(CONFIG_MPQEMU) += blockdev-nbd.o
> > > > > >remote-pci-obj-$(CONFIG_MPQEMU) += job-qmp.o
> > > > > > diff --git a/Makefile.target b/Makefile.target
> > > > > > index 4ead5c3..4012ae5 100644
> > > > > > --- a/Makefile.target
> > > > > > +++ b/Makefile.target
> > > > > > @@ -240,6 +240,7 @@ all-remote-pci-obj-y += exec.o
> > > > > >all-remote-pci-obj-y += exec-vary.o
> > > > > >all-remote-pci-obj-y += ioport.o
> > > > > >all-remote-pci-obj-y += cpus.o
> > > > > > +all-remote-pci-obj-y += migration/ram.o
> > > > > >endif
> > > > > >remote-pci-obj-y :=
> > > > > > diff --git a/migration/Makefile.objs b/migration/Makefile.objs
> > > > > > index e7cdc76..21f9d8d 100644
> > > > > > --- a/migration/Makefile.objs
> > > > > > +++ b/migration/Makefile.objs
> > > > > > @@ -15,4 +15,15 @@ common-obj-$(CONFIG_LIVE_BLOCK_MIGRATION) += 
> > > > > > block.o
> > > > > >rdma.o-libs := $(RDMA_LIBS)
> > > > > > -remote-pci-obj-$(CONFIG_MPQEMU) += qemu-file.o vmstate.o qjson.o 
> > > > > > vmstate-types.o
> > > > > > +remote-pci-obj-$(CONFIG_MPQEMU) += migration.o socket.o fd.o exec.o
> > > > > > +remote-pci-obj-$(CONFIG_MPQEMU) += tls.o channel.o savevm.o
> > > > > > +remote-pci-obj-$(CONFIG_MPQEMU) += colo.o colo-failover.o
> > > > > > +remote-pci-obj-$(CONFIG_MPQEMU) += vmstate.o vmstate-types.o 
> > > > > > page_cache.o
> > > > > > +remote-pci-obj-$(CONFIG_MPQEMU) += qemu-file.o global_state.o
> > > &g

Re: [PATCH v8 17/20] multi-process: heartbeat messages to remote

2020-08-15 Thread Elena Ufimtseva
On Tue, Aug 11, 2020 at 03:41:30PM +0100, Stefan Hajnoczi wrote:
> On Fri, Jul 31, 2020 at 02:20:24PM -0400, Jagannathan Raman wrote:
> > @@ -343,3 +349,49 @@ static void probe_pci_info(PCIDevice *dev, Error 
> > **errp)
> >  }
> >  }
> >  }
> > +
> > +static void hb_msg(PCIProxyDev *dev)
> > +{
> > +DeviceState *ds = DEVICE(dev);
> > +Error *local_err = NULL;
> > +MPQemuMsg msg = { 0 };
> > +
> > +msg.cmd = PROXY_PING;
> > +msg.bytestream = 0;
> > +msg.size = 0;
> > +
> > +(void)mpqemu_msg_send_and_await_reply(&msg, dev->ioc, &local_err);
> > +if (local_err) {
> > +error_report_err(local_err);
> > +qio_channel_close(dev->ioc, &local_err);
> > +error_setg(&error_fatal, "Lost contact with device %s", ds->id);
> > +}
> > +}
> 
> Here is my feedback from the last revision. Was this addressed?
>

Hi Stefan,

Thank you for reviewing the patchset. In this version we decided to 
shutdown the guest when the heartbeat did not get a reply from the
remote by setting the error_fatal.
Should we approach it differently or you prefer us to get rid of the
heartbeat in this form?

Thank you,
Elena

>   This patch seems incomplete since no action is taken when the device
>   fails to respond. vCPU threads that access the device will still get
>   stuck.
> 
>   The simplest way to make this useful is to close the connection when a
>   timeout occurs. Then the G_IO_HUP handler for the UNIX domain socket
>   should perform connection cleanup. At that point there are a few
>   choices:
> 
>   1. Stop guest execution and wait for the host admin to restore the
>  mplink so execution can resume. This is similar to how -drive
>  rerror=stop pauses the guest when a disk I/O error is encountered.
> 
>   2. Stop guest execution but defer it until this stale device is actually
>  accessed. This maximizes guest uptime. Guests that rarely access the
>  device may not notice at all.
> 
>   3. Return 0 from MemoryRegion read operations and ignore writes. The
>  guest continues executing but the device is broken. This is risky
>  because device drivers inside the guest may not be ready to deal with
>  this. The result could be data loss or corruption.
> 
>   4. Raise a bus-level event. Maybe PCI error reporting can be used to
>  offline the device.
> 
>   5. Terminate the guest with an error message.
> 
>   6. ?
> 
>   Until the heartbeat is fully implemented and tested I suggest dropping
>   it from this patch series. Remember the G_IO_HUP will happen anyway if
>   the remote device process terminates.





Re: [PATCH v5 07/50] multi-process: define mpqemu-link object

2020-03-10 Thread Elena Ufimtseva
On Tue, Mar 10, 2020 at 04:09:41PM +, Stefan Hajnoczi wrote:
> On Mon, Feb 24, 2020 at 03:54:58PM -0500, Jagannathan Raman wrote:
> > +/*
> > + * TODO: Dont use mpqemu link object since it is
> > + * not needed to be created via -object.
> > + */
> 
> Please investigate and resolve this TODO.
>
Thank you Stefan for reviewing more patches.
This particular TODO have to be removed and I am guessing
followed us from the earlier code.
 
> > +struct conf_data_msg {
> > +uint32_t addr;
> > +uint32_t val;
> > +int l;
> 
> Please use a self-explanatory field name.  I'm not sure what 'l' is.
> 
> conf_data_msg is not used in this patch.  Please introduce things when
> they are needed to make the patch series easier to review in a linear
> fashion.

Will do.
> 
> > +/*
> > + * TODO: make all communications asynchronous and run in the main
> > + * loop or existing IOThread.
> > + */
> 
> Please investigate and decide how to resolve this TODO.
> 
> > +void mpqemu_msg_send(MPQemuMsg *msg, MPQemuChannel *chan)
> > +{
> > +int rc;
> > +uint8_t *data;
> > +union {
> > +char control[CMSG_SPACE(REMOTE_MAX_FDS * sizeof(int))];
> > +struct cmsghdr align;
> > +} u;
> > +struct msghdr hdr;
> > +struct cmsghdr *chdr;
> > +int sock = chan->sock;
> > +QemuMutex *lock = &chan->send_lock;
> > +
> > +struct iovec iov = {
> > +.iov_base = (char *) msg,
> > +.iov_len = MPQEMU_MSG_HDR_SIZE,
> > +};
> > +
> > +memset(&hdr, 0, sizeof(hdr));
> > +memset(&u, 0, sizeof(u));
> > +
> > +hdr.msg_iov = &iov;
> > +hdr.msg_iovlen = 1;
> > +
> > +if (msg->num_fds > REMOTE_MAX_FDS) {
> > +qemu_log_mask(LOG_REMOTE_DEBUG, "%s: Max FDs exceeded\n", 
> > __func__);
> > +return;
> > +}
> > +
> > +if (msg->num_fds > 0) {
> > +size_t fdsize = msg->num_fds * sizeof(int);
> > +
> > +hdr.msg_control = &u;
> > +hdr.msg_controllen = sizeof(u);
> > +
> > +chdr = CMSG_FIRSTHDR(&hdr);
> > +chdr->cmsg_len = CMSG_LEN(fdsize);
> > +chdr->cmsg_level = SOL_SOCKET;
> > +chdr->cmsg_type = SCM_RIGHTS;
> > +memcpy(CMSG_DATA(chdr), msg->fds, fdsize);
> > +hdr.msg_controllen = CMSG_SPACE(fdsize);
> > +}
> > +
> > +qemu_mutex_lock(lock);
> > +
> > +do {
> > +rc = sendmsg(sock, &hdr, 0);
> > +} while (rc < 0 && (errno == EINTR || errno == EAGAIN));
> > +
> > +if (rc < 0) {
> > +qemu_log_mask(LOG_REMOTE_DEBUG, "%s - sendmsg rc is %d, errno is 
> > %d,"
> > +  " sock %d\n", __func__, rc, errno, sock);
> > +qemu_mutex_unlock(lock);
> > +return;
> > +}
> > +
> > +if (msg->bytestream) {
> > +data = msg->data2;
> > +} else {
> > +data = (uint8_t *)msg + MPQEMU_MSG_HDR_SIZE;
> > +}
> > +
> > +do {
> > +rc = write(sock, data, msg->size);
> > +} while (rc < 0 && (errno == EINTR || errno == EAGAIN));
> > +
> > +qemu_mutex_unlock(lock);
> 
> Can this lock be avoided by using a single sendmsg(2) syscall instead of
> sendmsg() + write()?  I feel deja vu here, like I maybe have raised this
> in a previous revision of this patch series.
> 

Indeed, you did mention this. Sorry, it got forgotten.
It seems to be possible, we will investigate further and include in the
next version.

> > +msg->num_fds = 0;
> > +for (chdr = CMSG_FIRSTHDR(&hdr); chdr != NULL;
> > + chdr = CMSG_NXTHDR(&hdr, chdr)) {
> > +if ((chdr->cmsg_level == SOL_SOCKET) &&
> > +(chdr->cmsg_type == SCM_RIGHTS)) {
> > +fdsize = chdr->cmsg_len - CMSG_LEN(0);
> > +msg->num_fds = fdsize / sizeof(int);
> > +if (msg->num_fds > REMOTE_MAX_FDS) {
> > +/*
> > + * TODO: Security issue detected. Sender never sends more
> > + * than REMOTE_MAX_FDS. This condition should be signaled 
> > to
> > + * the admin
> > + */
> 
> This TODO doesn't seem actionable.  The error is already handled.
> 
> > +qemu_log_mask(LOG_REMOTE_DEBUG,
> > +  "%s: Max FDs exceeded\n", __func__);
> > +return -ERANGE;
> 
> The mutex must be released.

Thank you! Will fix this and above.


Elena



[PATCH RESEND v6 00/36] Initial support for multi-process qemu

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

Hello

This is a resend of v6 patchset since we regrettably omitted few comments
from v5 review in the previously sent series 
(see in https://lists.gnu.org/archive/html/qemu-devel/2020-04/msg00828.html).
We also run more tests and fixed the build errors that were found in v6.

Started with the presentation in October 2017 made by Marc-Andre (Red Hat)
and Konrad Wilk (Oracle) [1], and continued by Jag's BoF at KVM Forum 2018,
the multi-process project is now available and presented in this patchset.
This first series enables the emulation of lsi53c895a in a separate process.

We posted the Proof Of Concept patches [2] before the BoF session in 2018.
Subsequently, we posted RFC v1 [3], RFC v2 [4], RFC v3 [5], RFC v4 [6]
and v5 [7] of the patch series.

This is v6 of the patch series and it addresses the previous feedback from
the community.
To make easier to review of the series, we have separated out some of the
patches and will send them in the separate series. As per conversation we
had during the last community call, the live migration support is taken out
from this series as well as asynchronous communication.
The changes include the elimination of fork/exec of the remote process
and instead using the orchestrator which is implemented in this series as
a python script.

Following people contributed to this patchset:

John G Johnson 
Jagannathan Raman 
Elena Ufimtseva 
Kanth Ghatraju 
Konrad Wilk 

For full concept writeup about QEMU disaggregation, refer to
docs/devel/qemu-multiprocess.rst. Please refer to
docs/qemu-multiprocess.txt for usage information.

We will post separate patchsets for the following improvements for
the experimental Qemu multi-process:
 - Live migration;
 - Asynchronous communication channel;
 - Libvirt support;

We welcome all your ideas, concerns, and questions for this patchset.

Testing results

There is an error in travis-ci build test which does not get reproduced.

 TESTiotest-qcow2: 041 [fail]
QEMU  -- 
"/home/travis/build/elena-ufimtseva/qemu-multiprocess/out-of-tree/build/dir/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64"
 -nodefaults -display none -accel qtest
QEMU_IMG  -- 
"/home/travis/build/elena-ufimtseva/qemu-multiprocess/out-of-tree/build/dir/tests/qemu-iotests/../../qemu-img"
 
QEMU_IO   -- 
"/home/travis/build/elena-ufimtseva/qemu-multiprocess/out-of-tree/build/dir/tests/qemu-iotests/../../qemu-io"
  --cache writeback --aio threads -f qcow2
QEMU_NBD  -- 
"/home/travis/build/elena-ufimtseva/qemu-multiprocess/out-of-tree/build/dir/tests/qemu-iotests/../../qemu-nbd"
 
IMGFMT-- qcow2 (compat=1.1)
IMGPROTO  -- file
PLATFORM  -- Linux/x86_64 travis-job-fc4e2553-b470-4a8b-812e-a4fcf8ba094f 
5.0.0-1031-gcp
TEST_DIR  -- 
/home/travis/build/elena-ufimtseva/qemu-multiprocess/out-of-tree/build/dir/tests/qemu-iotests/scratch
SOCK_DIR  -- /tmp/tmp.LOmYANt5Od
SOCKET_SCM_HELPER -- 
/home/travis/build/elena-ufimtseva/qemu-multiprocess/out-of-tree/build/dir/tests/qemu-iotests/socket_scm_helper
--- 
/home/travis/build/elena-ufimtseva/qemu-multiprocess/tests/qemu-iotests/041.out 
2020-04-22 00:17:23.701844698 +
+++ 
/home/travis/build/elena-ufimtseva/qemu-multiprocess/out-of-tree/build/dir/tests/qemu-iotests/041.out.bad
   2020-04-22 00:24:39.234343858 +
@@ -1,5 +1,29 @@
-..
+FF
+==
+FAIL: test_with_other_parent (__main__.TestRepairQuorum)
+--
+Traceback (most recent call last):
+  File "041", line 1049, in test_with_other_parent
+self.assert_qmp(result, 'return', {})
+  File 
"/home/travis/build/elena-ufimtseva/qemu-multiprocess/tests/qemu-iotests/iotests.py",
 line 821, in assert_qmp
+result = self.dictpath(d, path)
+  File 
"/home/travis/build/elena-ufimtseva/qemu-multiprocess/tests/qemu-iotests/iotests.py",
 line 797, in dictpath
+self.fail('failed path traversal for "%s" in "%s"' % (path, str(d)))
+AssertionError: failed path traversal for "return" in "{'error': {'class': 
'GenericError', 'desc': "UNIX socket path 
'/home/travis/build/elena-ufimtseva/qemu-multiprocess/out-of-tree/build/dir/tests/qemu-iotests/scratch/nbd.sock'
 is too long"}}"
a
+
Not run: 220 259
Failures: 041
Failed 1 of 116 iotests
/home/travis/build/elena-ufimtseva/qemu-multiprocess/tests/Makefile.include:848:
 recipe for target 'check-tests/check-block.sh' failed
make: *** [check-tests/check-block.sh] Error 1
The command "if [ "$BUILD_RC" -eq 0 ] ; then
${TEST_CM

[PATCH RESEND v6 03/36] command-line: refractor parser code

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

Refactor command-line parser code so that it could be used by
other processes as well.

Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
---
 MAINTAINERS  |  2 +
 Makefile.objs|  2 +
 include/qemu-parse.h | 42 
 qemu-parse.c | 93 
 softmmu/vl.c | 84 +--
 5 files changed, 140 insertions(+), 83 deletions(-)
 create mode 100644 include/qemu-parse.h
 create mode 100644 qemu-parse.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 04b19ac56c..2e700e6e64 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2032,6 +2032,8 @@ F: tests/test-keyval.c
 F: tests/test-qemu-opts.c
 F: util/keyval.c
 F: util/qemu-option.c
+F: include/qemu-parse.h
+F: qemu-parse.c
 
 Coverity model
 M: Markus Armbruster 
diff --git a/Makefile.objs b/Makefile.objs
index bfb9271862..f29c60c59d 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -78,6 +78,8 @@ qemu-seccomp.o-libs := $(SECCOMP_LIBS)
 
 common-obj-$(CONFIG_FDT) += device_tree.o
 
+common-obj-y += qemu-parse.o
+
 common-obj-y += qapi/
 common-obj-y += util/machine-notify.o
 
diff --git a/include/qemu-parse.h b/include/qemu-parse.h
new file mode 100644
index 00..156b238db6
--- /dev/null
+++ b/include/qemu-parse.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef VL_H
+#define VL_H
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+/***/
+/* QEMU Block devices */
+
+#define HD_OPTS "media=disk"
+#define CDROM_OPTS "media=cdrom"
+#define FD_OPTS ""
+#define PFLASH_OPTS ""
+#define MTD_OPTS ""
+#define SD_OPTS ""
+
+#define HAS_ARG 0x0001
+
+typedef struct QEMUOption {
+const char *name;
+int flags;
+int index;
+uint32_t arch_mask;
+} QEMUOption;
+
+const QEMUOption *lookup_opt(int argc, char **argv, const char **poptarg,
+ int *poptind);
+
+int drive_init_func(void *opaque, QemuOpts *opts, Error **errp);
+
+int device_init_func(void *opaque, QemuOpts *opts, Error **errp);
+
+#endif /* VL_H */
+
diff --git a/qemu-parse.c b/qemu-parse.c
new file mode 100644
index 00..2535374fdd
--- /dev/null
+++ b/qemu-parse.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "sysemu/blockdev.h"
+#include "sysemu/arch_init.h"
+#include "qemu/option.h"
+#include "qemu-options.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "monitor/qdev.h"
+#include "qom/object.h"
+#include "qemu-parse.h"
+
+/***/
+/* QEMU Block devices */
+
+static const QEMUOption qemu_options[] = {
+{ "h", 0, QEMU_OPTION_h, QEMU_ARCH_ALL },
+#define QEMU_OPTIONS_GENERATE_OPTIONS
+#include "qemu-options-wrapper.h"
+{ NULL },
+};
+
+const QEMUOption *lookup_opt(int argc, char **argv, const char **poptarg,
+ int *poptind)
+{
+const QEMUOption *popt;
+int optind = *poptind;
+char *r = argv[optind];
+const char *optarg;
+
+loc_set_cmdline(argv, optind, 1);
+optind++;
+/* Treat --foo the same as -foo.  */
+if (r[1] == '-') {
+r++;
+}
+popt = qemu_options;
+for (;;) {
+if (!popt->name) {
+error_report("invalid option");
+exit(1);
+}
+if (!strcmp(popt->name, r + 1)) {
+break;
+}
+popt++;
+}
+if (popt->flags & HAS_ARG) {
+if (optind >= argc) {
+error_report("requires an argument");
+exit(1);
+}
+optarg = argv[optind++];
+loc_set_cmdline(argv, optind - 2, 2);
+} else {
+optarg = NULL;
+}
+
+*poptarg = optarg;
+*poptind = optind;
+
+return popt;
+}
+
+int drive_init_func(void *opaque, QemuOpts *opts, Error **errp)
+{
+BlockInterfaceType *block_default_type = opaque;
+
+return drive_new(opts, *block_default_type, errp) == NULL;
+}
+
+int device_init_func(void *opaque, QemuOpts *opts, Error **errp)
+{
+DeviceState *dev;
+
+dev = qdev_device_add(opts, errp);
+if (!dev && *errp) {
+error_report_err(*errp);
+return -1;
+} else if (dev) {
+object_unref(OBJECT(dev));
+}
+return 0;
+}
d

[PATCH RESEND v6 01/36] memory: alloc RAM from file at offset

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

Allow RAM MemoryRegion to be created from an offset in a file, instead
of allocating at offset of 0 by default. This is needed to synchronize
RAM between QEMU & remote process.

Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
Signed-off-by: Elena Ufimtseva 
Reviewed-by: Dr. David Alan Gilbert 
---
 exec.c| 11 +++
 include/exec/ram_addr.h   |  2 +-
 include/qemu/mmap-alloc.h |  3 ++-
 memory.c  |  2 +-
 util/mmap-alloc.c |  7 ---
 util/oslib-posix.c|  2 +-
 6 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/exec.c b/exec.c
index 2874bb5088..d0ac9545f4 100644
--- a/exec.c
+++ b/exec.c
@@ -1801,6 +1801,7 @@ static void *file_ram_alloc(RAMBlock *block,
 ram_addr_t memory,
 int fd,
 bool truncate,
+off_t offset,
 Error **errp)
 {
 void *area;
@@ -1851,7 +1852,8 @@ static void *file_ram_alloc(RAMBlock *block,
 }
 
 area = qemu_ram_mmap(fd, memory, block->mr->align,
- block->flags & RAM_SHARED, block->flags & RAM_PMEM);
+ block->flags & RAM_SHARED, block->flags & RAM_PMEM,
+ offset);
 if (area == MAP_FAILED) {
 error_setg_errno(errp, errno,
  "unable to map backing store for guest RAM");
@@ -2283,7 +2285,7 @@ static void ram_block_add(RAMBlock *new_block, Error 
**errp, bool shared)
 #ifdef CONFIG_POSIX
 RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
  uint32_t ram_flags, int fd,
- Error **errp)
+ off_t offset, Error **errp)
 {
 RAMBlock *new_block;
 Error *local_err = NULL;
@@ -2328,7 +2330,8 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, 
MemoryRegion *mr,
 new_block->used_length = size;
 new_block->max_length = size;
 new_block->flags = ram_flags;
-new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
+new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset,
+ errp);
 if (!new_block->host) {
 g_free(new_block);
 return NULL;
@@ -2358,7 +2361,7 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, 
MemoryRegion *mr,
 return NULL;
 }
 
-block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, errp);
+block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, 0, errp);
 if (!block) {
 if (created) {
 unlink(mem_path);
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 5e59a3d8d7..1b9f489ff0 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -121,7 +121,7 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, 
MemoryRegion *mr,
Error **errp);
 RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
  uint32_t ram_flags, int fd,
- Error **errp);
+ off_t offset, Error **errp);
 
 RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
   MemoryRegion *mr, Error **errp);
diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
index e786266b92..4f579858bc 100644
--- a/include/qemu/mmap-alloc.h
+++ b/include/qemu/mmap-alloc.h
@@ -25,7 +25,8 @@ void *qemu_ram_mmap(int fd,
 size_t size,
 size_t align,
 bool shared,
-bool is_pmem);
+bool is_pmem,
+off_t start);
 
 void qemu_ram_munmap(int fd, void *ptr, size_t size);
 
diff --git a/memory.c b/memory.c
index 601b749906..f5fec476b7 100644
--- a/memory.c
+++ b/memory.c
@@ -1596,7 +1596,7 @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
 mr->destructor = memory_region_destructor_ram;
 mr->ram_block = qemu_ram_alloc_from_fd(size, mr,
share ? RAM_SHARED : 0,
-   fd, &err);
+   fd, 0, &err);
 mr->dirty_log_mask = tcg_enabled() ? (1 << DIRTY_MEMORY_CODE) : 0;
 if (err) {
 mr->size = int128_zero();
diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
index 27dcccd8ec..a28f7025f0 100644
--- a/util/mmap-alloc.c
+++ b/util/mmap-alloc.c
@@ -86,7 +86,8 @@ void *qemu_ram_mmap(int fd,
 size_t size,
 size_t align,
 bool shared,
-bool is_pmem)
+bool is_pmem,
+off_t start)
 {
 int flags;
 int map_sync_flags = 0;
@@ -147,7 

[PATCH RESEND v6 04/36] multi-process: Refactor chardev functions out of vl.c

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

Some of the initialization helper functions w.r.t chardev
in vl.c are also used by the remote process. Therefore, these functions
are refactored into shared files that both QEMU & remote process
could use.

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
Reviewed-by: Dr. David Alan Gilbert 
---
 chardev/char.c | 14 ++
 include/chardev/char.h |  2 ++
 softmmu/vl.c   | 14 --
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/chardev/char.c b/chardev/char.c
index e77564060d..47dcf93da7 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -1196,3 +1196,17 @@ static void register_types(void)
 }
 
 type_init(register_types);
+
+int chardev_init_func(void *opaque, QemuOpts *opts, Error **errp)
+{
+Error *local_err = NULL;
+
+if (!qemu_chr_new_from_opts(opts, NULL, &local_err)) {
+if (local_err) {
+error_propagate(errp, local_err);
+return -1;
+}
+exit(0);
+}
+return 0;
+}
diff --git a/include/chardev/char.h b/include/chardev/char.h
index 00589a6025..0804e78f7a 100644
--- a/include/chardev/char.h
+++ b/include/chardev/char.h
@@ -290,4 +290,6 @@ GSource *qemu_chr_timeout_add_ms(Chardev *chr, guint ms,
 /* console.c */
 void qemu_chr_parse_vc(QemuOpts *opts, ChardevBackend *backend, Error **errp);
 
+int chardev_init_func(void *opaque, QemuOpts *opts, Error **errp);
+
 #endif
diff --git a/softmmu/vl.c b/softmmu/vl.c
index d1b32a33a2..6e35f3787d 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -2038,20 +2038,6 @@ static int device_help_func(void *opaque, QemuOpts 
*opts, Error **errp)
 return qdev_device_help(opts);
 }
 
-static int chardev_init_func(void *opaque, QemuOpts *opts, Error **errp)
-{
-Error *local_err = NULL;
-
-if (!qemu_chr_new_from_opts(opts, NULL, &local_err)) {
-if (local_err) {
-error_propagate(errp, local_err);
-return -1;
-}
-exit(0);
-}
-return 0;
-}
-
 #ifdef CONFIG_VIRTFS
 static int fsdev_init_func(void *opaque, QemuOpts *opts, Error **errp)
 {
-- 
2.25.GIT




[PATCH RESEND v6 06/36] monitor: destaticize HMP commands

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
Reviewed-by: Dr. David Alan Gilbert 
---
 hmp-commands.hx|  4 +-
 monitor/misc.c | 76 +++---
 monitor/monitor-internal.h | 38 +++
 3 files changed, 78 insertions(+), 40 deletions(-)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index 7f0f3974ad..02cae25c24 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -11,7 +11,7 @@ HXCOMM HXCOMM can be used for comments, discarded from both 
rST and C.
 .args_type  = "name:S?",
 .params = "[cmd]",
 .help   = "show the help",
-.cmd= do_help_cmd,
+.cmd= hmp_do_help_cmd,
 .flags  = "p",
 },
 
@@ -555,7 +555,7 @@ ERST
 .args_type  = "fmt:/,val:l",
 .params = "/fmt expr",
 .help   = "print expression value (use $reg for CPU register 
access)",
-.cmd= do_print,
+.cmd= hmp_do_print,
 },
 
 SRST
diff --git a/monitor/misc.c b/monitor/misc.c
index 6c45fa490f..c0eee6f4ab 100644
--- a/monitor/misc.c
+++ b/monitor/misc.c
@@ -178,12 +178,12 @@ int hmp_compare_cmd(const char *name, const char *list)
 return 0;
 }
 
-static void do_help_cmd(Monitor *mon, const QDict *qdict)
+void hmp_do_help_cmd(Monitor *mon, const QDict *qdict)
 {
 help_cmd(mon, qdict_get_try_str(qdict, "name"));
 }
 
-static void hmp_trace_event(Monitor *mon, const QDict *qdict)
+void hmp_trace_event(Monitor *mon, const QDict *qdict)
 {
 const char *tp_name = qdict_get_str(qdict, "name");
 bool new_state = qdict_get_bool(qdict, "option");
@@ -227,7 +227,7 @@ static void hmp_trace_file(Monitor *mon, const QDict *qdict)
 }
 #endif
 
-static void hmp_info_help(Monitor *mon, const QDict *qdict)
+void hmp_info_help(Monitor *mon, const QDict *qdict)
 {
 help_cmd(mon, "info");
 }
@@ -315,7 +315,7 @@ int monitor_get_cpu_index(void)
 return cs ? cs->cpu_index : UNASSIGNED_CPU_INDEX;
 }
 
-static void hmp_info_registers(Monitor *mon, const QDict *qdict)
+void hmp_info_registers(Monitor *mon, const QDict *qdict)
 {
 bool all_cpus = qdict_get_try_bool(qdict, "cpustate_all", false);
 CPUState *cs;
@@ -338,7 +338,7 @@ static void hmp_info_registers(Monitor *mon, const QDict 
*qdict)
 }
 
 #ifdef CONFIG_TCG
-static void hmp_info_jit(Monitor *mon, const QDict *qdict)
+void hmp_info_jit(Monitor *mon, const QDict *qdict)
 {
 if (!tcg_enabled()) {
 error_report("JIT information is only available with accel=tcg");
@@ -349,13 +349,13 @@ static void hmp_info_jit(Monitor *mon, const QDict *qdict)
 dump_drift_info();
 }
 
-static void hmp_info_opcount(Monitor *mon, const QDict *qdict)
+void hmp_info_opcount(Monitor *mon, const QDict *qdict)
 {
 dump_opcount_info();
 }
 #endif
 
-static void hmp_info_sync_profile(Monitor *mon, const QDict *qdict)
+void hmp_info_sync_profile(Monitor *mon, const QDict *qdict)
 {
 int64_t max = qdict_get_try_int(qdict, "max", 10);
 bool mean = qdict_get_try_bool(qdict, "mean", false);
@@ -366,7 +366,7 @@ static void hmp_info_sync_profile(Monitor *mon, const QDict 
*qdict)
 qsp_report(max, sort_by, coalesce);
 }
 
-static void hmp_info_history(Monitor *mon, const QDict *qdict)
+void hmp_info_history(Monitor *mon, const QDict *qdict)
 {
 MonitorHMP *hmp_mon = container_of(mon, MonitorHMP, common);
 int i;
@@ -386,7 +386,7 @@ static void hmp_info_history(Monitor *mon, const QDict 
*qdict)
 }
 }
 
-static void hmp_info_cpustats(Monitor *mon, const QDict *qdict)
+void hmp_info_cpustats(Monitor *mon, const QDict *qdict)
 {
 CPUState *cs = mon_get_cpu();
 
@@ -397,7 +397,7 @@ static void hmp_info_cpustats(Monitor *mon, const QDict 
*qdict)
 cpu_dump_statistics(cs, 0);
 }
 
-static void hmp_info_trace_events(Monitor *mon, const QDict *qdict)
+void hmp_info_trace_events(Monitor *mon, const QDict *qdict)
 {
 const char *name = qdict_get_try_str(qdict, "name");
 bool has_vcpu = qdict_haskey(qdict, "vcpu");
@@ -457,7 +457,7 @@ void qmp_client_migrate_info(const char *protocol, const 
char *hostname,
 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "protocol", "spice");
 }
 
-static void hmp_logfile(Monitor *mon, const QDict *qdict)
+void hmp_logfile(Monitor *mon, const QDict *qdict)
 {
 Error *err = NULL;
 
@@ -467,7 +467,7 @@ static void hmp_logfile(Monitor *mon, const QDict *qdict)
 }
 }
 
-static void hmp_log(Monitor *mon, const QDict *qdict)
+void hmp_log(Monitor *mon, const QDict *qdict)
 {
 int mask;
 const char *items = qdict_get_str(qdict, "items");
@@ -484,7 +484,7 @@ static void hmp_log(Monitor *mon, const QDict *qdict)
 qemu_set_log(mask);
 }
 
-stati

[PATCH RESEND v6 19/36] multi-process: Connect Proxy Object with device in the remote process

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

Send a message to the remote process to connect PCI device with the
corresponding Proxy object in QEMU

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
---
 hw/proxy/qemu-proxy.c| 34 +++
 include/io/mpqemu-link.h |  5 +
 io/mpqemu-link.c |  3 +++
 remote/remote-main.c | 43 
 4 files changed, 85 insertions(+)

diff --git a/hw/proxy/qemu-proxy.c b/hw/proxy/qemu-proxy.c
index 40bf56fd37..9b5e429a88 100644
--- a/hw/proxy/qemu-proxy.c
+++ b/hw/proxy/qemu-proxy.c
@@ -17,11 +17,45 @@
 static void proxy_set_socket(Object *obj, const char *str, Error **errp)
 {
 PCIProxyDev *pdev = PCI_PROXY_DEV(obj);
+DeviceState *dev = DEVICE(obj);
+MPQemuMsg msg = { 0 };
+int wait, fd[2];
 
 pdev->socket = atoi(str);
 
 mpqemu_init_channel(pdev->mpqemu_link, &pdev->mpqemu_link->com,
 pdev->socket);
+
+if (socketpair(AF_UNIX, SOCK_STREAM, 0, fd)) {
+error_setg(errp, "Failed to create socket for device channel");
+return;
+}
+
+wait = GET_REMOTE_WAIT;
+
+msg.cmd = CONNECT_DEV;
+msg.bytestream = 1;
+msg.data2 = (uint8_t *)g_strdup(dev->id);
+msg.size = sizeof(msg.data2);
+msg.num_fds = 2;
+msg.fds[0] = wait;
+msg.fds[1] = fd[1];
+
+mpqemu_msg_send(&msg, pdev->mpqemu_link->com);
+
+if (wait_for_remote(wait)) {
+error_setg(errp, "Failed to connect device to the remote");
+close(fd[0]);
+} else {
+mpqemu_init_channel(pdev->mpqemu_link, &pdev->mpqemu_link->dev,
+fd[0]);
+}
+
+PUT_REMOTE_WAIT(wait);
+
+close(fd[1]);
+
+g_free(msg.data2);
 }
 
 static void proxy_init(Object *obj)
diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
index 73cc59b874..ebae9afc45 100644
--- a/include/io/mpqemu-link.h
+++ b/include/io/mpqemu-link.h
@@ -38,6 +38,7 @@
 typedef enum {
 INIT = 0,
 SYNC_SYSMEM,
+CONNECT_DEV,
 MAX,
 } mpqemu_cmd_t;
 
@@ -120,8 +121,12 @@ struct MPQemuLinkState {
 GMainLoop *loop;
 
 MPQemuChannel *com;
+MPQemuChannel *dev;
 
 mpqemu_link_callback callback;
+
+void *opaque;
+QemuThread thread;
 };
 
 MPQemuLinkState *mpqemu_link_create(void);
diff --git a/io/mpqemu-link.c b/io/mpqemu-link.c
index 3f81cef96e..f780b65181 100644
--- a/io/mpqemu-link.c
+++ b/io/mpqemu-link.c
@@ -46,6 +46,9 @@ MPQemuLinkState *mpqemu_link_create(void)
 MPQemuLinkState *link = MPQEMU_LINK(object_new(TYPE_MPQEMU_LINK));
 
 link->com = NULL;
+link->dev = NULL;
+
+link->opaque = NULL;
 
 return link;
 }
diff --git a/remote/remote-main.c b/remote/remote-main.c
index dbd6ad2529..f541baae6a 100644
--- a/remote/remote-main.c
+++ b/remote/remote-main.c
@@ -35,6 +35,9 @@
 #include "exec/ramlist.h"
 #include "remote/remote-common.h"
 
+static void process_msg(GIOCondition cond, MPQemuLinkState *link,
+MPQemuChannel *chan);
+
 static MPQemuLinkState *mpqemu_link;
 
 gchar *print_pid_exec(gchar *str)
@@ -48,6 +51,43 @@ gchar *print_pid_exec(gchar *str)
 return str;
 }
 
+#define LINK_TO_DEV(link) ((PCIDevice *)link->opaque)
+
+static gpointer dev_thread(gpointer data)
+{
+MPQemuLinkState *link = data;
+
+mpqemu_start_coms(link, link->dev);
+
+return NULL;
+}
+
+static void process_connect_dev_msg(MPQemuMsg *msg)
+{
+char *devid = (char *)msg->data2;
+MPQemuLinkState *link = NULL;
+DeviceState *dev = NULL;
+int wait = msg->fds[0];
+int ret = 0;
+
+dev = qdev_find_recursive(sysbus_get_default(), devid);
+if (!dev) {
+ret = 0xff;
+goto exit;
+}
+
+link = mpqemu_link_create();
+link->opaque = (void *)PCI_DEVICE(dev);
+
+mpqemu_init_channel(link, &link->dev, msg->fds[1]);
+mpqemu_link_set_callback(link, process_msg);
+qemu_thread_create(&link->thread, "dev_thread", dev_thread, link,
+   QEMU_THREAD_JOINABLE);
+
+exit:
+notify_proxy(wait, ret);
+}
+
 static void process_msg(GIOCondition cond, MPQemuLinkState *link,
 MPQemuChannel *chan)
 {
@@ -72,6 +112,9 @@ static void process_msg(GIOCondition cond, MPQemuLinkState 
*link,
 switch (msg->cmd) {
 case INIT:
 break;
+case CONNECT_DEV:
+process_connect_dev_msg(msg);
+break;
 default:
 error_setg(&err, "Unknown command in %s", print_pid_exec(pid_exec));
 goto finalize_loop;
-- 
2.25.GIT




[PATCH RESEND v6 13/36] multi-process: setup PCI host bridge for remote device

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

PCI host bridge is setup for the remote device process. It is
implemented using remote-pcihost object. It is an extension of the PCI
host bridge setup by QEMU.
Remote-pcihost configures a PCI bus which could be used by the remote
 PCI device to latch on to.

Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
Signed-off-by: Elena Ufimtseva 
---
 MAINTAINERS  |  2 ++
 hw/pci/Makefile.objs |  2 +-
 include/remote/pcihost.h | 45 
 remote/Makefile.objs |  1 +
 remote/pcihost.c | 64 
 5 files changed, 113 insertions(+), 1 deletion(-)
 create mode 100644 include/remote/pcihost.h
 create mode 100644 remote/pcihost.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 93ad693da4..0cda5ee06a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2859,6 +2859,8 @@ F: remote/Makefile.objs
 F: remote/remote-main.c
 F: include/io/mpqemu-link.h
 F: io/mpqemu-link.c
+F: include/remote/pcihost.h
+F: remote/pcihost.c
 
 Build and test automation
 -
diff --git a/hw/pci/Makefile.objs b/hw/pci/Makefile.objs
index 955be54472..90693a7695 100644
--- a/hw/pci/Makefile.objs
+++ b/hw/pci/Makefile.objs
@@ -13,6 +13,6 @@ common-obj-$(CONFIG_PCI_EXPRESS) += pcie_port.o pcie_host.o
 common-obj-$(call lnot,$(CONFIG_PCI)) += pci-stub.o
 common-obj-$(CONFIG_ALL) += pci-stub.o
 
-remote-pci-obj-$(CONFIG_MPQEMU) += pci.o pci_bridge.o
+remote-pci-obj-$(CONFIG_MPQEMU) += pci.o pci_bridge.o pci_host.o pcie_host.o
 remote-pci-obj-$(CONFIG_MPQEMU) += msi.o msix.o
 remote-pci-obj-$(CONFIG_MPQEMU) += pcie.o
diff --git a/include/remote/pcihost.h b/include/remote/pcihost.h
new file mode 100644
index 00..7aca9ccaf1
--- /dev/null
+++ b/include/remote/pcihost.h
@@ -0,0 +1,45 @@
+/*
+ * PCI Host for remote device
+ *
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef REMOTE_PCIHOST_H
+#define REMOTE_PCIHOST_H
+
+#include 
+#include 
+
+#include "exec/memory.h"
+#include "hw/pci/pcie_host.h"
+
+#define TYPE_REMOTE_HOST_DEVICE "remote-pcihost"
+#define REMOTE_HOST_DEVICE(obj) \
+OBJECT_CHECK(RemPCIHost, (obj), TYPE_REMOTE_HOST_DEVICE)
+
+typedef struct RemPCIHost {
+/*< private >*/
+PCIExpressHost parent_obj;
+/*< public >*/
+
+/*
+ * Memory Controller Hub (MCH) may not be necessary for the emulation
+ * program. The two important reasons for implementing a PCI host in the
+ * emulation program are:
+ * - Provide a PCI bus for IO devices
+ * - Enable translation of guest PA to the PCI bar regions
+ *
+ * For both the above mentioned purposes, it doesn't look like we would
+ * need the MCH
+ */
+
+MemoryRegion *mr_pci_mem;
+MemoryRegion *mr_sys_mem;
+MemoryRegion *mr_sys_io;
+} RemPCIHost;
+
+#endif
diff --git a/remote/Makefile.objs b/remote/Makefile.objs
index a9b2256b2a..2757f5a265 100644
--- a/remote/Makefile.objs
+++ b/remote/Makefile.objs
@@ -1 +1,2 @@
 remote-pci-obj-$(CONFIG_MPQEMU) += remote-main.o
+remote-pci-obj-$(CONFIG_MPQEMU) += pcihost.o
diff --git a/remote/pcihost.c b/remote/pcihost.c
new file mode 100644
index 00..dbe081903e
--- /dev/null
+++ b/remote/pcihost.c
@@ -0,0 +1,64 @@
+/*
+ * Remote PCI host device
+ *
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include 
+#include 
+
+#include "qemu/osdep.h"
+#include "hw/pci/pci.h"
+#include "hw/pci/pci_host.h"
+#include "hw/pci/pcie_host.h"
+#include "hw/qdev-properties.h"
+#include "remote/pcihost.h"
+#include "exec/memory.h"
+
+static const char *remote_host_root_bus_path(PCIHostState *host_bridge,
+ PCIBus *rootbus)
+{
+return ":00";
+}
+
+static void remote_host_realize(DeviceState *dev, Error **errp)
+{
+char *busname = g_strdup_printf("remote-pci-%ld", (unsigned long)getpid());
+PCIHostState *pci = PCI_HOST_BRIDGE(dev);
+RemPCIHost *s = REMOTE_HOST_DEVICE(dev);
+
+pci->bus = pci_root_bus_new(DEVICE(s), busname,
+s->mr_pci_mem, s->mr_sys_io,
+0, TYPE_PCIE_BUS);
+}
+
+static void remote_host_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass);
+
+hc->root_bus_path = remote_host_root_bus_path;
+dc->realize = remote_host_realize;
+
+dc->user_creatable = false;
+set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
+dc->fw_name = "pci&

[PATCH RESEND v6 05/36] multi-process: Refactor monitor functions out of vl.c

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

Some of the initialization helper functions w.r.t monitor
in vl.c are also used by the remote process. Therefore, these functions
are refactored into shared files that both QEMU & remote process
could use.

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
Reviewed-by: Dr. David Alan Gilbert 
---
 include/monitor/monitor.h |  3 +++
 monitor/monitor.c | 37 +
 softmmu/vl.c  | 35 ---
 3 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
index 1018d754a6..3803e904f2 100644
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -48,4 +48,7 @@ int monitor_fdset_dup_fd_add(int64_t fdset_id, int dup_fd);
 void monitor_fdset_dup_fd_remove(int dup_fd);
 int64_t monitor_fdset_dup_fd_find(int dup_fd);
 
+int mon_init_func(void *opaque, QemuOpts *opts, Error **errp);
+void monitor_parse(const char *optarg, const char *mode, bool pretty);
+
 #endif /* MONITOR_H */
diff --git a/monitor/monitor.c b/monitor/monitor.c
index 125494410a..88423b38b6 100644
--- a/monitor/monitor.c
+++ b/monitor/monitor.c
@@ -35,6 +35,8 @@
 #include "sysemu/qtest.h"
 #include "sysemu/sysemu.h"
 #include "trace.h"
+#include "qemu/cutils.h"
+#include "qemu/config-file.h"
 
 /*
  * To prevent flooding clients, events can be throttled. The
@@ -611,6 +613,41 @@ void monitor_init_globals_core(void)
NULL);
 }
 
+int mon_init_func(void *opaque, QemuOpts *opts, Error **errp)
+{
+return monitor_init_opts(opts, errp);
+}
+
+void monitor_parse(const char *optarg, const char *mode, bool pretty)
+{
+static int monitor_device_index;
+QemuOpts *opts;
+const char *p;
+char label[32];
+
+if (strstart(optarg, "chardev:", &p)) {
+snprintf(label, sizeof(label), "%s", p);
+} else {
+snprintf(label, sizeof(label), "compat_monitor%d",
+ monitor_device_index);
+opts = qemu_chr_parse_compat(label, optarg, true);
+if (!opts) {
+error_report("parse error: %s", optarg);
+exit(1);
+}
+}
+
+opts = qemu_opts_create(qemu_find_opts("mon"), label, 1, &error_fatal);
+qemu_opt_set(opts, "mode", mode, &error_abort);
+qemu_opt_set(opts, "chardev", label, &error_abort);
+if (!strcmp(mode, "control")) {
+qemu_opt_set_bool(opts, "pretty", pretty, &error_abort);
+} else {
+assert(pretty == false);
+}
+monitor_device_index++;
+}
+
 int monitor_init(MonitorOptions *opts, bool allow_hmp, Error **errp)
 {
 Chardev *chr;
diff --git a/softmmu/vl.c b/softmmu/vl.c
index 6e35f3787d..abc746c1b5 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -2045,41 +2045,6 @@ static int fsdev_init_func(void *opaque, QemuOpts *opts, 
Error **errp)
 }
 #endif
 
-static int mon_init_func(void *opaque, QemuOpts *opts, Error **errp)
-{
-return monitor_init_opts(opts, errp);
-}
-
-static void monitor_parse(const char *optarg, const char *mode, bool pretty)
-{
-static int monitor_device_index = 0;
-QemuOpts *opts;
-const char *p;
-char label[32];
-
-if (strstart(optarg, "chardev:", &p)) {
-snprintf(label, sizeof(label), "%s", p);
-} else {
-snprintf(label, sizeof(label), "compat_monitor%d",
- monitor_device_index);
-opts = qemu_chr_parse_compat(label, optarg, true);
-if (!opts) {
-error_report("parse error: %s", optarg);
-exit(1);
-}
-}
-
-opts = qemu_opts_create(qemu_find_opts("mon"), label, 1, &error_fatal);
-qemu_opt_set(opts, "mode", mode, &error_abort);
-qemu_opt_set(opts, "chardev", label, &error_abort);
-if (!strcmp(mode, "control")) {
-qemu_opt_set_bool(opts, "pretty", pretty, &error_abort);
-} else {
-assert(pretty == false);
-}
-monitor_device_index++;
-}
-
 struct device_config {
 enum {
 DEV_USB,   /* -usbdevice */
-- 
2.25.GIT




[PATCH RESEND v6 09/36] multi-process: Add config option for multi-process QEMU

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

Add a configuration option to separate multi-process code

Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
Signed-off-by: Elena Ufimtseva 
---
 configure | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/configure b/configure
index 23b5e93752..913c78d4ef 100755
--- a/configure
+++ b/configure
@@ -509,6 +509,7 @@ libpmem=""
 default_devices="yes"
 plugins="no"
 fuzzing="no"
+mpqemu="no"
 
 supported_cpu="no"
 supported_os="no"
@@ -1601,6 +1602,10 @@ for opt do
   ;;
   --gdb=*) gdb_bin="$optarg"
   ;;
+  --enable-mpqemu) mpqemu=yes
+  ;;
+  --disable-mpqemu) mpqemu=no
+  ;;
   *)
   echo "ERROR: unknown option $opt"
   echo "Try '$0 --help' for more information"
@@ -1894,6 +1899,7 @@ disabled with --disable-FEATURE, default is enabled if 
available:
   debug-mutex mutex debugging support
   libpmem libpmem support
   xkbcommon   xkbcommon support
+  mpqemu  multi-process QEMU support
 
 NOTE: The object files are built at the place where configure is launched
 EOF
@@ -6733,6 +6739,7 @@ echo "default devices   $default_devices"
 echo "plugin support$plugins"
 echo "fuzzing support   $fuzzing"
 echo "gdb   $gdb_bin"
+echo "multiprocess QEMU $mpqemu"
 
 if test "$supported_cpu" = "no"; then
 echo
@@ -7551,6 +7558,10 @@ if test "$libpmem" = "yes" ; then
   echo "CONFIG_LIBPMEM=y" >> $config_host_mak
 fi
 
+if test "$mpqemu" = "yes" ; then
+  echo "CONFIG_MPQEMU=y" >> $config_host_mak
+fi
+
 if test "$bochs" = "yes" ; then
   echo "CONFIG_BOCHS=y" >> $config_host_mak
 fi
-- 
2.25.GIT




[PATCH RESEND v6 21/36] multi-process: PCI BAR read/write handling for proxy & remote endpoints

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

Proxy device object implements handler for PCI BAR writes and reads.
The handler uses BAR_WRITE/BAR_READ message to communicate to the
remote process with the BAR address and value to be written/read.
The remote process implements handler for BAR_WRITE/BAR_READ
message.

Signed-off-by: Jagannathan Raman 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
---
 hw/proxy/qemu-proxy.c | 64 ++
 include/hw/proxy/qemu-proxy.h | 20 -
 include/io/mpqemu-link.h  | 12 +
 io/mpqemu-link.c  |  6 +++
 remote/remote-main.c  | 84 +++
 5 files changed, 184 insertions(+), 2 deletions(-)

diff --git a/hw/proxy/qemu-proxy.c b/hw/proxy/qemu-proxy.c
index 87cf39c672..7fd0a312a5 100644
--- a/hw/proxy/qemu-proxy.c
+++ b/hw/proxy/qemu-proxy.c
@@ -169,3 +169,67 @@ static void pci_proxy_dev_register_types(void)
 
 type_init(pci_proxy_dev_register_types)
 
+static void send_bar_access_msg(PCIProxyDev *dev, MemoryRegion *mr,
+bool write, hwaddr addr, uint64_t *val,
+unsigned size, bool memory)
+{
+MPQemuLinkState *mpqemu_link = dev->mpqemu_link;
+MPQemuMsg msg;
+int wait;
+
+memset(&msg, 0, sizeof(MPQemuMsg));
+
+msg.bytestream = 0;
+msg.size = sizeof(msg.data1);
+msg.data1.bar_access.addr = mr->addr + addr;
+msg.data1.bar_access.size = size;
+msg.data1.bar_access.memory = memory;
+
+if (write) {
+msg.cmd = BAR_WRITE;
+msg.data1.bar_access.val = *val;
+} else {
+wait = GET_REMOTE_WAIT;
+
+msg.cmd = BAR_READ;
+msg.num_fds = 1;
+msg.fds[0] = wait;
+}
+
+mpqemu_msg_send(&msg, mpqemu_link->dev);
+
+if (!write) {
+*val = wait_for_remote(wait);
+PUT_REMOTE_WAIT(wait);
+}
+}
+
+void proxy_default_bar_write(void *opaque, hwaddr addr, uint64_t val,
+ unsigned size)
+{
+ProxyMemoryRegion *pmr = opaque;
+
+send_bar_access_msg(pmr->dev, &pmr->mr, true, addr, &val, size,
+pmr->memory);
+}
+
+uint64_t proxy_default_bar_read(void *opaque, hwaddr addr, unsigned size)
+{
+ProxyMemoryRegion *pmr = opaque;
+uint64_t val;
+
+send_bar_access_msg(pmr->dev, &pmr->mr, false, addr, &val, size,
+pmr->memory);
+
+ return val;
+}
+
+const MemoryRegionOps proxy_default_ops = {
+.read = proxy_default_bar_read,
+.write = proxy_default_bar_write,
+.endianness = DEVICE_NATIVE_ENDIAN,
+.impl = {
+.min_access_size = 1,
+.max_access_size = 1,
+},
+};
diff --git a/include/hw/proxy/qemu-proxy.h b/include/hw/proxy/qemu-proxy.h
index d7eaf26f29..9e4127eccb 100644
--- a/include/hw/proxy/qemu-proxy.h
+++ b/include/hw/proxy/qemu-proxy.h
@@ -26,14 +26,25 @@
 #define PCI_PROXY_DEV_GET_CLASS(obj) \
 OBJECT_GET_CLASS(PCIProxyDevClass, (obj), TYPE_PCI_PROXY_DEV)
 
-typedef struct PCIProxyDev {
+typedef struct PCIProxyDev PCIProxyDev;
+
+typedef struct ProxyMemoryRegion {
+PCIProxyDev *dev;
+MemoryRegion mr;
+bool memory;
+bool present;
+uint8_t type;
+} ProxyMemoryRegion;
+
+struct PCIProxyDev {
 PCIDevice parent_dev;
 
 MPQemuLinkState *mpqemu_link;
 
 int socket;
 
-} PCIProxyDev;
+ProxyMemoryRegion region[PCI_NUM_REGIONS];
+};
 
 typedef struct PCIProxyDevClass {
 PCIDeviceClass parent_class;
@@ -43,4 +54,9 @@ typedef struct PCIProxyDevClass {
 char *command;
 } PCIProxyDevClass;
 
+void proxy_default_bar_write(void *opaque, hwaddr addr, uint64_t val,
+ unsigned size);
+
+uint64_t proxy_default_bar_read(void *opaque, hwaddr addr, unsigned size);
+
 #endif /* QEMU_PROXY_H */
diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
index 7228a1915e..41cf092f9e 100644
--- a/include/io/mpqemu-link.h
+++ b/include/io/mpqemu-link.h
@@ -31,6 +31,8 @@
 /**
  * mpqemu_cmd_t:
  * SYNC_SYSMEM  Shares QEMU's RAM with remote device's RAM
+ * BAR_WRITEWrites to PCI BAR region
+ * BAR_READ Reads from PCI BAR region
  *
  * proc_cmd_t enum type to specify the command to be executed on the remote
  * device.
@@ -41,6 +43,8 @@ typedef enum {
 CONNECT_DEV,
 PCI_CONFIG_WRITE,
 PCI_CONFIG_READ,
+BAR_WRITE,
+BAR_READ,
 MAX,
 } mpqemu_cmd_t;
 
@@ -56,6 +60,13 @@ typedef struct {
 ram_addr_t offsets[REMOTE_MAX_FDS];
 } sync_sysmem_msg_t;
 
+typedef struct {
+hwaddr addr;
+uint64_t val;
+unsigned size;
+bool memory;
+} bar_access_msg_t;
+
 /**
  * MPQemuMsg:
  * @cmd: The remote command
@@ -78,6 +89,7 @@ typedef struct {
 union {
 uint64_t u64;
 sync_sysmem_msg_t sync_sysmem;
+bar_access_msg_t bar_access;
 } data1;
 
 int fds[REMOTE_MAX_FDS];
diff --git a/io/mpqemu-link.c b/io/mpqemu-link.c
index 

[PATCH RESEND v6 31/36] multi-process/mon: choose HMP commands based on target

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

Add "targets" field to HMP command definition to select the targets
which would be supported by each command

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
---
 hmp-commands-info.hx | 10 ++
 hmp-commands.hx  | 20 
 scripts/hxtool   | 35 ++-
 3 files changed, 64 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 scripts/hxtool

diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index ca5198438d..1fbca7a18d 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -23,6 +23,7 @@ ERST
 .params = "",
 .help   = "show the version of QEMU",
 .cmd= hmp_info_version,
+.targets= "scsi",
 .flags  = "p",
 },
 
@@ -50,6 +51,7 @@ ERST
 .params = "",
 .help   = "show the character devices",
 .cmd= hmp_info_chardev,
+.targets= "scsi",
 .flags  = "p",
 },
 
@@ -65,6 +67,7 @@ ERST
 .help   = "show info of one block device or all block devices "
   "(-n: show named nodes; -v: show details)",
 .cmd= hmp_info_block,
+.targets= "scsi",
 },
 
 SRST
@@ -78,6 +81,7 @@ ERST
 .params = "",
 .help   = "show block device statistics",
 .cmd= hmp_info_blockstats,
+.targets= "scsi",
 },
 
 SRST
@@ -91,6 +95,7 @@ ERST
 .params = "",
 .help   = "show progress of ongoing block device operations",
 .cmd= hmp_info_block_jobs,
+.targets= "scsi",
 },
 
 SRST
@@ -161,6 +166,7 @@ ERST
 .params = "",
 .help   = "show the command line history",
 .cmd= hmp_info_history,
+.targets= "scsi",
 .flags  = "p",
 },
 
@@ -214,6 +220,7 @@ ERST
 .params = "",
 .help   = "show PCI info",
 .cmd= hmp_info_pci,
+.targets= "scsi",
 },
 
 SRST
@@ -598,6 +605,7 @@ ERST
 .params = "",
 .help   = "show device tree",
 .cmd= hmp_info_qtree,
+.targets= "scsi",
 },
 
 SRST
@@ -611,6 +619,7 @@ ERST
 .params = "",
 .help   = "show qdev device model list",
 .cmd= hmp_info_qdm,
+.targets= "scsi",
 },
 
 SRST
@@ -624,6 +633,7 @@ ERST
 .params = "[path]",
 .help   = "show QOM composition tree",
 .cmd= hmp_info_qom_tree,
+.targets= "scsi",
 .flags  = "p",
 },
 
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 02cae25c24..1b60676d7c 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -45,6 +45,7 @@ ERST
 .params = "",
 .help   = "quit the emulator",
 .cmd= hmp_quit,
+.targets= "scsi",
 },
 
 SRST
@@ -76,6 +77,7 @@ ERST
 .params = "device size",
 .help   = "resize a block image",
 .cmd= hmp_block_resize,
+.targets= "scsi",
 },
 
 SRST
@@ -92,6 +94,7 @@ ERST
 .params = "device [speed [base]]",
 .help   = "copy data from a backing file into a block device",
 .cmd= hmp_block_stream,
+.targets= "scsi",
 },
 
 SRST
@@ -105,6 +108,7 @@ ERST
 .params = "device speed",
 .help   = "set maximum speed for a background block operation",
 .cmd= hmp_block_job_set_speed,
+.targets= "scsi",
 },
 
 SRST
@@ -120,6 +124,7 @@ ERST
   "\n\t\t\t if you want to abort the operation immediately"
   "\n\t\t\t instead of keep running until data is in 
sync)",
 .cmd= hmp_block_job_cancel,
+.targets= "scsi",
 },
 
 SRST
@@ -133,6 +138,7 @@ ERST
 .params = "device",
 .help   = "stop an active background block operation",
 .cmd= hmp_block_job_complete,
+.targets= "scsi",
 },
 
 SRST
@@ -147,6 +153,7 @@ ERST
 .params = "device",
 .help   = "pause an active background block operation",
 .cmd= hmp_block_job_pause,
+.targets= "scsi",
 },
 
 SRST
@@ -160,6 +167,7 @@ ERST
 .p

[PATCH RESEND v6 07/36] multi-process: add a command line option for debug file

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

Can be used with -d rdebug command options when starting qemu.

Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
Reviewed-by: Stefan Hajnoczi 
---
 include/qemu/log.h | 1 +
 util/log.c | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/include/qemu/log.h b/include/qemu/log.h
index f4724f7330..a039ddb61a 100644
--- a/include/qemu/log.h
+++ b/include/qemu/log.h
@@ -64,6 +64,7 @@ static inline bool qemu_log_separate(void)
 #define CPU_LOG_PLUGIN (1 << 18)
 /* LOG_STRACE is used for user-mode strace logging. */
 #define LOG_STRACE (1 << 19)
+#define LOG_REMOTE_DEBUG   (1 << 20)
 
 /* Lock output for a series of related logs.  Since this is not needed
  * for a single qemu_log / qemu_log_mask / qemu_log_mask_and_addr, we
diff --git a/util/log.c b/util/log.c
index 2da6cb31dc..1f90e70cdd 100644
--- a/util/log.c
+++ b/util/log.c
@@ -334,6 +334,8 @@ const QEMULogItem qemu_log_items[] = {
 #endif
 { LOG_STRACE, "strace",
   "log every user-mode syscall, its input, and its result" },
+{ LOG_REMOTE_DEBUG, "rdebug",
+  "log remote debug" },
 { 0, NULL, NULL },
 };
 
-- 
2.25.GIT




[PATCH RESEND v6 33/36] multi-process/mon: enable QMP module support in the remote process

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

Build system changes to enable QMP module in the remote process

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
---
 Makefile.objs   | 10 ++
 Makefile.target | 35 +--
 block/Makefile.objs |  3 +++
 block/monitor/Makefile.objs |  2 ++
 hmp-commands.hx |  1 +
 hw/core/Makefile.objs   |  1 +
 monitor/Makefile.objs   |  4 
 monitor/misc.c  |  8 
 qapi/Makefile.objs  |  2 ++
 qom/Makefile.objs   |  1 +
 stubs/monitor.c |  1 +
 ui/Makefile.objs|  2 ++
 12 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/Makefile.objs b/Makefile.objs
index ff3f06b146..cdb55b2f82 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -32,6 +32,7 @@ remote-pci-obj-$(CONFIG_MPQEMU) += migration/
 remote-pci-obj-$(CONFIG_MPQEMU) += remote/
 remote-pci-obj-$(CONFIG_MPQEMU) += accel/
 remote-pci-obj-$(CONFIG_MPQEMU) += util/
+remote-pci-obj-$(CONFIG_MPQEMU) += monitor/
 
 remote-pci-obj-$(CONFIG_MPQEMU) += cpus-common.o
 remote-pci-obj-$(CONFIG_MPQEMU) += dma-helpers.o
@@ -45,6 +46,10 @@ remote-pci-obj-$(CONFIG_MPQEMU) += qemu-parse.o
 # remote-lsi-obj-y is code used to implement remote LSI device
 
 remote-lsi-obj-$(CONFIG_MPQEMU) += hw/
+remote-lsi-obj-$(CONFIG_MPQEMU) += ui/
+remote-lsi-obj-$(CONFIG_MPQEMU) += block/
+
+#remote-lsi-obj-$(CONFIG_MPQEMU) += device-hotplug.o
 
 ###
 # crypto-obj-y is code used by both qemu system emulation and qemu-img
@@ -113,6 +118,11 @@ common-obj-y += util/machine-notify.o
 
 endif # CONFIG_SOFTMMU
 
+remote-pci-obj-$(CONFIG_MPQEMU) += qapi/
+remote-pci-obj-$(CONFIG_MPQEMU) += blockdev-nbd.o
+remote-pci-obj-$(CONFIG_MPQEMU) += job-qmp.o
+remote-pci-obj-$(CONFIG_MPQEMU) += balloon.o
+
 ###
 # Target-independent parts used in system and user emulation
 common-obj-y += cpus-common.o
diff --git a/Makefile.target b/Makefile.target
index adc76886f8..1e9e102df8 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -142,13 +142,32 @@ remote-pci-tgt-obj-$(CONFIG_MPQEMU) += 
accel/stubs/hax-stub.o
 remote-pci-tgt-obj-$(CONFIG_MPQEMU) += accel/stubs/whpx-stub.o
 remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/vl-stub.o
 remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/net-stub.o
-remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/monitor.o
 remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/replay.o
 remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/xen-mapcache.o
 remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/audio.o
 remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/monitor.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/migration.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/ui-stub.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/gdbstub.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/qapi-target.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/qapi-misc.o
 
 remote-pci-tgt-obj-$(CONFIG_MPQEMU) += remote/memory.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += arch_init.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += monitor/misc.o
+
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += qapi/qapi-introspect.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += qapi/qapi-commands-block-core.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += qapi/qapi-commands-block.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += qapi/qapi-commands-misc.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += qapi/qapi-commands.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += qapi/qapi-commands-machine-target.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += qapi/qapi-commands-misc-target.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += qapi/qapi-visit-machine-target.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += qapi/qapi-visit-misc-target.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += qapi/qapi-types-machine-target.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += qapi/qapi-types-misc-target.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += qapi/qapi-init-commands.o
 endif
 
 #
@@ -204,6 +223,10 @@ endif
 generated-files-y += hmp-commands.h hmp-commands-info.h
 generated-files-y += config-devices.h
 
+ifdef CONFIG_MPQEMU
+generated-files-y += hmp-scsi-commands.h hmp-scsi-commands-info.h
+endif
+
 endif # CONFIG_SOFTMMU
 
 dummy := $(call unnest-vars,,obj-y)
@@ -289,10 +312,18 @@ hmp-commands.h: $(SRC_PATH)/hmp-commands.hx 
$(SRC_PATH)/scripts/hxtool
 hmp-commands-info.h: $(SRC_PATH)/hmp-commands-info.hx 
$(SRC_PATH)/scripts/hxtool
$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > 
$@,"GEN","$(TARGET_DIR)$@")
 
+ifdef CONFIG_MPQEMU
+hmp-scsi-commands.h: $(SRC_PATH)/hmp-commands.hx $(SRC_PATH)/scripts/hxtool
+   $(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -tgt scsi < $< > $@)
+
+hmp-scsi-commands-info.h: $(SRC_PATH)/hmp-commands-info.hx 
$(SRC_PATH)/scripts/hxtool
+   $(cal

[PATCH RESEND v6 22/36] multi-process: Synchronize remote memory

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

Add memory-listener object which is used to keep the view of the RAM
in sync between QEMU and remote process.
A MemoryListener is registered for system-memory AddressSpace. The
listener sends SYNC_SYSMEM message to the remote process when memory
listener commits the changes to memory, the remote process receives
the message and processes it in the handler for SYNC_SYSMEM message.

Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
Signed-off-by: Elena Ufimtseva 
---
 MAINTAINERS|   2 +
 Makefile.target|   3 +
 hw/proxy/memory-sync.c | 217 +
 hw/proxy/qemu-proxy.c  |   6 +
 include/hw/proxy/memory-sync.h |  37 ++
 include/hw/proxy/qemu-proxy.h  |   6 +
 remote/remote-main.c   |  11 ++
 7 files changed, 282 insertions(+)
 create mode 100644 hw/proxy/memory-sync.c
 create mode 100644 include/hw/proxy/memory-sync.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 3da3dcd311..9ebb46722a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2869,6 +2869,8 @@ F: remote/memory.c
 F: hw/proxy/Makefile.objs
 F: hw/proxy/qemu-proxy.c
 F: include/hw/proxy/qemu-proxy.h
+F: include/hw/proxy/memory-sync.h
+F: hw/proxy/memory-sync.c
 
 Build and test automation
 -
diff --git a/Makefile.target b/Makefile.target
index 500fa07fda..c64d860895 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -127,6 +127,9 @@ obj-$(CONFIG_TCG) += fpu/softfloat.o
 obj-y += target/$(TARGET_BASE_ARCH)/
 obj-y += disas.o
 obj-$(call notempty,$(TARGET_XML_FILES)) += gdbstub-xml.o
+ifeq ($(TARGET_NAME)-$(CONFIG_MPQEMU)-$(CONFIG_USER_ONLY), x86_64-y-)
+obj-$(CONFIG_MPQEMU) += hw/proxy/memory-sync.o
+endif
 LIBS := $(libs_cpu) $(LIBS)
 
 obj-$(CONFIG_PLUGIN) += plugins/
diff --git a/hw/proxy/memory-sync.c b/hw/proxy/memory-sync.c
new file mode 100644
index 00..b3f57747f3
--- /dev/null
+++ b/hw/proxy/memory-sync.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include 
+#include 
+#include 
+
+#include "qemu/osdep.h"
+#include "qemu/compiler.h"
+#include "qemu/int128.h"
+#include "qemu/range.h"
+#include "exec/memory.h"
+#include "exec/cpu-common.h"
+#include "cpu.h"
+#include "exec/ram_addr.h"
+#include "exec/address-spaces.h"
+#include "io/mpqemu-link.h"
+#include "hw/proxy/memory-sync.h"
+
+static const TypeInfo remote_mem_sync_type_info = {
+.name  = TYPE_MEMORY_LISTENER,
+.parent= TYPE_OBJECT,
+.instance_size = sizeof(RemoteMemSync),
+};
+
+static void remote_mem_sync_register_types(void)
+{
+type_register_static(&remote_mem_sync_type_info);
+}
+
+type_init(remote_mem_sync_register_types)
+
+static void proxy_ml_begin(MemoryListener *listener)
+{
+RemoteMemSync *sync = container_of(listener, RemoteMemSync, listener);
+int mrs;
+
+for (mrs = 0; mrs < sync->n_mr_sections; mrs++) {
+memory_region_unref(sync->mr_sections[mrs].mr);
+}
+
+g_free(sync->mr_sections);
+sync->mr_sections = NULL;
+sync->n_mr_sections = 0;
+}
+
+static int get_fd_from_hostaddr(uint64_t host, ram_addr_t *offset)
+{
+MemoryRegion *mr;
+ram_addr_t off;
+
+/**
+ * Assumes that the host address is a valid address as it's
+ * coming from the MemoryListener system. In the case host
+ * address is not valid, the following call would return
+ * the default subregion of "system_memory" region, and
+ * not NULL. So it's not possible to check for NULL here.
+ */
+mr = memory_region_from_host((void *)(uintptr_t)host, &off);
+
+if (offset) {
+*offset = off;
+}
+
+return memory_region_get_fd(mr);
+}
+
+static bool proxy_mrs_can_merge(uint64_t host, uint64_t prev_host, size_t size)
+{
+bool merge;
+int fd1, fd2;
+
+fd1 = get_fd_from_hostaddr(host, NULL);
+
+fd2 = get_fd_from_hostaddr(prev_host, NULL);
+
+merge = (fd1 == fd2);
+
+merge &= ((prev_host + size) == host);
+
+return merge;
+}
+
+static void proxy_ml_region_addnop(MemoryListener *listener,
+   MemoryRegionSection *section)
+{
+RemoteMemSync *sync = container_of(listener, RemoteMemSync, listener);
+bool need_add = true;
+uint64_t mrs_size, mrs_gpa, mrs_page;
+uintptr_t mrs_host;
+RAMBlock *mrs_rb;
+MemoryRegionSection *prev_sec;
+
+if (!(memory_region_is_ram(section->mr) &&
+  !memory_region_is_rom(section->mr))) {
+return;
+}
+
+mrs_rb = section->mr->ram_block;
+mrs_page = (uint64_t)qemu_ram_pagesize(mrs_rb);
+mrs_size = int128_get64(section->size);
+mrs_gpa = s

[PATCH RESEND v6 16/36] multi-process: remote process initialization

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

Adds the handler to process message from QEMU,
Initialize remote process main loop, handles SYNC_SYSMEM
message by updating its "system_memory" container using
shared file descriptors received from QEMU.

Signed-off-by: John G Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
---
 MAINTAINERS|   1 +
 remote/remote-common.h |  21 +
 remote/remote-main.c   | 104 +
 3 files changed, 126 insertions(+)
 create mode 100644 remote/remote-common.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 98237fff62..96f8d7ff19 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2857,6 +2857,7 @@ M: John G Johnson 
 S: Maintained
 F: remote/Makefile.objs
 F: remote/remote-main.c
+F: remote/remote-common.h
 F: include/io/mpqemu-link.h
 F: io/mpqemu-link.c
 F: include/remote/pcihost.h
diff --git a/remote/remote-common.h b/remote/remote-common.h
new file mode 100644
index 00..12c8d066cf
--- /dev/null
+++ b/remote/remote-common.h
@@ -0,0 +1,21 @@
+/*
+ * Remote device initialization
+ *
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+#ifndef REMOTE_COMMON_H
+#define REMOTE_COMMON_H
+
+extern const char *__progname;
+
+#define PROC_INFO_LENGTH 1024
+gchar *print_pid_exec(gchar *str);
+
+#endif /* REMOTE_COMMON_H */
+
+
+
diff --git a/remote/remote-main.c b/remote/remote-main.c
index ecf30e0cba..dbd6ad2529 100644
--- a/remote/remote-main.c
+++ b/remote/remote-main.c
@@ -12,6 +12,7 @@
 #include "qemu-common.h"
 
 #include 
+#include 
 
 #include "qemu/module.h"
 #include "remote/pcihost.h"
@@ -19,12 +20,115 @@
 #include "hw/boards.h"
 #include "hw/qdev-core.h"
 #include "qemu/main-loop.h"
+#include "remote/memory.h"
+#include "io/mpqemu-link.h"
+#include "qapi/error.h"
+#include "qemu/main-loop.h"
+#include "sysemu/cpus.h"
+#include "qemu-common.h"
+#include "hw/pci/pci.h"
+#include "qemu/thread.h"
+#include "qemu/main-loop.h"
+#include "qemu/config-file.h"
+#include "sysemu/sysemu.h"
+#include "block/block.h"
+#include "exec/ramlist.h"
+#include "remote/remote-common.h"
+
+static MPQemuLinkState *mpqemu_link;
+
+gchar *print_pid_exec(gchar *str)
+{
+if (!str) {
+return NULL;
+}
+memset(str, 0, PROC_INFO_LENGTH);
+g_snprintf(str, PROC_INFO_LENGTH, "pid %d, exec name %s ",
+   getpid(), __progname);
+return str;
+}
+
+static void process_msg(GIOCondition cond, MPQemuLinkState *link,
+MPQemuChannel *chan)
+{
+MPQemuMsg *msg = NULL;
+Error *err = NULL;
+g_autofree gchar *pid_exec = NULL;
+
+pid_exec = g_malloc(PROC_INFO_LENGTH);
+
+if ((cond & G_IO_HUP) || (cond & G_IO_ERR)) {
+goto finalize_loop;
+}
+
+msg = g_malloc0(sizeof(MPQemuMsg));
+
+if (mpqemu_msg_recv(msg, chan) < 0) {
+error_setg(&err, "Failed to receive message in remote process %s",
+   print_pid_exec(pid_exec));
+goto finalize_loop;
+}
+
+switch (msg->cmd) {
+case INIT:
+break;
+default:
+error_setg(&err, "Unknown command in %s", print_pid_exec(pid_exec));
+goto finalize_loop;
+}
+
+g_free(msg->data2);
+g_free(msg);
+
+return;
+
+finalize_loop:
+if (err) {
+error_report_err(err);
+}
+g_free(msg);
+mpqemu_link_finalize(mpqemu_link);
+mpqemu_link = NULL;
+}
 
 int main(int argc, char *argv[])
 {
+Error *err = NULL;
+
 module_call_init(MODULE_INIT_QOM);
 
+bdrv_init_with_whitelist();
+
+if (qemu_init_main_loop(&err)) {
+error_report_err(err);
+return -EBUSY;
+}
+
+qemu_init_cpu_loop();
+
+page_size_init();
+
+qemu_mutex_init(&ram_list.mutex);
+
 current_machine = MACHINE(REMOTE_MACHINE(object_new(TYPE_REMOTE_MACHINE)));
 
+mpqemu_link = mpqemu_link_create();
+if (!mpqemu_link) {
+printf("Could not create MPQemu link pid %d, exec_name %s",
+   getpid(), __progname);
+return -1;
+}
+
+mpqemu_init_channel(mpqemu_link, &mpqemu_link->com, STDIN_FILENO);
+
+mpqemu_link_set_callback(mpqemu_link, process_msg);
+
+qdev_machine_creation_done();
+qemu_mutex_lock_iothread();
+qemu_run_machine_init_done_notifiers();
+qemu_mutex_unlock_iothread();
+
+mpqemu_start_coms(mpqemu_link, mpqemu_link->com);
+
 return 0;
 }
-- 
2.25.GIT




[PATCH RESEND v6 15/36] multi-process: setup memory manager for remote device

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

sync_sysmem_msg_t message format is defined. It is used to send
file descriptors of the RAM regions to remote device.
RAM on the remote device is configured with a set of file descriptors.
Old RAM regions are deleted and new regions, each with an fd, is
added to the RAM.

Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
Signed-off-by: Elena Ufimtseva 
---
 MAINTAINERS  |  2 ++
 Makefile.target  |  2 ++
 exec.c   | 17 +++
 include/exec/ram_addr.h  |  2 ++
 include/io/mpqemu-link.h | 12 
 include/remote/memory.h  | 20 +
 io/mpqemu-link.c | 13 +
 remote/memory.c  | 63 
 8 files changed, 131 insertions(+)
 create mode 100644 include/remote/memory.h
 create mode 100644 remote/memory.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 09764e461c..98237fff62 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2863,6 +2863,8 @@ F: include/remote/pcihost.h
 F: remote/pcihost.c
 F: include/remote/machine.h
 F: remote/machine.c
+F: include/remote/memory.h
+F: remote/memory.c
 
 Build and test automation
 -
diff --git a/Makefile.target b/Makefile.target
index 70fa1eeca5..500fa07fda 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -143,6 +143,8 @@ remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/replay.o
 remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/xen-mapcache.o
 remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/audio.o
 remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/monitor.o
+
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += remote/memory.o
 endif
 
 #
diff --git a/exec.c b/exec.c
index 5b1e414099..1e02e00f00 100644
--- a/exec.c
+++ b/exec.c
@@ -2371,6 +2371,23 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, 
MemoryRegion *mr,
 
 return block;
 }
+
+void qemu_ram_init_from_fd(MemoryRegion *mr, int fd, uint64_t size,
+   ram_addr_t offset, Error **errp)
+{
+char *name = g_strdup_printf("%d", fd);
+
+memory_region_init(mr, NULL, name, size);
+mr->ram = true;
+mr->terminates = true;
+mr->destructor = NULL;
+mr->align = 0;
+mr->ram_block = qemu_ram_alloc_from_fd(size, mr, RAM_SHARED, fd, offset,
+   errp);
+mr->dirty_log_mask = tcg_enabled() ? (1 << DIRTY_MEMORY_CODE) : 0;
+
+g_free(name);
+}
 #endif
 
 static
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 1b9f489ff0..d9d7314f51 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -122,6 +122,8 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, 
MemoryRegion *mr,
 RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
  uint32_t ram_flags, int fd,
  off_t offset, Error **errp);
+void qemu_ram_init_from_fd(MemoryRegion *mr, int fd, uint64_t size,
+   ram_addr_t offset, Error **errp);
 
 RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
   MemoryRegion *mr, Error **errp);
diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
index ef95599bca..d46cb81058 100644
--- a/include/io/mpqemu-link.h
+++ b/include/io/mpqemu-link.h
@@ -16,6 +16,8 @@
 
 #include "qom/object.h"
 #include "qemu/thread.h"
+#include "exec/cpu-common.h"
+#include "exec/hwaddr.h"
 
 #define TYPE_MPQEMU_LINK "mpqemu-link"
 #define MPQEMU_LINK(obj) \
@@ -27,15 +29,23 @@
 
 /**
  * mpqemu_cmd_t:
+ * SYNC_SYSMEM  Shares QEMU's RAM with remote device's RAM
  *
  * proc_cmd_t enum type to specify the command to be executed on the remote
  * device.
  */
 typedef enum {
 INIT = 0,
+SYNC_SYSMEM,
 MAX,
 } mpqemu_cmd_t;
 
+typedef struct {
+hwaddr gpas[REMOTE_MAX_FDS];
+uint64_t sizes[REMOTE_MAX_FDS];
+ram_addr_t offsets[REMOTE_MAX_FDS];
+} sync_sysmem_msg_t;
+
 /**
  * MPQemuMsg:
  * @cmd: The remote command
@@ -49,6 +59,7 @@ typedef enum {
  * MPQemuMsg Format of the message sent to the remote device from QEMU.
  *
  */
+
 typedef struct {
 mpqemu_cmd_t cmd;
 int bytestream;
@@ -56,6 +67,7 @@ typedef struct {
 
 union {
 uint64_t u64;
+sync_sysmem_msg_t sync_sysmem;
 } data1;
 
 int fds[REMOTE_MAX_FDS];
diff --git a/include/remote/memory.h b/include/remote/memory.h
new file mode 100644
index 00..e2e479bb6f
--- /dev/null
+++ b/include/remote/memory.h
@@ -0,0 +1,20 @@
+/*
+ * Memory manager for remote device
+ *
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef REMOTE_MEMORY_H
+#define REMOTE_MEMORY_H
+
+#include "qemu/osdep.h"
+#include "exec/hwaddr

[PATCH RESEND v6 18/36] multi-process: Initialize Proxy Object's communication channel

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

Add "socket" object property which initializes the communication channel

Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
---
 hw/proxy/qemu-proxy.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/hw/proxy/qemu-proxy.c b/hw/proxy/qemu-proxy.c
index bf6c4117ef..40bf56fd37 100644
--- a/hw/proxy/qemu-proxy.c
+++ b/hw/proxy/qemu-proxy.c
@@ -14,6 +14,25 @@
 #include "hw/proxy/qemu-proxy.h"
 #include "hw/pci/pci.h"
 
+static void proxy_set_socket(Object *obj, const char *str, Error **errp)
+{
+PCIProxyDev *pdev = PCI_PROXY_DEV(obj);
+
+pdev->socket = atoi(str);
+
+mpqemu_init_channel(pdev->mpqemu_link, &pdev->mpqemu_link->com,
+pdev->socket);
+}
+
+static void proxy_init(Object *obj)
+{
+PCIProxyDev *pdev = PCI_PROXY_DEV(obj);
+
+pdev->mpqemu_link = mpqemu_link_create();
+
+object_property_add_str(obj, "socket", NULL, proxy_set_socket, NULL);
+}
+
 static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
 {
 PCIProxyDev *dev = PCI_PROXY_DEV(device);
@@ -41,6 +60,7 @@ static const TypeInfo pci_proxy_dev_type_info = {
 .instance_size = sizeof(PCIProxyDev),
 .class_size= sizeof(PCIProxyDevClass),
 .class_init= pci_proxy_dev_class_init,
+.instance_init = proxy_init,
 .interfaces = (InterfaceInfo[]) {
 { INTERFACE_CONVENTIONAL_PCI_DEVICE },
 { },
-- 
2.25.GIT




[PATCH RESEND v6 20/36] multi-process: Forward PCI config space acceses to the remote process

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

The Proxy Object sends the PCI config space accesses as messages
to the remote process over the communication channel

Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
---
 hw/proxy/qemu-proxy.c| 61 
 include/io/mpqemu-link.h |  8 ++
 io/mpqemu-link.c |  6 
 remote/remote-main.c | 32 +
 4 files changed, 107 insertions(+)

diff --git a/hw/proxy/qemu-proxy.c b/hw/proxy/qemu-proxy.c
index 9b5e429a88..87cf39c672 100644
--- a/hw/proxy/qemu-proxy.c
+++ b/hw/proxy/qemu-proxy.c
@@ -14,6 +14,65 @@
 #include "hw/proxy/qemu-proxy.h"
 #include "hw/pci/pci.h"
 
+static int config_op_send(PCIProxyDev *dev, uint32_t addr, uint32_t *val, int 
l,
+  unsigned int op)
+{
+MPQemuMsg msg;
+struct conf_data_msg conf_data;
+int wait;
+
+memset(&msg, 0, sizeof(MPQemuMsg));
+conf_data.addr = addr;
+conf_data.val = (op == PCI_CONFIG_WRITE) ? *val : 0;
+conf_data.l = l;
+
+msg.data2 = (uint8_t *)&conf_data;
+if (!msg.data2) {
+return -ENOMEM;
+}
+
+msg.size = sizeof(conf_data);
+msg.cmd = op;
+msg.bytestream = 1;
+
+if (op == PCI_CONFIG_WRITE) {
+msg.num_fds = 0;
+} else {
+/* TODO: Dont create fd each time for send. */
+wait = GET_REMOTE_WAIT;
+msg.num_fds = 1;
+msg.fds[0] = wait;
+}
+
+mpqemu_msg_send(&msg, dev->mpqemu_link->dev);
+
+if (op == PCI_CONFIG_READ) {
+*val = (uint32_t)wait_for_remote(wait);
+PUT_REMOTE_WAIT(wait);
+}
+
+return 0;
+}
+
+static uint32_t pci_proxy_read_config(PCIDevice *d, uint32_t addr, int len)
+{
+uint32_t val;
+
+(void)pci_default_read_config(d, addr, len);
+
+config_op_send(PCI_PROXY_DEV(d), addr, &val, len, PCI_CONFIG_READ);
+
+return val;
+}
+
+static void pci_proxy_write_config(PCIDevice *d, uint32_t addr, uint32_t val,
+   int l)
+{
+pci_default_write_config(d, addr, val, l);
+
+config_op_send(PCI_PROXY_DEV(d), addr, &val, l, PCI_CONFIG_WRITE);
+}
+
 static void proxy_set_socket(Object *obj, const char *str, Error **errp)
 {
 PCIProxyDev *pdev = PCI_PROXY_DEV(obj);
@@ -86,6 +145,8 @@ static void pci_proxy_dev_class_init(ObjectClass *klass, 
void *data)
 PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
 
 k->realize = pci_proxy_dev_realize;
+k->config_read = pci_proxy_read_config;
+k->config_write = pci_proxy_write_config;
 }
 
 static const TypeInfo pci_proxy_dev_type_info = {
diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
index ebae9afc45..7228a1915e 100644
--- a/include/io/mpqemu-link.h
+++ b/include/io/mpqemu-link.h
@@ -39,9 +39,17 @@ typedef enum {
 INIT = 0,
 SYNC_SYSMEM,
 CONNECT_DEV,
+PCI_CONFIG_WRITE,
+PCI_CONFIG_READ,
 MAX,
 } mpqemu_cmd_t;
 
+struct conf_data_msg {
+uint32_t addr;
+uint32_t val;
+int l;
+};
+
 typedef struct {
 hwaddr gpas[REMOTE_MAX_FDS];
 uint64_t sizes[REMOTE_MAX_FDS];
diff --git a/io/mpqemu-link.c b/io/mpqemu-link.c
index f780b65181..ef4a07b81a 100644
--- a/io/mpqemu-link.c
+++ b/io/mpqemu-link.c
@@ -381,6 +381,12 @@ bool mpqemu_msg_valid(MPQemuMsg *msg)
 return false;
 }
 break;
+case PCI_CONFIG_WRITE:
+case PCI_CONFIG_READ:
+if (msg->size != sizeof(struct conf_data_msg)) {
+return false;
+}
+break;
 default:
 break;
 }
diff --git a/remote/remote-main.c b/remote/remote-main.c
index f541baae6a..834574e172 100644
--- a/remote/remote-main.c
+++ b/remote/remote-main.c
@@ -53,6 +53,32 @@ gchar *print_pid_exec(gchar *str)
 
 #define LINK_TO_DEV(link) ((PCIDevice *)link->opaque)
 
+static void process_config_write(PCIDevice *dev, MPQemuMsg *msg)
+{
+struct conf_data_msg *conf = (struct conf_data_msg *)msg->data2;
+
+qemu_mutex_lock_iothread();
+pci_default_write_config(dev, conf->addr, conf->val, conf->l);
+qemu_mutex_unlock_iothread();
+}
+
+static void process_config_read(PCIDevice *dev, MPQemuMsg *msg)
+{
+struct conf_data_msg *conf = (struct conf_data_msg *)msg->data2;
+uint32_t val;
+int wait;
+
+wait = msg->fds[0];
+
+qemu_mutex_lock_iothread();
+val = pci_default_read_config(dev, conf->addr, conf->l);
+qemu_mutex_unlock_iothread();
+
+notify_proxy(wait, val);
+
+PUT_REMOTE_WAIT(wait);
+}
+
 static gpointer dev_thread(gpointer data)
 {
 MPQemuLinkState *link = data;
@@ -115,6 +141,12 @@ static void process_msg(GIOCondition cond, MPQemuLinkState 
*link,
 case CONNECT_DEV:
 process_connect_dev_msg(msg);
 break;
+case PCI_CONFIG_WRITE:
+process_config_write(LINK_TO_DEV(link), msg);
+break;
+case PCI_CONFIG_READ:
+process_config_read(LINK

[PATCH RESEND v6 29/36] multi-process: handle heartbeat messages in remote process

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

If the remote process is alive, it responds to proxy's heartbeat
messages

Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
Signed-off-by: Elena Ufimtseva 
---
 remote/remote-main.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/remote/remote-main.c b/remote/remote-main.c
index b37802151a..beac591fac 100644
--- a/remote/remote-main.c
+++ b/remote/remote-main.c
@@ -216,6 +216,7 @@ static void process_msg(GIOCondition cond, MPQemuLinkState 
*link,
 g_autofree gchar *pid_exec = NULL;
 
 pid_exec = g_malloc(PROC_INFO_LENGTH);
+int wait;
 
 if ((cond & G_IO_HUP) || (cond & G_IO_ERR)) {
 goto finalize_loop;
@@ -269,6 +270,10 @@ static void process_msg(GIOCondition cond, MPQemuLinkState 
*link,
 case GET_PCI_INFO:
 process_get_pci_info_msg(link, msg);
 break;
+case PROXY_PING:
+wait = msg->fds[0];
+notify_proxy(wait, 0);
+break;
 default:
 error_setg(&err, "Unknown command in %s", print_pid_exec(pid_exec));
 goto finalize_loop;
-- 
2.25.GIT




[PATCH RESEND v6 30/36] multi-process: perform device reset in the remote process

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

Perform device reset in the remote process when QEMU performs
device reset. This is required to reset the internal state
(like registers, etc...) of emulated devices

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
---
 hw/proxy/qemu-proxy.c| 25 +
 include/io/mpqemu-link.h |  1 +
 remote/remote-main.c | 13 +
 3 files changed, 39 insertions(+)

diff --git a/hw/proxy/qemu-proxy.c b/hw/proxy/qemu-proxy.c
index 162014353f..cffc227acd 100644
--- a/hw/proxy/qemu-proxy.c
+++ b/hw/proxy/qemu-proxy.c
@@ -291,14 +291,39 @@ static void pci_proxy_dev_exit(PCIDevice *pdev)
 stop_hb_timer(dev);
 }
 
+static void proxy_device_reset(DeviceState *dev)
+{
+PCIProxyDev *pdev = PCI_PROXY_DEV(dev);
+MPQemuMsg msg;
+int wait = -1;
+
+memset(&msg, 0, sizeof(MPQemuMsg));
+
+msg.bytestream = 0;
+msg.size = sizeof(msg.data1);
+msg.cmd = DEVICE_RESET;
+
+wait = GET_REMOTE_WAIT;
+msg.num_fds = 1;
+msg.fds[0] = wait;
+
+mpqemu_msg_send(&msg, pdev->mpqemu_link->com);
+
+(void)wait_for_remote(wait);
+
+PUT_REMOTE_WAIT(wait);
+}
+
 static void pci_proxy_dev_class_init(ObjectClass *klass, void *data)
 {
 PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+DeviceClass *dc = DEVICE_CLASS(klass);
 
 k->realize = pci_proxy_dev_realize;
 k->exit = pci_proxy_dev_exit;
 k->config_read = pci_proxy_read_config;
 k->config_write = pci_proxy_write_config;
+dc->reset = proxy_device_reset;
 }
 
 static const TypeInfo pci_proxy_dev_type_info = {
diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
index 45ea1fcafa..22fb234744 100644
--- a/include/io/mpqemu-link.h
+++ b/include/io/mpqemu-link.h
@@ -51,6 +51,7 @@ typedef enum {
 GET_PCI_INFO,
 RET_PCI_INFO,
 PROXY_PING,
+DEVICE_RESET,
 MAX,
 } mpqemu_cmd_t;
 
diff --git a/remote/remote-main.c b/remote/remote-main.c
index beac591fac..562db4ccc3 100644
--- a/remote/remote-main.c
+++ b/remote/remote-main.c
@@ -39,6 +39,7 @@
 #include "exec/address-spaces.h"
 #include "remote/iohub.h"
 #include "remote-opts.h"
+#include "sysemu/reset.h"
 
 static void process_msg(GIOCondition cond, MPQemuLinkState *link,
 MPQemuChannel *chan);
@@ -208,6 +209,15 @@ static void process_get_pci_info_msg(MPQemuLinkState 
*link, MPQemuMsg *msg)
 mpqemu_msg_send(&ret, link->dev);
 }
 
+static void process_device_reset_msg(MPQemuMsg *msg)
+{
+qemu_devices_reset();
+
+if (msg->num_fds == 1) {
+notify_proxy(msg->fds[0], 0);
+}
+}
+
 static void process_msg(GIOCondition cond, MPQemuLinkState *link,
 MPQemuChannel *chan)
 {
@@ -274,6 +284,9 @@ static void process_msg(GIOCondition cond, MPQemuLinkState 
*link,
 wait = msg->fds[0];
 notify_proxy(wait, 0);
 break;
+case DEVICE_RESET:
+process_device_reset_msg(msg);
+break;
 default:
 error_setg(&err, "Unknown command in %s", print_pid_exec(pid_exec));
 goto finalize_loop;
-- 
2.25.GIT




[PATCH RESEND v6 23/36] multi-process: create IOHUB object to handle irq

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

IOHUB object is added to manage PCI IRQs. It uses KVM_IRQFD
ioctl to create irqfd to injecting PCI interrupts to the guest.
IOHUB object forwards the irqfd to the remote process. Remote process
uses this fd to directly send interrupts to the guest, bypassing QEMU.

Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
Signed-off-by: Elena Ufimtseva 
---
 MAINTAINERS   |   2 +
 Makefile.target   |   1 +
 hw/Makefile.objs  |   2 -
 hw/proxy/Makefile.objs|   1 -
 hw/proxy/qemu-proxy.c |  52 
 include/hw/pci/pci_ids.h  |   3 +
 include/hw/proxy/qemu-proxy.h |   8 ++
 include/io/mpqemu-link.h  |   8 ++
 include/remote/iohub.h|  50 
 include/remote/machine.h  |   2 +
 io/mpqemu-link.c  |   1 +
 remote/Makefile.objs  |   1 +
 remote/iohub.c| 148 ++
 remote/machine.c  |  15 
 remote/remote-main.c  |   4 +
 15 files changed, 295 insertions(+), 3 deletions(-)
 delete mode 100644 hw/proxy/Makefile.objs
 create mode 100644 include/remote/iohub.h
 create mode 100644 remote/iohub.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 9ebb46722a..198c9f69bc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2871,6 +2871,8 @@ F: hw/proxy/qemu-proxy.c
 F: include/hw/proxy/qemu-proxy.h
 F: include/hw/proxy/memory-sync.h
 F: hw/proxy/memory-sync.c
+F: include/remote/iohub.h
+F: remote/iohub.c
 
 Build and test automation
 -
diff --git a/Makefile.target b/Makefile.target
index c64d860895..b956ab24f6 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -129,6 +129,7 @@ obj-y += disas.o
 obj-$(call notempty,$(TARGET_XML_FILES)) += gdbstub-xml.o
 ifeq ($(TARGET_NAME)-$(CONFIG_MPQEMU)-$(CONFIG_USER_ONLY), x86_64-y-)
 obj-$(CONFIG_MPQEMU) += hw/proxy/memory-sync.o
+obj-$(CONFIG_MPQEMU) += hw/proxy/qemu-proxy.o
 endif
 LIBS := $(libs_cpu) $(LIBS)
 
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 7b489b12a5..af9235b6f2 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -45,8 +45,6 @@ endif
 common-obj-y += $(devices-dirs-y)
 obj-y += $(devices-dirs-y)
 
-common-obj-$(CONFIG_MPQEMU) += proxy/
-
 remote-pci-obj-$(CONFIG_MPQEMU) += core/
 remote-pci-obj-$(CONFIG_MPQEMU) += block/
 remote-pci-obj-$(CONFIG_MPQEMU) += pci/
diff --git a/hw/proxy/Makefile.objs b/hw/proxy/Makefile.objs
deleted file mode 100644
index eb81624cf8..00
--- a/hw/proxy/Makefile.objs
+++ /dev/null
@@ -1 +0,0 @@
-common-obj-$(CONFIG_MPQEMU) += qemu-proxy.o
diff --git a/hw/proxy/qemu-proxy.c b/hw/proxy/qemu-proxy.c
index 2ac4c1528a..a78694736b 100644
--- a/hw/proxy/qemu-proxy.c
+++ b/hw/proxy/qemu-proxy.c
@@ -15,6 +15,9 @@
 #include "hw/pci/pci.h"
 #include "hw/proxy/memory-sync.h"
 #include "qom/object.h"
+#include "qemu/event_notifier.h"
+#include "sysemu/kvm.h"
+#include "util/event_notifier-posix.c"
 
 static int config_op_send(PCIProxyDev *dev, uint32_t addr, uint32_t *val, int 
l,
   unsigned int op)
@@ -75,6 +78,53 @@ static void pci_proxy_write_config(PCIDevice *d, uint32_t 
addr, uint32_t val,
 config_op_send(PCI_PROXY_DEV(d), addr, &val, l, PCI_CONFIG_WRITE);
 }
 
+static void proxy_intx_update(PCIDevice *pci_dev)
+{
+PCIProxyDev *dev = PCI_PROXY_DEV(pci_dev);
+PCIINTxRoute route;
+int pin = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1;
+
+if (dev->irqfd.fd) {
+dev->irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
+(void) kvm_vm_ioctl(kvm_state, KVM_IRQFD, &dev->irqfd);
+memset(&dev->irqfd, 0, sizeof(struct kvm_irqfd));
+}
+
+route = pci_device_route_intx_to_irq(pci_dev, pin);
+
+dev->irqfd.fd = event_notifier_get_fd(&dev->intr);
+dev->irqfd.resamplefd = event_notifier_get_fd(&dev->resample);
+dev->irqfd.gsi = route.irq;
+dev->irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
+(void) kvm_vm_ioctl(kvm_state, KVM_IRQFD, &dev->irqfd);
+}
+
+static void setup_irqfd(PCIProxyDev *dev)
+{
+PCIDevice *pci_dev = PCI_DEVICE(dev);
+MPQemuMsg msg;
+
+event_notifier_init(&dev->intr, 0);
+event_notifier_init(&dev->resample, 0);
+
+memset(&msg, 0, sizeof(MPQemuMsg));
+msg.cmd = SET_IRQFD;
+msg.num_fds = 2;
+msg.fds[0] = event_notifier_get_fd(&dev->intr);
+msg.fds[1] = event_notifier_get_fd(&dev->resample);
+msg.data1.set_irqfd.intx =
+pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1;
+msg.size = sizeof(msg.data1);
+
+mpqemu_msg_send(&msg, dev->mpqemu_link->dev);
+
+memset(&dev->irqfd, 0, sizeof(struct kvm_irqfd));
+
+proxy_intx_update(pci_dev);
+
+pci_device_set_intx_routing_notifier(pci_dev, proxy_intx_update);
+}
+
 static void proxy_set_socket(Object *obj, const char *str, Error **er

[PATCH RESEND v6 27/36] multi-process: add support to parse device option

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

Enable remote process to parse device command line option

Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
---
 remote/remote-opts.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/remote/remote-opts.c b/remote/remote-opts.c
index cb7837bf13..f077221c71 100644
--- a/remote/remote-opts.c
+++ b/remote/remote-opts.c
@@ -57,6 +57,13 @@ void parse_cmdline(int argc, char **argv, char **envp)
 exit(1);
 }
 break;
+case QEMU_OPTION_device:
+if (!qemu_opts_parse_noisily(qemu_find_opts("device"),
+optarg, true)) {
+error_report("Unable to process device command");
+exit(1);
+}
+break;
 default:
 break;
 }
@@ -71,5 +78,8 @@ void parse_cmdline(int argc, char **argv, char **envp)
 exit(0);
 }
 
+qemu_opts_foreach(qemu_find_opts("device"), device_init_func, NULL,
+  &error_fatal);
+
 return;
 }
-- 
2.25.GIT




[PATCH RESEND v6 25/36] multi-process: Introduce build flags to separate remote process code

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

Introduce SCSI_PROCESS & REMOTE_PROCESS build flags to separate
code that applies only to remote processes.

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
---
 Makefile.target | 4 
 rules.mak   | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/Makefile.target b/Makefile.target
index b956ab24f6..adc76886f8 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -269,6 +269,10 @@ ifdef CONFIG_DARWIN
$(call quiet-command,SetFile -a C $@,"SETFILE","$(TARGET_DIR)$@")
 endif
 
+ifdef CONFIG_MPQEMU
+$(SCSI_DEV_BUILD): REMOTE_FLAGS = -DREMOTE_PROCESS -DSCSI_PROCESS
+endif
+
 $(SCSI_DEV_BUILD): $(all-remote-lsi-obj-y) $(COMMON_LDADDS)
$(call LINK, $(filter-out %.mak, $^))
 ifdef CONFIG_DARWIN
diff --git a/rules.mak b/rules.mak
index 694865b63e..257f07e322 100644
--- a/rules.mak
+++ b/rules.mak
@@ -67,7 +67,7 @@ expand-objs = $(strip $(sort $(filter %.o,$1)) \
 
 %.o: %.c
$(call quiet-command,$(CC) $(QEMU_LOCAL_INCLUDES) $(QEMU_INCLUDES) \
-  $(QEMU_CFLAGS) $(QEMU_DGFLAGS) $(CFLAGS) $($@-cflags) \
+  $(QEMU_CFLAGS) $(QEMU_DGFLAGS) $(CFLAGS) $($@-cflags) 
$(REMOTE_FLAGS) \
   -c -o $@ $<,"CC","$(TARGET_DIR)$@")
 %.o: %.rc
$(call quiet-command,$(WINDRES) -I. -o $@ $<,"RC","$(TARGET_DIR)$@")
-- 
2.25.GIT




[PATCH RESEND v6 34/36] multi-process/mon: Initialize QMP module for remote processes

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
---
 remote/remote-main.c | 13 +
 remote/remote-opts.c | 11 +++
 2 files changed, 24 insertions(+)

diff --git a/remote/remote-main.c b/remote/remote-main.c
index 562db4ccc3..199af144dd 100644
--- a/remote/remote-main.c
+++ b/remote/remote-main.c
@@ -40,6 +40,9 @@
 #include "remote/iohub.h"
 #include "remote-opts.h"
 #include "sysemu/reset.h"
+#include "qemu-parse.h"
+#include "monitor/monitor.h"
+#include "chardev/char.h"
 
 static void process_msg(GIOCondition cond, MPQemuLinkState *link,
 MPQemuChannel *chan);
@@ -313,6 +316,8 @@ int main(int argc, char *argv[])
 
 module_call_init(MODULE_INIT_QOM);
 
+monitor_init_globals();
+
 bdrv_init_with_whitelist();
 
 if (qemu_init_main_loop(&err)) {
@@ -330,6 +335,8 @@ int main(int argc, char *argv[])
 
 qemu_add_opts(&qemu_device_opts);
 qemu_add_opts(&qemu_drive_opts);
+qemu_add_opts(&qemu_chardev_opts);
+qemu_add_opts(&qemu_mon_opts);
 qemu_add_drive_opts(&qemu_legacy_drive_opts);
 qemu_add_drive_opts(&qemu_common_drive_opts);
 qemu_add_drive_opts(&qemu_drive_opts);
@@ -351,6 +358,12 @@ int main(int argc, char *argv[])
 
 parse_cmdline(argc - 2, argv + 2, NULL);
 
+qemu_opts_foreach(qemu_find_opts("chardev"),
+  chardev_init_func, NULL, &error_fatal);
+
+qemu_opts_foreach(qemu_find_opts("mon"),
+  mon_init_func, NULL, &error_fatal);
+
 mpqemu_init_channel(mpqemu_link, &mpqemu_link->com, fd);
 
 mpqemu_link_set_callback(mpqemu_link, process_msg);
diff --git a/remote/remote-opts.c b/remote/remote-opts.c
index f077221c71..ac3a9be6b8 100644
--- a/remote/remote-opts.c
+++ b/remote/remote-opts.c
@@ -15,6 +15,7 @@
 #include "qemu-options.h"
 #include "qemu-parse.h"
 #include "remote-opts.h"
+#include "monitor/monitor.h"
 
 /*
  * In remote process, we parse only subset of options. The code
@@ -64,6 +65,16 @@ void parse_cmdline(int argc, char **argv, char **envp)
 exit(1);
 }
 break;
+case QEMU_OPTION_qmp:
+monitor_parse(optarg, "control", false);
+break;
+case QEMU_OPTION_monitor:
+if (!strncmp(optarg, "stdio", 5)) {
+warn_report("STDIO not supported in remote process");
+} else if (strncmp(optarg, "none", 4)) {
+monitor_parse(optarg, "readline", false);
+}
+break;
 default:
 break;
 }
-- 
2.25.GIT




[PATCH RESEND v6 32/36] multi-process/mon: stub functions to enable QMP module for remote process

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

QMP module doesn't need some functions to run independently on the
remote processes. However, these functions are necessary for
compilation. Therefore, these functions are stub'ed out. The
stub functions raise an assert if QEMU is built in debug mode
(--enable-debug).

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
---
 MAINTAINERS|   4 +
 accel/stubs/tcg-stub.c |  10 +++
 configure  |   4 +
 include/qemu-common.h  |   8 ++
 stubs/gdbstub.c|  23 ++
 stubs/migration.c  | 162 +
 stubs/monitor.c|  31 
 stubs/net-stub.c   |  69 ++
 stubs/qapi-misc.c  |  41 +++
 stubs/qapi-target.c|  56 ++
 stubs/ui-stub.c| 130 +
 stubs/vl-stub.c|  92 +++
 12 files changed, 630 insertions(+)
 create mode 100644 stubs/migration.c
 create mode 100644 stubs/qapi-misc.c
 create mode 100644 stubs/qapi-target.c
 create mode 100644 stubs/ui-stub.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 6a4b5e16be..216291f545 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1996,6 +1996,8 @@ F: blockdev-hmp-cmds.c
 F: block/qapi.c
 F: qapi/block*.json
 F: qapi/transaction.json
+F: stubs/qapi-misc.c
+F: stubs/qapi-target.c
 T: git https://repo.or.cz/qemu/armbru.git block-next
 
 Dirty Bitmaps
@@ -2120,6 +2122,7 @@ F: ui/
 F: include/ui/
 F: qapi/ui.json
 F: util/drm.c
+F: stubs/ui-stub.c
 
 Cocoa graphics
 M: Peter Maydell 
@@ -2152,6 +2155,7 @@ F: include/monitor/hmp-target.h
 F: tests/qtest/test-hmp.c
 F: include/qemu/qemu-print.h
 F: util/qemu-print.c
+F: stubs/migration.c
 
 Network device backends
 M: Jason Wang 
diff --git a/accel/stubs/tcg-stub.c b/accel/stubs/tcg-stub.c
index 2e4e8741fb..cdbceb5fff 100644
--- a/accel/stubs/tcg-stub.c
+++ b/accel/stubs/tcg-stub.c
@@ -119,4 +119,14 @@ page_collection_lock(tb_page_addr_t start, tb_page_addr_t 
end)
 void page_collection_unlock(struct page_collection *set)
 {
 }
+
+void dump_exec_info(void)
+{
+qemu_debug_assert(0);
+}
+
+void dump_opcount_info(void)
+{
+qemu_debug_assert(0);
+}
 #endif
diff --git a/configure b/configure
index 913c78d4ef..cd90cc5d02 100755
--- a/configure
+++ b/configure
@@ -7562,6 +7562,10 @@ if test "$mpqemu" = "yes" ; then
   echo "CONFIG_MPQEMU=y" >> $config_host_mak
 fi
 
+if test "$debug" = "yes" ; then
+  echo "CONFIG_DEBUG=y" >> $config_host_mak
+fi
+
 if test "$bochs" = "yes" ; then
   echo "CONFIG_BOCHS=y" >> $config_host_mak
 fi
diff --git a/include/qemu-common.h b/include/qemu-common.h
index d0142f29ac..b76e309d98 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -10,6 +10,8 @@
 #ifndef QEMU_COMMON_H
 #define QEMU_COMMON_H
 
+#include 
+
 #define TFR(expr) do { if ((expr) != -1) break; } while (errno == EINTR)
 
 /* Copyright string for -version arguments, About dialogs, etc */
@@ -135,4 +137,10 @@ void page_size_init(void);
  * returned. */
 bool dump_in_progress(void);
 
+#ifdef CONFIG_DEBUG
+#define qemu_debug_assert(x) assert(x)
+#else
+#define qemu_debug_assert(x)
+#endif
+
 #endif
diff --git a/stubs/gdbstub.c b/stubs/gdbstub.c
index 2b7aee50d3..1e65b54755 100644
--- a/stubs/gdbstub.c
+++ b/stubs/gdbstub.c
@@ -1,6 +1,29 @@
 #include "qemu/osdep.h"
+#include "qemu-common.h"
 #include "exec/gdbstub.h"   /* xml_builtin */
 
+#pragma weak gdbserver_start
+
 const char *const xml_builtin[][2] = {
   { NULL, NULL }
 };
+
+#ifdef CONFIG_USER_ONLY
+
+int gdbserver_start(int port)
+{
+qemu_debug_assert(0);
+
+return -ENOSYS;
+}
+
+#else
+
+int gdbserver_start(const char *device)
+{
+qemu_debug_assert(0);
+
+return -ENOSYS;
+}
+
+#endif
diff --git a/stubs/migration.c b/stubs/migration.c
new file mode 100644
index 00..28ccf80f21
--- /dev/null
+++ b/stubs/migration.c
@@ -0,0 +1,162 @@
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "migration/misc.h"
+#include "migration/snapshot.h"
+#include "qapi/qapi-types-migration.h"
+#include "qapi/qapi-commands-migration.h"
+#include "qapi/qapi-types-net.h"
+
+MigrationInfo *qmp_query_migrate(Error **errp)
+{
+qemu_debug_assert(0);
+
+return NULL;
+}
+
+void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
+  Error **errp)
+{
+qemu_debug_assert(0);
+}
+
+MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
+{
+qemu_debug_assert(0);
+
+return NULL;
+}
+
+void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
+{
+qemu_debug_assert(0);
+}
+
+MigrationParameters *qmp_query_migrate_parameters(Error **errp)
+{
+qemu_debug_assert(0);
+
+return NULL

[PATCH RESEND v6 10/36] multi-process: build system for remote device process

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

Modify Makefile to support the building of the remote
device process. Implements main() function of remote
device process.

Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
Signed-off-by: Elena Ufimtseva 
---
 MAINTAINERS |  8 ++
 Makefile|  2 ++
 Makefile.objs   | 27 ++
 Makefile.target | 61 -
 accel/Makefile.objs |  2 ++
 backends/Makefile.objs  |  2 ++
 block/Makefile.objs |  2 ++
 hw/Makefile.objs|  7 +
 hw/block/Makefile.objs  |  2 ++
 hw/core/Makefile.objs   | 18 
 hw/nvram/Makefile.objs  |  2 ++
 hw/pci/Makefile.objs|  4 +++
 hw/scsi/Makefile.objs   |  2 ++
 migration/Makefile.objs |  2 ++
 qom/Makefile.objs   |  3 ++
 remote/Makefile.objs|  1 +
 remote/remote-main.c| 23 
 stubs/replay.c  |  4 +++
 18 files changed, 171 insertions(+), 1 deletion(-)
 create mode 100644 remote/Makefile.objs
 create mode 100644 remote/remote-main.c

diff --git a/MAINTAINERS b/MAINTAINERS
index c5fba124e4..965f34d4f9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2850,6 +2850,14 @@ S: Maintained
 F: hw/semihosting/
 F: include/hw/semihosting/
 
+Multi-process QEMU
+M: Jagannathan Raman 
+M: Elena Ufimtseva 
+M: John G Johnson 
+S: Maintained
+F: remote/Makefile.objs
+F: remote/remote-main.c
+
 Build and test automation
 -
 Build and test automation
diff --git a/Makefile b/Makefile
index 8a9113e666..69337d6f8c 100644
--- a/Makefile
+++ b/Makefile
@@ -474,6 +474,8 @@ dummy := $(call unnest-vars,, \
 qom-obj-y \
 io-obj-y \
 common-obj-y \
+remote-pci-obj-y \
+remote-lsi-obj-y \
 common-obj-m \
 trace-obj-y)
 
diff --git a/Makefile.objs b/Makefile.objs
index f29c60c59d..f6654633b4 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -21,6 +21,33 @@ block-obj-$(CONFIG_REPLICATION) += replication.o
 
 block-obj-m = block/
 
+#
+# remote-pci-obj-y is common code used by remote devices
+
+remote-pci-obj-$(CONFIG_MPQEMU) += hw/
+remote-pci-obj-$(CONFIG_MPQEMU) += qom/
+remote-pci-obj-$(CONFIG_MPQEMU) += backends/
+remote-pci-obj-$(CONFIG_MPQEMU) += block/
+remote-pci-obj-$(CONFIG_MPQEMU) += migration/
+remote-pci-obj-$(CONFIG_MPQEMU) += remote/
+remote-pci-obj-$(CONFIG_MPQEMU) += accel/
+
+remote-pci-obj-$(CONFIG_MPQEMU) += cpus-common.o
+remote-pci-obj-$(CONFIG_MPQEMU) += dma-helpers.o
+remote-pci-obj-$(CONFIG_MPQEMU) += blockdev.o
+remote-pci-obj-$(CONFIG_MPQEMU) += qdev-monitor.o
+remote-pci-obj-$(CONFIG_MPQEMU) += bootdevice.o
+remote-pci-obj-$(CONFIG_MPQEMU) += iothread.o
+remote-pci-obj-$(CONFIG_MPQEMU) += qemu-parse.o
+
+##
+# remote-lsi-obj-y is code used to implement remote LSI device
+
+remote-lsi-obj-$(CONFIG_MPQEMU) += hw/
+
+###
+# crypto-obj-y is code used by both qemu system emulation and qemu-img
+
 crypto-obj-y = crypto/
 
 io-obj-y = io/
diff --git a/Makefile.target b/Makefile.target
index 8ed1eba95b..70fa1eeca5 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -36,7 +36,17 @@ QEMU_PROG_BUILD = $(QEMU_PROG)
 endif
 endif
 
+ifdef CONFIG_MPQEMU
+SCSI_DEV_PROG=qemu-scsi-dev
+SCSI_DEV_BUILD = $(SCSI_DEV_PROG)
+endif
+
 PROGS=$(QEMU_PROG) $(QEMU_PROGW)
+
+ifeq ($(TARGET_NAME)-$(CONFIG_MPQEMU)-$(CONFIG_USER_ONLY), x86_64-y-)
+PROGS += $(SCSI_DEV_PROG)
+endif
+
 STPFILES=
 
 config-target.h: config-target.h-timestamp
@@ -121,6 +131,20 @@ LIBS := $(libs_cpu) $(LIBS)
 
 obj-$(CONFIG_PLUGIN) += plugins/
 
+ifeq ($(TARGET_NAME)-$(CONFIG_MPQEMU)-$(CONFIG_USER_ONLY), x86_64-y-)
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += accel/stubs/kvm-stub.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += accel/stubs/tcg-stub.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += accel/stubs/hax-stub.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += accel/stubs/whpx-stub.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/vl-stub.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/net-stub.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/monitor.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/replay.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/xen-mapcache.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/audio.o
+remote-pci-tgt-obj-$(CONFIG_MPQEMU) += stubs/monitor.o
+endif
+
 #
 # Linux user emulator target
 
@@ -179,6 +203,20 @@ endif # CONFIG_SOFTMMU
 dummy := $(call unnest-vars,,obj-y)
 all-obj-y := $(obj-y)
 
+dummy := $(call unnest-vars,..,remote-pci-tgt-obj-y)
+all-remote-pci-obj-y := $(remote-pci-tgt-obj-y)
+
+ifeq ($(TARGET_NAME)-$(CONFIG_MPQEMU)-$(CONFIG_USER_ONLY), x86_64-y-)
+all-remote-pci-obj-y += memory.o
+all-remote-pci-obj-y += exec.o
+all-remote-pci-obj-y += exec-vary.o

[PATCH RESEND v6 02/36] multi-process: Refactor machine_init and exit notifiers

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

Relocate machine_int and exit notifiers into common code

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
---
 MAINTAINERS |  1 +
 Makefile.objs   |  1 +
 include/sysemu/sysemu.h |  2 ++
 softmmu/vl.c| 42 --
 stubs/Makefile.objs |  2 ++
 stubs/machine-init-add.c|  7 
 stubs/machine-init-done.c   |  5 ++-
 stubs/machine-init-remove.c |  8 +
 util/machine-notify.c   | 69 +
 9 files changed, 92 insertions(+), 45 deletions(-)
 create mode 100644 stubs/machine-init-add.c
 create mode 100644 stubs/machine-init-remove.c
 create mode 100644 util/machine-notify.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 8cbc1fac2b..04b19ac56c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2133,6 +2133,7 @@ F: util/qemu-timer.c
 F: softmmu/vl.c
 F: softmmu/main.c
 F: qapi/run-state.json
+F: util/machine-notify.c
 
 Human Monitor (HMP)
 M: Dr. David Alan Gilbert 
diff --git a/Makefile.objs b/Makefile.objs
index a7c967633a..bfb9271862 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -79,6 +79,7 @@ qemu-seccomp.o-libs := $(SECCOMP_LIBS)
 common-obj-$(CONFIG_FDT) += device_tree.o
 
 common-obj-y += qapi/
+common-obj-y += util/machine-notify.o
 
 endif # CONFIG_SOFTMMU
 
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index ef81302e1a..2438dd7bea 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -17,11 +17,13 @@ extern bool qemu_uuid_set;
 
 void qemu_add_exit_notifier(Notifier *notify);
 void qemu_remove_exit_notifier(Notifier *notify);
+void qemu_run_exit_notifiers(void);
 
 extern bool machine_init_done;
 
 void qemu_add_machine_init_done_notifier(Notifier *notify);
 void qemu_remove_machine_init_done_notifier(Notifier *notify);
+void qemu_run_machine_init_done_notifiers(void);
 
 extern int autostart;
 
diff --git a/softmmu/vl.c b/softmmu/vl.c
index 32c0047889..39cbb6b50d 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -172,12 +172,6 @@ int icount_align_option;
 QemuUUID qemu_uuid;
 bool qemu_uuid_set;
 
-static NotifierList exit_notifiers =
-NOTIFIER_LIST_INITIALIZER(exit_notifiers);
-
-static NotifierList machine_init_done_notifiers =
-NOTIFIER_LIST_INITIALIZER(machine_init_done_notifiers);
-
 bool xen_allowed;
 uint32_t xen_domid;
 enum xen_mode xen_mode = XEN_EMULATE;
@@ -2325,21 +2319,6 @@ static MachineClass *machine_parse(const char *name, 
GSList *machines)
 return mc;
 }
 
-void qemu_add_exit_notifier(Notifier *notify)
-{
-notifier_list_add(&exit_notifiers, notify);
-}
-
-void qemu_remove_exit_notifier(Notifier *notify)
-{
-notifier_remove(notify);
-}
-
-static void qemu_run_exit_notifiers(void)
-{
-notifier_list_notify(&exit_notifiers, NULL);
-}
-
 static const char *pid_file;
 static Notifier qemu_unlink_pidfile_notifier;
 
@@ -2350,27 +2329,6 @@ static void qemu_unlink_pidfile(Notifier *n, void *data)
 }
 }
 
-bool machine_init_done;
-
-void qemu_add_machine_init_done_notifier(Notifier *notify)
-{
-notifier_list_add(&machine_init_done_notifiers, notify);
-if (machine_init_done) {
-notify->notify(notify, NULL);
-}
-}
-
-void qemu_remove_machine_init_done_notifier(Notifier *notify)
-{
-notifier_remove(notify);
-}
-
-static void qemu_run_machine_init_done_notifiers(void)
-{
-machine_init_done = true;
-notifier_list_notify(&machine_init_done_notifiers, NULL);
-}
-
 static const QEMUOption *lookup_opt(int argc, char **argv,
 const char **poptarg, int *poptind)
 {
diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
index 45be5dc0ed..f884bb6180 100644
--- a/stubs/Makefile.objs
+++ b/stubs/Makefile.objs
@@ -43,4 +43,6 @@ stub-obj-y += pci-host-piix.o
 stub-obj-y += ram-block.o
 stub-obj-y += ramfb.o
 stub-obj-y += fw_cfg.o
+stub-obj-y += machine-init-add.o
+stub-obj-y += machine-init-remove.o
 stub-obj-$(CONFIG_SOFTMMU) += semihost.o
diff --git a/stubs/machine-init-add.c b/stubs/machine-init-add.c
new file mode 100644
index 00..520dcb9801
--- /dev/null
+++ b/stubs/machine-init-add.c
@@ -0,0 +1,7 @@
+#include "qemu/osdep.h"
+#include "sysemu/sysemu.h"
+
+void qemu_add_machine_init_done_notifier(Notifier *notify)
+{
+}
+
diff --git a/stubs/machine-init-done.c b/stubs/machine-init-done.c
index cd8e81392d..a34d838f7a 100644
--- a/stubs/machine-init-done.c
+++ b/stubs/machine-init-done.c
@@ -3,6 +3,5 @@
 
 bool machine_init_done = true;
 
-void qemu_add_machine_init_done_notifier(Notifier *notify)
-{
-}
+NotifierList machine_init_done_notifiers =
+NOTIFIER_LIST_INITIALIZER(machine_init_done_notifiers);
diff --git a/stubs/machine-init-remove.c b/stubs/machine-init-remove.c
new file mode 100644
index 00..30aee27c2d
--- /dev/null
+++ b/stubs/machine-init-remove.c
@@ -0,0 +1,8 @@
+#include "qemu/osdep.h"
+#include "sysemu/sy

[PATCH RESEND v6 35/36] multi-process: add the concept description to docs/devel/qemu-multiprocess

2020-04-22 Thread elena . ufimtseva
From: John G Johnson 

Signed-off-by: John G Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
---
 MAINTAINERS  |   1 +
 docs/devel/index.rst |   1 +
 docs/devel/multi-process.rst | 957 +++
 3 files changed, 959 insertions(+)
 create mode 100644 docs/devel/multi-process.rst

diff --git a/MAINTAINERS b/MAINTAINERS
index 216291f545..ed48615e15 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2879,6 +2879,7 @@ F: include/remote/iohub.h
 F: remote/iohub.c
 F: remote/remote-opts.h
 F: remote/remote-opts.c
+F: docs/devel/multi-process.rst
 
 Build and test automation
 -
diff --git a/docs/devel/index.rst b/docs/devel/index.rst
index a9e1200dff..95cea0d474 100644
--- a/docs/devel/index.rst
+++ b/docs/devel/index.rst
@@ -27,3 +27,4 @@ Contents:
bitops
reset
s390-dasd-ipl
+   multi-process
diff --git a/docs/devel/multi-process.rst b/docs/devel/multi-process.rst
new file mode 100644
index 00..406728854c
--- /dev/null
+++ b/docs/devel/multi-process.rst
@@ -0,0 +1,957 @@
+Multi-process QEMU
+===
+
+QEMU is often used as the hypervisor for virtual machines running in the
+Oracle cloud. Since one of the advantages of cloud computing is the
+ability to run many VMs from different tenants in the same cloud
+infrastructure, a guest that compromised its hypervisor could
+potentially use the hypervisor's access privileges to access data it is
+not authorized for.
+
+QEMU can be susceptible to security attacks because it is a large,
+monolithic program that provides many features to the VMs it services.
+Many of these features can be configured out of QEMU, but even a reduced
+configuration QEMU has a large amount of code a guest can potentially
+attack. Separating QEMU reduces the attack surface by aiding to
+limit each component in the system to only access the resources that
+it needs to perform its job.
+
+QEMU services
+-
+
+QEMU can be broadly described as providing three main services. One is a
+VM control point, where VMs can be created, migrated, re-configured, and
+destroyed. A second is to emulate the CPU instructions within the VM,
+often accelerated by HW virtualization features such as Intel's VT
+extensions. Finally, it provides IO services to the VM by emulating HW
+IO devices, such as disk and network devices.
+
+A multi-process QEMU
+
+
+A multi-process QEMU involves separating QEMU services into separate
+host processes. Each of these processes can be given only the privileges
+it needs to provide its service, e.g., a disk service could be given
+access only to the disk images it provides, and not be allowed to
+access other files, or any network devices. An attacker who compromised
+this service would not be able to use this exploit to access files or
+devices beyond what the disk service was given access to.
+
+A QEMU control process would remain, but in multi-process mode, will
+have no direct interfaces to the VM. During VM execution, it would still
+provide the user interface to hot-plug devices or live migrate the VM.
+
+A first step in creating a multi-process QEMU is to separate IO services
+from the main QEMU program, which would continue to provide CPU
+emulation. i.e., the control process would also be the CPU emulation
+process. In a later phase, CPU emulation could be separated from the
+control process.
+
+Separating IO services
+--
+
+Separating IO services into individual host processes is a good place to
+begin for a couple of reasons. One is the sheer number of IO devices QEMU
+can emulate provides a large surface of interfaces which could potentially
+be exploited, and, indeed, have been a source of exploits in the past.
+Another is the modular nature of QEMU device emulation code provides
+interface points where the QEMU functions that perform device emulation
+can be separated from the QEMU functions that manage the emulation of
+guest CPU instructions. The devices emulated in the separate process are
+referred to as remote devices.
+
+QEMU device emulation
+~
+
+QEMU uses an object oriented SW architecture for device emulation code.
+Configured objects are all compiled into the QEMU binary, then objects
+are instantiated by name when used by the guest VM. For example, the
+code to emulate a device named "foo" is always present in QEMU, but its
+instantiation code is only run when the device is included in the target
+VM. (e.g., via the QEMU command line as *-device foo*)
+
+The object model is hierarchical, so device emulation code names its
+parent object (such as "pci-device" for a PCI device) and QEMU will
+instantiate a parent object before calling the device's instantiation
+code.
+
+Current separation models
+~
+
+In order to separate the device emulation code from the CPU emulation
+code, the device object c

[PATCH RESEND v6 08/36] multi-process: Add stub functions to facilitate build of multi-process

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

Add stub functions that are needed during compile time but not in
runtime.
To avoid duplicate symbol while linking for monitor_get_fd, put in
a separate file.

Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
Signed-off-by: Elena Ufimtseva 
---
 MAINTAINERS|  3 ++
 accel/stubs/kvm-stub.c |  5 +++
 accel/stubs/tcg-stub.c | 98 ++
 stubs/Makefile.objs|  1 +
 stubs/audio.c  | 12 ++
 stubs/get-fd.c | 10 +
 stubs/monitor.c| 53 +--
 stubs/net-stub.c   | 31 +
 stubs/replay.c | 14 ++
 stubs/vl-stub.c| 79 ++
 stubs/vmstate.c| 19 
 stubs/xen-mapcache.c   | 22 ++
 12 files changed, 344 insertions(+), 3 deletions(-)
 create mode 100644 stubs/audio.c
 create mode 100644 stubs/get-fd.c
 create mode 100644 stubs/net-stub.c
 create mode 100644 stubs/vl-stub.c
 create mode 100644 stubs/xen-mapcache.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 2e700e6e64..c5fba124e4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -450,6 +450,7 @@ F: hw/pci-host/xen_igd_pt.c
 F: include/hw/block/dataplane/xen*
 F: include/hw/xen/
 F: include/sysemu/xen-mapcache.h
+F: stubs/xen-mapcache.c
 
 Guest CPU Cores (HAXM)
 -
@@ -1927,6 +1928,7 @@ F: include/hw/audio/
 F: tests/qtest/ac97-test.c
 F: tests/qtest/es1370-test.c
 F: tests/qtest/intel-hda-test.c
+F: stubs/audio.c
 
 Block layer core
 M: Kevin Wolf 
@@ -2159,6 +2161,7 @@ F: include/net/
 F: qemu-bridge-helper.c
 T: git https://github.com/jasowang/qemu.git net
 F: qapi/net.json
+F: stubs/net-stub.c
 
 Netmap network backend
 M: Luigi Rizzo 
diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
index 82f118d2df..baa6b38da4 100644
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -31,6 +31,7 @@ bool kvm_allowed;
 bool kvm_readonly_mem_allowed;
 bool kvm_ioeventfd_any_length_allowed;
 bool kvm_msi_use_devid;
+bool kvm_halt_in_kernel_allowed;
 
 int kvm_destroy_vcpu(CPUState *cpu)
 {
@@ -58,6 +59,10 @@ void kvm_cpu_synchronize_post_init(CPUState *cpu)
 {
 }
 
+void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
+{
+}
+
 int kvm_cpu_exec(CPUState *cpu)
 {
 abort();
diff --git a/accel/stubs/tcg-stub.c b/accel/stubs/tcg-stub.c
index 677191a69c..2e4e8741fb 100644
--- a/accel/stubs/tcg-stub.c
+++ b/accel/stubs/tcg-stub.c
@@ -11,14 +11,112 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu-common.h"
 #include "cpu.h"
 #include "tcg/tcg.h"
 #include "exec/exec-all.h"
+#include "translate-all.h"
+#include "exec/ram_addr.h"
+
+bool parallel_cpus;
 
 void tb_flush(CPUState *cpu)
 {
 }
 
+#ifdef CONFIG_MPQEMU
+void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr)
+{
+}
+
+void tb_invalidate_phys_range(ram_addr_t start, ram_addr_t end)
+{
+}
+
+void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end)
+{
+}
+
+void tb_invalidate_phys_page_fast(struct page_collection *pages,
+  tb_page_addr_t start, int len,
+  uintptr_t retaddr)
+{
+}
+
+void tlb_init(CPUState *cpu)
+{
+}
+
 void tlb_set_dirty(CPUState *cpu, target_ulong vaddr)
 {
 }
+
+void tlb_flush(CPUState *cpu)
+{
+}
+
+void tlb_flush_page(CPUState *cpu, target_ulong addr)
+{
+}
+
+void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
+{
+}
+
+void tcg_region_init(void)
+{
+}
+
+void tcg_register_thread(void)
+{
+}
+
+void tcg_flush_softmmu_tlb(CPUState *cs)
+{
+}
+
+void cpu_loop_exit_noexc(CPUState *cpu)
+{
+cpu->exception_index = -1;
+cpu_loop_exit(cpu);
+}
+
+void cpu_loop_exit(CPUState *cpu)
+{
+cpu->can_do_io = 1;
+siglongjmp(cpu->jmp_env, 1);
+}
+
+void cpu_reloading_memory_map(void)
+{
+}
+
+int cpu_exec(CPUState *cpu)
+{
+return 0;
+}
+
+void cpu_exec_step_atomic(CPUState *cpu)
+{
+}
+
+bool cpu_restore_state(CPUState *cpu, uintptr_t host_pc, bool will_exit)
+{
+return false;
+}
+
+void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)
+{
+cpu_loop_exit(cpu);
+}
+
+struct page_collection *
+page_collection_lock(tb_page_addr_t start, tb_page_addr_t end)
+{
+return NULL;
+}
+
+void page_collection_unlock(struct page_collection *set)
+{
+}
+#endif
diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
index f884bb6180..f74c7e927b 100644
--- a/stubs/Makefile.objs
+++ b/stubs/Makefile.objs
@@ -20,6 +20,7 @@ stub-obj-y += migr-blocker.o
 stub-obj-y += change-state-handler.o
 stub-obj-y += monitor.o
 stub-obj-y += monitor-core.o
+stub-obj-y += get-fd.o
 stub-obj-y += notify-event.o
 stub-obj-y += qtest.o
 stub-obj-y += replay.o
diff --git a/stubs/audio.c b/stubs/audio.c
new file mode 100644
index 00..8ae3b0f568
--- /dev/null
+++ b/stubs/audio.c
@@ -0,0 +1,12 @@
+#include "qemu/osdep.h"
+#include "audio/audio.h"
+
+Au

[PATCH RESEND v6 17/36] multi-process: introduce proxy object

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

Defines a PCI Device proxy object as a parent of TYPE_PCI_DEVICE.

PCI Proxy Object registers as a PCI device with QEMU and forwards all
PCI accesses to the remote process using the communication channel.

Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
---
 MAINTAINERS   |  3 ++
 hw/Makefile.objs  |  2 ++
 hw/proxy/Makefile.objs|  1 +
 hw/proxy/qemu-proxy.c | 56 +++
 include/hw/proxy/qemu-proxy.h | 46 
 include/io/mpqemu-link.h  |  1 +
 6 files changed, 109 insertions(+)
 create mode 100644 hw/proxy/Makefile.objs
 create mode 100644 hw/proxy/qemu-proxy.c
 create mode 100644 include/hw/proxy/qemu-proxy.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 96f8d7ff19..3da3dcd311 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2866,6 +2866,9 @@ F: include/remote/machine.h
 F: remote/machine.c
 F: include/remote/memory.h
 F: remote/memory.c
+F: hw/proxy/Makefile.objs
+F: hw/proxy/qemu-proxy.c
+F: include/hw/proxy/qemu-proxy.h
 
 Build and test automation
 -
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index af9235b6f2..7b489b12a5 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -45,6 +45,8 @@ endif
 common-obj-y += $(devices-dirs-y)
 obj-y += $(devices-dirs-y)
 
+common-obj-$(CONFIG_MPQEMU) += proxy/
+
 remote-pci-obj-$(CONFIG_MPQEMU) += core/
 remote-pci-obj-$(CONFIG_MPQEMU) += block/
 remote-pci-obj-$(CONFIG_MPQEMU) += pci/
diff --git a/hw/proxy/Makefile.objs b/hw/proxy/Makefile.objs
new file mode 100644
index 00..eb81624cf8
--- /dev/null
+++ b/hw/proxy/Makefile.objs
@@ -0,0 +1 @@
+common-obj-$(CONFIG_MPQEMU) += qemu-proxy.o
diff --git a/hw/proxy/qemu-proxy.c b/hw/proxy/qemu-proxy.c
new file mode 100644
index 00..bf6c4117ef
--- /dev/null
+++ b/hw/proxy/qemu-proxy.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "qapi/error.h"
+#include "io/mpqemu-link.h"
+#include "hw/proxy/qemu-proxy.h"
+#include "hw/pci/pci.h"
+
+static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
+{
+PCIProxyDev *dev = PCI_PROXY_DEV(device);
+PCIProxyDevClass *k = PCI_PROXY_DEV_GET_CLASS(dev);
+Error *local_err = NULL;
+
+if (k->realize) {
+k->realize(dev, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+}
+}
+}
+
+static void pci_proxy_dev_class_init(ObjectClass *klass, void *data)
+{
+PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+
+k->realize = pci_proxy_dev_realize;
+}
+
+static const TypeInfo pci_proxy_dev_type_info = {
+.name  = TYPE_PCI_PROXY_DEV,
+.parent= TYPE_PCI_DEVICE,
+.instance_size = sizeof(PCIProxyDev),
+.class_size= sizeof(PCIProxyDevClass),
+.class_init= pci_proxy_dev_class_init,
+.interfaces = (InterfaceInfo[]) {
+{ INTERFACE_CONVENTIONAL_PCI_DEVICE },
+{ },
+},
+};
+
+static void pci_proxy_dev_register_types(void)
+{
+type_register_static(&pci_proxy_dev_type_info);
+}
+
+type_init(pci_proxy_dev_register_types)
+
diff --git a/include/hw/proxy/qemu-proxy.h b/include/hw/proxy/qemu-proxy.h
new file mode 100644
index 00..d7eaf26f29
--- /dev/null
+++ b/include/hw/proxy/qemu-proxy.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_PROXY_H
+#define QEMU_PROXY_H
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "io/mpqemu-link.h"
+#include "hw/pci/pci.h"
+
+#define TYPE_PCI_PROXY_DEV "pci-proxy-dev"
+
+#define PCI_PROXY_DEV(obj) \
+OBJECT_CHECK(PCIProxyDev, (obj), TYPE_PCI_PROXY_DEV)
+
+#define PCI_PROXY_DEV_CLASS(klass) \
+OBJECT_CLASS_CHECK(PCIProxyDevClass, (klass), TYPE_PCI_PROXY_DEV)
+
+#define PCI_PROXY_DEV_GET_CLASS(obj) \
+OBJECT_GET_CLASS(PCIProxyDevClass, (obj), TYPE_PCI_PROXY_DEV)
+
+typedef struct PCIProxyDev {
+PCIDevice parent_dev;
+
+MPQemuLinkState *mpqemu_link;
+
+int socket;
+
+} PCIProxyDev;
+
+typedef struct PCIProxyDevClass {
+PCIDeviceClass parent_class;
+
+void (*realize)(PCIProxyDev *dev, Error **errp);
+
+char *command;
+} PCIProxyDevClass;
+
+#endif /* QEMU_PROXY_H */
diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
index d46cb81058..73cc59b874 100644
--- a/include/io/mpqemu-link.h
+++ b/include/io/mpqemu-link.h
@@ -14,6 +14,7 @@
 #include "qemu/osdep.h"
 #incl

[PATCH RESEND v6 12/36] multi-process: add functions to synchronize proxy and remote endpoints

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

In some cases, for example MMIO read, QEMU has to wait for the remote to
complete a command before proceeding. An eventfd based mechanism is
added to synchronize QEMU & remote process.

Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
Signed-off-by: Elena Ufimtseva 
---
 include/io/mpqemu-link.h |  7 +
 io/mpqemu-link.c | 61 
 2 files changed, 68 insertions(+)

diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
index af401e640c..ef95599bca 100644
--- a/include/io/mpqemu-link.h
+++ b/include/io/mpqemu-link.h
@@ -124,4 +124,11 @@ void mpqemu_link_set_callback(MPQemuLinkState *s,
 void mpqemu_start_coms(MPQemuLinkState *s, MPQemuChannel* chan);
 bool mpqemu_msg_valid(MPQemuMsg *msg);
 
+#define GET_REMOTE_WAIT eventfd(0, EFD_CLOEXEC)
+#define PUT_REMOTE_WAIT(wait) close(wait)
+#define PROXY_LINK_WAIT_DONE 1
+
+uint64_t wait_for_remote(int efd);
+void notify_proxy(int fd, uint64_t val);
+
 #endif
diff --git a/io/mpqemu-link.c b/io/mpqemu-link.c
index 48f53a8928..cc0a7aecd4 100644
--- a/io/mpqemu-link.c
+++ b/io/mpqemu-link.c
@@ -10,6 +10,7 @@
 
 #include "qemu/osdep.h"
 #include "qemu-common.h"
+#include 
 
 #include "qemu/module.h"
 #include "io/mpqemu-link.h"
@@ -204,6 +205,66 @@ int mpqemu_msg_recv(MPQemuMsg *msg, MPQemuChannel *chan)
 return rc;
 }
 
+/*
+ * wait_for_remote() Synchronizes QEMU and the remote process. The maximum
+ *   wait time is 1s, after which the wait times out.
+ *   The function alse returns a 64 bit return value after
+ *   the wait. The function uses eventfd() to do the wait
+ *   and pass the return values. eventfd() can't return a
+ *   value of '0'. Therefore, all return values are offset
+ *   by '1' at the sending end, and corrected at the
+ *   receiving end.
+ */
+
+uint64_t wait_for_remote(int efd)
+{
+struct pollfd pfd = { .fd = efd, .events = POLLIN };
+uint64_t val;
+int ret;
+
+ret = poll(&pfd, 1, 1000);
+
+switch (ret) {
+case 0:
+qemu_log_mask(LOG_REMOTE_DEBUG, "Error wait_for_remote: Timed out\n");
+/* TODO: Kick-off error recovery */
+return UINT64_MAX;
+case -1:
+qemu_log_mask(LOG_REMOTE_DEBUG, "Poll error wait_for_remote: %s\n",
+  strerror(errno));
+return UINT64_MAX;
+default:
+if (read(efd, &val, sizeof(val)) == -1) {
+qemu_log_mask(LOG_REMOTE_DEBUG, "Error wait_for_remote: %s\n",
+  strerror(errno));
+return UINT64_MAX;
+}
+}
+
+/*
+ * The remote process could write a non-zero value
+ * to the eventfd to wake QEMU up. However, the drawback of using eventfd
+ * for this purpose is that a return value of zero wouldn't wake QEMU up.
+ * Therefore, we offset the return value by one at the remote process and
+ * correct it in the QEMU end.
+ */
+val = (val == UINT64_MAX) ? val : (val - 1);
+
+return val;
+}
+
+void notify_proxy(int efd, uint64_t val)
+{
+val = (val == UINT64_MAX) ? val : (val + 1);
+ssize_t len = -1;
+
+len = write(efd, &val, sizeof(val));
+if (len == -1 || len != sizeof(val)) {
+qemu_log_mask(LOG_REMOTE_DEBUG, "Error notify_proxy: %s\n",
+  strerror(errno));
+}
+}
+
 static gboolean mpqemu_link_handler_prepare(GSource *gsrc, gint *timeout)
 {
 g_assert(timeout);
-- 
2.25.GIT




[PATCH RESEND v6 11/36] multi-process: define mpqemu-link object

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

Defines mpqemu-link object which forms the communication link between
QEMU & emulation program.
Adds functions to configure members of mpqemu-link object instance.
Adds functions to send and receive messages over the communication
channel.
Adds GMainLoop to handle events received on the communication channel.

Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
Signed-off-by: Elena Ufimtseva 
---
 MAINTAINERS  |   2 +
 include/io/mpqemu-link.h | 127 
 io/Makefile.objs |   2 +
 io/mpqemu-link.c | 312 +++
 4 files changed, 443 insertions(+)
 create mode 100644 include/io/mpqemu-link.h
 create mode 100644 io/mpqemu-link.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 965f34d4f9..93ad693da4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2857,6 +2857,8 @@ M: John G Johnson 
 S: Maintained
 F: remote/Makefile.objs
 F: remote/remote-main.c
+F: include/io/mpqemu-link.h
+F: io/mpqemu-link.c
 
 Build and test automation
 -
diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
new file mode 100644
index 00..af401e640c
--- /dev/null
+++ b/include/io/mpqemu-link.h
@@ -0,0 +1,127 @@
+/*
+ * Communication channel between QEMU and remote device process
+ *
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef MPQEMU_LINK_H
+#define MPQEMU_LINK_H
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "qom/object.h"
+#include "qemu/thread.h"
+
+#define TYPE_MPQEMU_LINK "mpqemu-link"
+#define MPQEMU_LINK(obj) \
+OBJECT_CHECK(MPQemuLinkState, (obj), TYPE_MPQEMU_LINK)
+
+#define REMOTE_MAX_FDS 8
+
+#define MPQEMU_MSG_HDR_SIZE offsetof(MPQemuMsg, data1.u64)
+
+/**
+ * mpqemu_cmd_t:
+ *
+ * proc_cmd_t enum type to specify the command to be executed on the remote
+ * device.
+ */
+typedef enum {
+INIT = 0,
+MAX,
+} mpqemu_cmd_t;
+
+/**
+ * MPQemuMsg:
+ * @cmd: The remote command
+ * @bytestream: Indicates if the data to be shared is structured (data1)
+ *  or unstructured (data2)
+ * @size: Size of the data to be shared
+ * @data1: Structured data
+ * @fds: File descriptors to be shared with remote device
+ * @data2: Unstructured data
+ *
+ * MPQemuMsg Format of the message sent to the remote device from QEMU.
+ *
+ */
+typedef struct {
+mpqemu_cmd_t cmd;
+int bytestream;
+size_t size;
+
+union {
+uint64_t u64;
+} data1;
+
+int fds[REMOTE_MAX_FDS];
+int num_fds;
+
+uint8_t *data2;
+} MPQemuMsg;
+
+/**
+ * MPQemuChannel:
+ * @gsrc: GSource object to be used by loop
+ * @gpfd: GPollFD object containing the socket & events to monitor
+ * @sock: Socket to send/receive communication, same as the one in gpfd
+ * @send_lock: Mutex to synchronize access to the send stream
+ * @recv_lock: Mutex to synchronize access to the recv stream
+ *
+ * Defines the channel that make up the communication link
+ * between QEMU and remote process
+ */
+
+typedef struct MPQemuChannel {
+GSource gsrc;
+GPollFD gpfd;
+int sock;
+QemuMutex send_lock;
+QemuMutex recv_lock;
+} MPQemuChannel;
+
+typedef struct MPQemuLinkState MPQemuLinkState;
+
+typedef void (*mpqemu_link_callback)(GIOCondition cond, MPQemuLinkState *link,
+ MPQemuChannel *chan);
+
+/*
+ * MPQemuLinkState Instance info. of the communication
+ * link between QEMU and remote process. The Link could
+ * be made up of multiple channels.
+ *
+ * ctxGMainContext to be used for communication
+ * loop   Main loop that would be used to poll for incoming data
+ * comCommunication channel to transport control messages
+ *
+ */
+
+struct MPQemuLinkState {
+Object obj;
+
+GMainContext *ctx;
+GMainLoop *loop;
+
+MPQemuChannel *com;
+
+mpqemu_link_callback callback;
+};
+
+MPQemuLinkState *mpqemu_link_create(void);
+void mpqemu_link_finalize(MPQemuLinkState *s);
+
+void mpqemu_msg_send(MPQemuMsg *msg, MPQemuChannel *chan);
+int mpqemu_msg_recv(MPQemuMsg *msg, MPQemuChannel *chan);
+
+void mpqemu_init_channel(MPQemuLinkState *s, MPQemuChannel **chan, int fd);
+void mpqemu_destroy_channel(MPQemuChannel *chan);
+void mpqemu_link_set_callback(MPQemuLinkState *s,
+  mpqemu_link_callback callback);
+void mpqemu_start_coms(MPQemuLinkState *s, MPQemuChannel* chan);
+bool mpqemu_msg_valid(MPQemuMsg *msg);
+
+#endif
diff --git a/io/Makefile.objs b/io/Makefile.objs
index 9a20fce4ed..5875ab0697 100644
--- a/io/Makefile.objs
+++ b/io/Makefile.objs
@@ -10,3 +10,5 @@ io-obj-y += channel-util.o
 io-obj-y += dns-resolver.o
 io-obj-y += net-listener.o
 io-obj-y += task.o
+
+io-obj-$(CONFIG_MPQEMU) += mpqemu-link.o
diff --git a/io/mpqemu-link.c b/io/mpqem

[PATCH RESEND v6 24/36] multi-process: Retrieve PCI info from remote process

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

Retrieve PCI configuration info about the remote device and
configure the Proxy PCI object based on the returned information

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
---
 hw/proxy/qemu-proxy.c| 84 
 include/io/mpqemu-link.h | 10 +
 remote/remote-main.c | 21 ++
 3 files changed, 115 insertions(+)

diff --git a/hw/proxy/qemu-proxy.c b/hw/proxy/qemu-proxy.c
index a78694736b..730e28483e 100644
--- a/hw/proxy/qemu-proxy.c
+++ b/hw/proxy/qemu-proxy.c
@@ -19,6 +19,8 @@
 #include "sysemu/kvm.h"
 #include "util/event_notifier-posix.c"
 
+static void probe_pci_info(PCIDevice *dev);
+
 static int config_op_send(PCIProxyDev *dev, uint32_t addr, uint32_t *val, int 
l,
   unsigned int op)
 {
@@ -182,8 +184,12 @@ static void pci_proxy_dev_realize(PCIDevice *device, Error 
**errp)
 {
 PCIProxyDev *dev = PCI_PROXY_DEV(device);
 PCIProxyDevClass *k = PCI_PROXY_DEV_GET_CLASS(dev);
+uint8_t *pci_conf = device->config;
 Error *local_err = NULL;
 
+pci_conf[PCI_LATENCY_TIMER] = 0xff;
+pci_conf[PCI_INTERRUPT_PIN] = 0x01;
+
 if (k->realize) {
 k->realize(dev, &local_err);
 if (local_err) {
@@ -196,6 +202,8 @@ static void pci_proxy_dev_realize(PCIDevice *device, Error 
**errp)
 configure_memory_sync(dev->sync, dev->mpqemu_link);
 
 setup_irqfd(dev);
+
+probe_pci_info(PCI_DEVICE(dev));
 }
 
 static void pci_proxy_dev_class_init(ObjectClass *klass, void *data)
@@ -291,3 +299,79 @@ const MemoryRegionOps proxy_default_ops = {
 .max_access_size = 1,
 },
 };
+
+static void probe_pci_info(PCIDevice *dev)
+{
+PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
+DeviceClass *dc = DEVICE_CLASS(pc);
+PCIProxyDev *pdev = PCI_PROXY_DEV(dev);
+MPQemuLinkState *mpqemu_link = pdev->mpqemu_link;
+MPQemuMsg msg, ret;
+uint32_t orig_val, new_val, class;
+uint8_t type;
+int i, size;
+char *name;
+
+memset(&msg, 0, sizeof(MPQemuMsg));
+msg.bytestream = 0;
+msg.size = 0;
+msg.cmd = GET_PCI_INFO;
+mpqemu_msg_send(&msg, mpqemu_link->dev);
+
+mpqemu_msg_recv(&ret, mpqemu_link->dev);
+
+pc->vendor_id = ret.data1.ret_pci_info.vendor_id;
+pc->device_id = ret.data1.ret_pci_info.device_id;
+pc->class_id = ret.data1.ret_pci_info.class_id;
+pc->subsystem_id = ret.data1.ret_pci_info.subsystem_id;
+
+config_op_send(pdev, 11, &class, 1, PCI_CONFIG_READ);
+switch (class) {
+case PCI_BASE_CLASS_BRIDGE:
+set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
+break;
+case PCI_BASE_CLASS_STORAGE:
+set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+break;
+case PCI_BASE_CLASS_NETWORK:
+set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
+break;
+case PCI_BASE_CLASS_INPUT:
+set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
+break;
+case PCI_BASE_CLASS_DISPLAY:
+set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories);
+break;
+case PCI_BASE_CLASS_PROCESSOR:
+set_bit(DEVICE_CATEGORY_CPU, dc->categories);
+break;
+default:
+set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+break;
+}
+
+for (i = 0; i < 6; i++) {
+config_op_send(pdev, 0x10 + (4 * i), &orig_val, 4, PCI_CONFIG_READ);
+new_val = 0x;
+config_op_send(pdev, 0x10 + (4 * i), &new_val, 4, PCI_CONFIG_WRITE);
+config_op_send(pdev, 0x10 + (4 * i), &new_val, 4, PCI_CONFIG_READ);
+size = (~(new_val & 0xFFF0)) + 1;
+config_op_send(pdev, 0x10 + (4 * i), &orig_val, 4, PCI_CONFIG_WRITE);
+type = (new_val & 0x1) ?
+   PCI_BASE_ADDRESS_SPACE_IO : PCI_BASE_ADDRESS_SPACE_MEMORY;
+
+if (size) {
+pdev->region[i].dev = pdev;
+pdev->region[i].present = true;
+if (type == PCI_BASE_ADDRESS_SPACE_MEMORY) {
+pdev->region[i].memory = true;
+}
+name = g_strdup_printf("bar-region-%d", i);
+memory_region_init_io(&pdev->region[i].mr, OBJECT(pdev),
+  &proxy_default_ops, &pdev->region[i],
+  name, size);
+pci_register_bar(dev, i, type, &pdev->region[i].mr);
+g_free(name);
+}
+}
+}
diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
index 14e4be2bd0..102c736705 100644
--- a/include/io/mpqemu-link.h
+++ b/include/io/mpqemu-link.h
@@ -48,6 +48,8 @@ typedef enum {
 BAR_WRITE,
 BAR_READ,
 SET_IRQFD,
+GET_PCI_INFO,
+RET_PCI_INFO,
 MAX,
 } mpqemu_cmd_t;
 
@@ -70,6 +72,13 @@ typedef struct {
 bool memory;
 } bar_access_msg_t;
 
+type

[PATCH RESEND v6 26/36] multi-process: add parse_cmdline in remote process

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
---
 MAINTAINERS  |  2 ++
 remote/Makefile.objs |  1 +
 remote/remote-main.c | 21 -
 remote/remote-opts.c | 75 
 remote/remote-opts.h | 15 +
 5 files changed, 113 insertions(+), 1 deletion(-)
 create mode 100644 remote/remote-opts.c
 create mode 100644 remote/remote-opts.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 198c9f69bc..6a4b5e16be 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2873,6 +2873,8 @@ F: include/hw/proxy/memory-sync.h
 F: hw/proxy/memory-sync.c
 F: include/remote/iohub.h
 F: remote/iohub.c
+F: remote/remote-opts.h
+F: remote/remote-opts.c
 
 Build and test automation
 -
diff --git a/remote/Makefile.objs b/remote/Makefile.objs
index ed3e5bd8c4..74c3396786 100644
--- a/remote/Makefile.objs
+++ b/remote/Makefile.objs
@@ -1,4 +1,5 @@
 remote-pci-obj-$(CONFIG_MPQEMU) += remote-main.o
+remote-pci-obj-$(CONFIG_MPQEMU) += remote-opts.o
 remote-pci-obj-$(CONFIG_MPQEMU) += pcihost.o
 remote-pci-obj-$(CONFIG_MPQEMU) += machine.o
 remote-pci-obj-$(CONFIG_MPQEMU) += ../util/machine-notify.o
diff --git a/remote/remote-main.c b/remote/remote-main.c
index f5a479e9b2..b37802151a 100644
--- a/remote/remote-main.c
+++ b/remote/remote-main.c
@@ -24,6 +24,7 @@
 #include "io/mpqemu-link.h"
 #include "qapi/error.h"
 #include "qemu/main-loop.h"
+#include "qemu/cutils.h"
 #include "sysemu/cpus.h"
 #include "qemu-common.h"
 #include "hw/pci/pci.h"
@@ -37,6 +38,7 @@
 #include "exec/memattrs.h"
 #include "exec/address-spaces.h"
 #include "remote/iohub.h"
+#include "remote-opts.h"
 
 static void process_msg(GIOCondition cond, MPQemuLinkState *link,
 MPQemuChannel *chan);
@@ -289,6 +291,7 @@ finalize_loop:
 int main(int argc, char *argv[])
 {
 Error *err = NULL;
+int fd = -1;
 
 module_call_init(MODULE_INIT_QOM);
 
@@ -307,6 +310,13 @@ int main(int argc, char *argv[])
 
 current_machine = MACHINE(REMOTE_MACHINE(object_new(TYPE_REMOTE_MACHINE)));
 
+qemu_add_opts(&qemu_device_opts);
+qemu_add_opts(&qemu_drive_opts);
+qemu_add_drive_opts(&qemu_legacy_drive_opts);
+qemu_add_drive_opts(&qemu_common_drive_opts);
+qemu_add_drive_opts(&qemu_drive_opts);
+qemu_add_drive_opts(&bdrv_runtime_opts);
+
 mpqemu_link = mpqemu_link_create();
 if (!mpqemu_link) {
 printf("Could not create MPQemu link pid %d, exec_name %s",
@@ -314,7 +324,16 @@ int main(int argc, char *argv[])
 return -1;
 }
 
-mpqemu_init_channel(mpqemu_link, &mpqemu_link->com, STDIN_FILENO);
+fd = qemu_parse_fd(argv[1]);
+if (fd == -1) {
+printf("Failed to parse fd for remote process pid %d, exec_name %s\n",
+   getpid(), __progname);
+return -EINVAL;
+}
+
+parse_cmdline(argc - 2, argv + 2, NULL);
+
+mpqemu_init_channel(mpqemu_link, &mpqemu_link->com, fd);
 
 mpqemu_link_set_callback(mpqemu_link, process_msg);
 
diff --git a/remote/remote-opts.c b/remote/remote-opts.c
new file mode 100644
index 00..cb7837bf13
--- /dev/null
+++ b/remote/remote-opts.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/boards.h"
+#include "sysemu/blockdev.h"
+#include "qapi/error.h"
+#include "qemu-options.h"
+#include "qemu-parse.h"
+#include "remote-opts.h"
+
+/*
+ * In remote process, we parse only subset of options. The code
+ * taken from vl.c to re-use in remote command line parser.
+ */
+void parse_cmdline(int argc, char **argv, char **envp)
+{
+int optind;
+const char *optarg;
+MachineClass *mc;
+
+/* from vl.c */
+optind = 0;
+
+/* second pass of option parsing */
+
+for (;;) {
+if (optind >= argc) {
+break;
+}
+if (argv[optind][0] != '-') {
+loc_set_cmdline(argv, optind, 1);
+drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
+} else {
+const QEMUOption *popt;
+
+popt = lookup_opt(argc, argv, &optarg, &optind);
+#ifndef REMOTE_PROCESS
+if (!(popt->arch_mask & arch_type)) {
+error_report("Option not supported for this target,"
+ " %x arch_mask, %x arch_type",
+ popt->arch_mask, arch_type);
+exit(1);
+}
+#endif
+

[PATCH RESEND v6 14/36] multi-process: setup a machine object for remote device process

2020-04-22 Thread elena . ufimtseva
From: Jagannathan Raman 

remote-machine object sets up various subsystems of the remote device
process. Instantiate PCI host bridge object and initialize RAM, IO &
PCI memory regions.

Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
Signed-off-by: Elena Ufimtseva 
---
 MAINTAINERS   |  2 +
 Makefile.objs |  1 +
 exec.c|  3 +-
 include/exec/address-spaces.h |  2 +
 include/remote/machine.h  | 30 +
 remote/Makefile.objs  |  2 +
 remote/machine.c  | 84 +++
 remote/remote-main.c  |  7 +++
 util/Makefile.objs|  2 +
 9 files changed, 131 insertions(+), 2 deletions(-)
 create mode 100644 include/remote/machine.h
 create mode 100644 remote/machine.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 0cda5ee06a..09764e461c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2861,6 +2861,8 @@ F: include/io/mpqemu-link.h
 F: io/mpqemu-link.c
 F: include/remote/pcihost.h
 F: remote/pcihost.c
+F: include/remote/machine.h
+F: remote/machine.c
 
 Build and test automation
 -
diff --git a/Makefile.objs b/Makefile.objs
index f6654633b4..ff3f06b146 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -31,6 +31,7 @@ remote-pci-obj-$(CONFIG_MPQEMU) += block/
 remote-pci-obj-$(CONFIG_MPQEMU) += migration/
 remote-pci-obj-$(CONFIG_MPQEMU) += remote/
 remote-pci-obj-$(CONFIG_MPQEMU) += accel/
+remote-pci-obj-$(CONFIG_MPQEMU) += util/
 
 remote-pci-obj-$(CONFIG_MPQEMU) += cpus-common.o
 remote-pci-obj-$(CONFIG_MPQEMU) += dma-helpers.o
diff --git a/exec.c b/exec.c
index d0ac9545f4..5b1e414099 100644
--- a/exec.c
+++ b/exec.c
@@ -161,7 +161,6 @@ typedef struct subpage_t {
 #define PHYS_SECTION_UNASSIGNED 0
 
 static void io_mem_init(void);
-static void memory_map_init(void);
 static void tcg_log_global_after_sync(MemoryListener *listener);
 static void tcg_commit(MemoryListener *listener);
 
@@ -2963,7 +2962,7 @@ static void tcg_commit(MemoryListener *listener)
 tlb_flush(cpuas->cpu);
 }
 
-static void memory_map_init(void)
+void memory_map_init(void)
 {
 system_memory = g_malloc(sizeof(*system_memory));
 
diff --git a/include/exec/address-spaces.h b/include/exec/address-spaces.h
index db8bfa9a92..56a877b7ba 100644
--- a/include/exec/address-spaces.h
+++ b/include/exec/address-spaces.h
@@ -33,6 +33,8 @@ MemoryRegion *get_system_memory(void);
  */
 MemoryRegion *get_system_io(void);
 
+void memory_map_init(void);
+
 extern AddressSpace address_space_memory;
 extern AddressSpace address_space_io;
 
diff --git a/include/remote/machine.h b/include/remote/machine.h
new file mode 100644
index 00..7e9bdbe28e
--- /dev/null
+++ b/include/remote/machine.h
@@ -0,0 +1,30 @@
+/*
+ * Remote machine configuration
+ *
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef REMOTE_MACHINE_H
+#define REMOTE_MACHINE_H
+
+#include "qemu/osdep.h"
+#include "qom/object.h"
+#include "hw/boards.h"
+#include "remote/pcihost.h"
+#include "qemu/notify.h"
+
+typedef struct RemMachineState {
+MachineState parent_obj;
+
+RemPCIHost *host;
+} RemMachineState;
+
+#define TYPE_REMOTE_MACHINE "remote-machine"
+#define REMOTE_MACHINE(obj) \
+OBJECT_CHECK(RemMachineState, (obj), TYPE_REMOTE_MACHINE)
+
+#endif
diff --git a/remote/Makefile.objs b/remote/Makefile.objs
index 2757f5a265..55f405d048 100644
--- a/remote/Makefile.objs
+++ b/remote/Makefile.objs
@@ -1,2 +1,4 @@
 remote-pci-obj-$(CONFIG_MPQEMU) += remote-main.o
 remote-pci-obj-$(CONFIG_MPQEMU) += pcihost.o
+remote-pci-obj-$(CONFIG_MPQEMU) += machine.o
+remote-pci-obj-$(CONFIG_MPQEMU) += ../util/machine-notify.o
diff --git a/remote/machine.c b/remote/machine.c
new file mode 100644
index 00..97e4f194ea
--- /dev/null
+++ b/remote/machine.c
@@ -0,0 +1,84 @@
+/*
+ * Machine for remote device
+ *
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include 
+#include 
+
+#include "qemu/osdep.h"
+#include "remote/pcihost.h"
+#include "remote/machine.h"
+#include "exec/address-spaces.h"
+#include "exec/memory.h"
+#include "exec/ioport.h"
+#include "qemu/thread.h"
+#include "qom/object.h"
+#include "qemu/module.h"
+#include "qapi/error.h"
+#include "qemu/main-loop.h"
+#include "qemu-common.h"
+#include "sysemu/sysemu.h"
+#include "qemu/notify.h"
+
+static void remote_machine_init(Object *obj)
+{
+RemMachineState *s = REMOTE_MACHINE(obj);
+RemPCIHost *rem_host;
+MemoryRegion *system_memo

[PATCH RESEND v6 28/36] multi-process: send heartbeat messages to remote

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

In order to detect remote processes which are hung, the
proxy periodically sends heartbeat messages to confirm if
the remote process is alive

Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
Signed-off-by: Elena Ufimtseva 
---
 hw/proxy/qemu-proxy.c | 86 +++
 include/hw/proxy/qemu-proxy.h |  3 ++
 include/io/mpqemu-link.h  |  1 +
 io/mpqemu-link.c  |  5 ++
 4 files changed, 95 insertions(+)

diff --git a/hw/proxy/qemu-proxy.c b/hw/proxy/qemu-proxy.c
index 730e28483e..162014353f 100644
--- a/hw/proxy/qemu-proxy.c
+++ b/hw/proxy/qemu-proxy.c
@@ -21,6 +21,78 @@
 
 static void probe_pci_info(PCIDevice *dev);
 
+static void childsig_handler(int sig, siginfo_t *siginfo, void *ctx)
+{
+/* TODO: Add proper handler. */
+printf("Child (pid %d) is dead? Signal is %d, Exit code is %d.\n",
+   siginfo->si_pid, siginfo->si_signo, siginfo->si_code);
+}
+
+static void hb_msg(PCIProxyDev *dev)
+{
+DeviceState *ds = DEVICE(dev);
+MPQemuMsg msg = { 0 };
+uint64_t ret;
+
+if (event_notifier_get_fd(&dev->en_ping) == -1) {
+return;
+}
+
+memset(&msg, 0, sizeof(MPQemuMsg));
+
+msg.num_fds = 1;
+msg.cmd = PROXY_PING;
+msg.bytestream = 0;
+msg.size = 0;
+msg.fds[0] = event_notifier_get_fd(&dev->en_ping);
+
+mpqemu_msg_send(&msg, dev->mpqemu_link->com);
+
+ret = wait_for_remote(msg.fds[0]);
+
+if (ret) {
+printf("Lost contact with remote device %s\n", ds->id);
+/* TODO: Initiate error recovery */
+}
+}
+
+#define NOP_INTERVAL 1000
+
+static void remote_ping(void *opaque)
+{
+PCIProxyDev *dev = opaque;
+
+hb_msg(dev);
+
+timer_mod(dev->hb_timer,
+  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + NOP_INTERVAL);
+}
+
+static void start_hb_timer(PCIProxyDev *dev)
+{
+dev->hb_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
+ remote_ping,
+ dev);
+
+timer_mod(dev->hb_timer,
+  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + NOP_INTERVAL);
+}
+
+static void stop_hb_timer(PCIProxyDev *dev)
+{
+timer_del(dev->hb_timer);
+timer_free(dev->hb_timer);
+}
+
+static void set_sigchld_handler(void)
+{
+struct sigaction sa_sigterm;
+memset(&sa_sigterm, 0, sizeof(sa_sigterm));
+sa_sigterm.sa_sigaction = childsig_handler;
+sa_sigterm.sa_flags = SA_SIGINFO | SA_NOCLDWAIT | SA_NOCLDSTOP;
+sigaction(SIGCHLD, &sa_sigterm, NULL);
+}
+
 static int config_op_send(PCIProxyDev *dev, uint32_t addr, uint32_t *val, int 
l,
   unsigned int op)
 {
@@ -204,6 +276,19 @@ static void pci_proxy_dev_realize(PCIDevice *device, Error 
**errp)
 setup_irqfd(dev);
 
 probe_pci_info(PCI_DEVICE(dev));
+
+set_sigchld_handler();
+
+event_notifier_init(&dev->en_ping, 0);
+
+start_hb_timer(dev);
+}
+
+static void pci_proxy_dev_exit(PCIDevice *pdev)
+{
+PCIProxyDev *dev = PCI_PROXY_DEV(pdev);
+
+stop_hb_timer(dev);
 }
 
 static void pci_proxy_dev_class_init(ObjectClass *klass, void *data)
@@ -211,6 +296,7 @@ static void pci_proxy_dev_class_init(ObjectClass *klass, 
void *data)
 PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
 
 k->realize = pci_proxy_dev_realize;
+k->exit = pci_proxy_dev_exit;
 k->config_read = pci_proxy_read_config;
 k->config_write = pci_proxy_write_config;
 }
diff --git a/include/hw/proxy/qemu-proxy.h b/include/hw/proxy/qemu-proxy.h
index 0d8ec6d686..26f0a41110 100644
--- a/include/hw/proxy/qemu-proxy.h
+++ b/include/hw/proxy/qemu-proxy.h
@@ -55,6 +55,9 @@ struct PCIProxyDev {
 EventNotifier intr;
 EventNotifier resample;
 
+EventNotifier en_ping;
+QEMUTimer *hb_timer;
+
 int socket;
 
 ProxyMemoryRegion region[PCI_NUM_REGIONS];
diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
index 102c736705..45ea1fcafa 100644
--- a/include/io/mpqemu-link.h
+++ b/include/io/mpqemu-link.h
@@ -50,6 +50,7 @@ typedef enum {
 SET_IRQFD,
 GET_PCI_INFO,
 RET_PCI_INFO,
+PROXY_PING,
 MAX,
 } mpqemu_cmd_t;
 
diff --git a/io/mpqemu-link.c b/io/mpqemu-link.c
index ea519a980e..91a3395566 100644
--- a/io/mpqemu-link.c
+++ b/io/mpqemu-link.c
@@ -394,6 +394,11 @@ bool mpqemu_msg_valid(MPQemuMsg *msg)
 return false;
 }
 break;
+case PROXY_PING:
+if (msg->size != 0) {
+return false;
+}
+break;
 default:
 break;
 }
-- 
2.25.GIT




[PATCH RESEND v6 36/36] multi-process: add configure and usage information

2020-04-22 Thread elena . ufimtseva
From: Elena Ufimtseva 

Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
---
 MAINTAINERS  |  2 +
 docs/multi-process.rst   | 85 +
 scripts/mpqemu-launcher-perf-mode.py | 92 
 scripts/mpqemu-launcher.py   | 53 
 4 files changed, 232 insertions(+)
 create mode 100644 docs/multi-process.rst
 create mode 100755 scripts/mpqemu-launcher-perf-mode.py
 create mode 100755 scripts/mpqemu-launcher.py

diff --git a/MAINTAINERS b/MAINTAINERS
index ed48615e15..8ff3bfae6a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2880,6 +2880,8 @@ F: remote/iohub.c
 F: remote/remote-opts.h
 F: remote/remote-opts.c
 F: docs/devel/multi-process.rst
+F: scripts/mpqemu-launcher.py
+F: scripts/mpqemu-launcher-perf-mode.py
 
 Build and test automation
 -
diff --git a/docs/multi-process.rst b/docs/multi-process.rst
new file mode 100644
index 00..8387d6c691
--- /dev/null
+++ b/docs/multi-process.rst
@@ -0,0 +1,85 @@
+Multi-process QEMU
+==
+
+This document describes how to configure and use multi-process qemu.
+For the design document refer to docs/devel/qemu-multiprocess.
+
+1) Configuration
+
+
+To enable support for multi-process add --enable-mpqemu
+to the list of options for the "configure" script.
+
+
+2) Usage
+
+
+Multi-process QEMU requires an orchestrator to launch. Please refer to a
+light-weight python based orchestrator for mpqemu in
+scripts/mpqemu-launcher.py to lauch QEMU in multi-process mode.
+
+scripts/mpqemu-launcher-perf-mode.py launches in "perf" mode. In this mode,
+the same QEMU process connects to multiple remote devices, each emulated in
+a separate process.
+
+As of now, we only support the emulation of lsi53c895a in a separate process.
+
+Following is a description of command-line used to launch mpqemu.
+
+* Orchestrator:
+
+  - The Orchestrator creates a unix socketpair
+
+  - It launches the remote process and passes one of the
+sockets to it via command-line.
+
+  - It then launches QEMU and specifies the other socket as an option
+to the Proxy device object
+
+* Remote Process:
+
+  - The first command-line option in the remote process is one of the
+sockets created by the Orchestrator
+
+  - The remaining options are no different from how one launches QEMU with
+devices. The only other requirement is each PCI device must have a
+unique ID specified to it. This is needed to pair remote device with the
+Proxy object.
+
+  - Example command-line for the remote process is as follows:
+
+  /usr/bin/qemu-scsu-dev 4   \
+  -device lsi53c895a,id=lsi0 \
+  -drive id=drive_image2,file=/build/ol7-nvme-test-1.qcow2   \
+  -device scsi-hd,id=drive2,drive=drive_image2,bus=lsi0.0,scsi-id=0
+
+* QEMU:
+
+  - Since parts of the RAM are shared between QEMU & remote process, a
+memory-backend-memfd is required to facilitate this, as follows:
+
+-object memory-backend-memfd,id=mem,size=2G
+
+  - A "pci-proxy-dev" device is created for each of the PCI devices emulated
+in the remote process. A "socket" sub-option specifies the other end of
+unix channel created by orchestrator. The "id" sub-option must be specified
+and should be the same as the "id" specified for the remote PCI device
+
+  - Example commandline for QEMU is as follows:
+
+  -device pci-proxy-dev,id=lsi0,socket=3
+
+* Monitor / QMP:
+
+  - The remote process supports QEMU monitor. It could be specified using the
+"-monitor" or "-qmp" command-line options
+
+  - As an example, one could connect to the monitor by adding the following
+to the command-line of the remote process
+
+  -monitor unix:/home/qmp-sock,server,nowait
+
+  - The user could connect to the monitor using the qmp script or using
+"socat" as outlined below:
+
+  socat /home/qmp-sock stdio
diff --git a/scripts/mpqemu-launcher-perf-mode.py 
b/scripts/mpqemu-launcher-perf-mode.py
new file mode 100755
index 00..2733424c76
--- /dev/null
+++ b/scripts/mpqemu-launcher-perf-mode.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+
+import socket
+import os
+import subprocess
+import time
+
+PROC_QEMU='/usr/bin/qemu-system-x86_64'
+
+PROC_REMOTE='/usr/bin/qemu-scsi-dev'
+
+proxy_1, remote_1 = socket.socketpair(socket.AF_UNIX, socket.SOCK_STREAM)
+proxy_2, remote_2 = socket.socketpair(socket.AF_UNIX, socket.SOCK_STREAM)
+proxy_3, remote_3 = socket.socketpair(socket.AF_UNIX, socket.SOCK_STREAM)
+
+remote_cmd_1 = [ PROC_REMOTE,  
\
+ str(remote_1.fileno()),   
\
+

[PATCH v7 10/21] multi-process: setup memory manager for remote device

2020-06-27 Thread elena . ufimtseva
From: Jagannathan Raman 

SyncSysMemMsg message format is defined. It is used to send
file descriptors of the RAM regions to remote device.
RAM on the remote device is configured with a set of file descriptors.
Old RAM regions are deleted and new regions, each with an fd, is
added to the RAM.

Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
Signed-off-by: Elena Ufimtseva 
---
 MAINTAINERS |  2 ++
 hw/i386/Makefile.objs   |  1 +
 hw/i386/remote-memory.c | 58 +
 include/hw/i386/remote-memory.h | 20 
 include/io/mpqemu-link.h| 13 
 io/mpqemu-link.c| 13 
 6 files changed, 107 insertions(+)
 create mode 100644 hw/i386/remote-memory.c
 create mode 100644 include/hw/i386/remote-memory.h

diff --git a/MAINTAINERS b/MAINTAINERS
index e204b3e0c6..017c96eace 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2950,6 +2950,8 @@ F: include/hw/i386/remote.h
 F: io/mpqemu-link.c
 F: include/io/mpqemu-link.h
 F: hw/i386/remote-msg.c
+F: include/hw/i386/remote-memory.h
+F: hw/i386/remote-memory.c
 
 Build and test automation
 -
diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs
index 83969585c1..d85537713e 100644
--- a/hw/i386/Makefile.objs
+++ b/hw/i386/Makefile.objs
@@ -16,6 +16,7 @@ obj-$(CONFIG_VMMOUSE) += vmmouse.o
 obj-$(CONFIG_PC) += port92.o
 obj-$(CONFIG_MPQEMU) += remote.o
 obj-$(CONFIG_MPQEMU) += remote-msg.o
+obj-$(CONFIG_MPQEMU) += remote-memory.o
 
 obj-y += kvmvapic.o
 obj-$(CONFIG_ACPI) += acpi-common.o
diff --git a/hw/i386/remote-memory.c b/hw/i386/remote-memory.c
new file mode 100644
index 00..4fad426bbb
--- /dev/null
+++ b/hw/i386/remote-memory.c
@@ -0,0 +1,58 @@
+/*
+ * Memory manager for remote device
+ *
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/i386/remote-memory.h"
+#include "exec/address-spaces.h"
+#include "exec/ram_addr.h"
+#include "qapi/error.h"
+
+void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp)
+{
+SyncSysmemMsg *sysmem_info = &msg->data1.sync_sysmem;
+MemoryRegion *sysmem, *subregion, *next;
+static unsigned int suffix;
+Error *local_err = NULL;
+char *name;
+int region;
+
+sysmem = get_system_memory();
+
+memory_region_transaction_begin();
+
+QTAILQ_FOREACH_SAFE(subregion, &sysmem->subregions, subregions_link, next) 
{
+if (subregion->ram) {
+memory_region_del_subregion(sysmem, subregion);
+qemu_ram_free(subregion->ram_block);
+}
+}
+
+for (region = 0; region < msg->num_fds; region++) {
+subregion = g_new(MemoryRegion, 1);
+name = g_strdup_printf("remote-mem-%u", suffix++);
+memory_region_init_ram_from_fd(subregion, OBJECT(sysmem),
+   name, sysmem_info->sizes[region],
+   RAM_SHARED, msg->fds[region],
+   sysmem_info->offsets[region],
+   &local_err);
+g_free(name);
+if (local_err) {
+error_propagate(errp, local_err);
+break;
+}
+
+memory_region_add_subregion(sysmem, sysmem_info->gpas[region],
+subregion);
+}
+
+memory_region_transaction_commit();
+}
diff --git a/include/hw/i386/remote-memory.h b/include/hw/i386/remote-memory.h
new file mode 100644
index 00..e2e479bb6f
--- /dev/null
+++ b/include/hw/i386/remote-memory.h
@@ -0,0 +1,20 @@
+/*
+ * Memory manager for remote device
+ *
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef REMOTE_MEMORY_H
+#define REMOTE_MEMORY_H
+
+#include "qemu/osdep.h"
+#include "exec/hwaddr.h"
+#include "io/mpqemu-link.h"
+
+void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp);
+
+#endif
diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
index 52aa89656c..c6d2b6bf8b 100644
--- a/include/io/mpqemu-link.h
+++ b/include/io/mpqemu-link.h
@@ -18,6 +18,8 @@
 #include "qemu/thread.h"
 #include "io/channel.h"
 #include "io/channel-socket.h"
+#include "exec/cpu-common.h"
+#include "exec/hwaddr.h"
 
 #define REMOTE_MAX_FDS 8
 
@@ -28,13 +30,22 @@
  *
  * MPQemuCmd enum type to specify the command to be executed on the remote
  * device.
+ *
+ * SYNC_SYSMEM  Shares QEMU's RAM with remote device's RAM
  */
 typedef enum {
   

[PATCH v7 01/21] memory: alloc RAM from file at offset

2020-06-27 Thread elena . ufimtseva
From: Jagannathan Raman 

Allow RAM MemoryRegion to be created from an offset in a file, instead
of allocating at offset of 0 by default. This is needed to synchronize
RAM between QEMU & remote process.

Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
Signed-off-by: Elena Ufimtseva 
---
 backends/hostmem-memfd.c  |  2 +-
 exec.c| 11 +++
 hw/misc/ivshmem.c |  3 ++-
 include/exec/memory.h |  2 ++
 include/exec/ram_addr.h   |  2 +-
 include/qemu/mmap-alloc.h |  3 ++-
 memory.c  |  3 ++-
 util/mmap-alloc.c |  7 ---
 util/oslib-posix.c|  2 +-
 9 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
index 1b5e4bfe0d..a862d010ab 100644
--- a/backends/hostmem-memfd.c
+++ b/backends/hostmem-memfd.c
@@ -56,7 +56,7 @@ memfd_backend_memory_alloc(HostMemoryBackend *backend, Error 
**errp)
 name = host_memory_backend_get_name(backend);
 memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend),
name, backend->size,
-   backend->share, fd, errp);
+   backend->share, fd, 0, errp);
 g_free(name);
 }
 
diff --git a/exec.c b/exec.c
index 21926dc9c7..afc42722b6 100644
--- a/exec.c
+++ b/exec.c
@@ -1854,6 +1854,7 @@ static void *file_ram_alloc(RAMBlock *block,
 ram_addr_t memory,
 int fd,
 bool truncate,
+off_t offset,
 Error **errp)
 {
 void *area;
@@ -1904,7 +1905,8 @@ static void *file_ram_alloc(RAMBlock *block,
 }
 
 area = qemu_ram_mmap(fd, memory, block->mr->align,
- block->flags & RAM_SHARED, block->flags & RAM_PMEM);
+ block->flags & RAM_SHARED, block->flags & RAM_PMEM,
+ offset);
 if (area == MAP_FAILED) {
 error_setg_errno(errp, errno,
  "unable to map backing store for guest RAM");
@@ -2336,7 +2338,7 @@ static void ram_block_add(RAMBlock *new_block, Error 
**errp, bool shared)
 #ifdef CONFIG_POSIX
 RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
  uint32_t ram_flags, int fd,
- Error **errp)
+ off_t offset, Error **errp)
 {
 RAMBlock *new_block;
 Error *local_err = NULL;
@@ -2389,7 +2391,8 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, 
MemoryRegion *mr,
 new_block->used_length = size;
 new_block->max_length = size;
 new_block->flags = ram_flags;
-new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
+new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset,
+ errp);
 if (!new_block->host) {
 g_free(new_block);
 return NULL;
@@ -2419,7 +2422,7 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, 
MemoryRegion *mr,
 return NULL;
 }
 
-block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, errp);
+block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, 0, errp);
 if (!block) {
 if (created) {
 unlink(mem_path);
diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c
index a8dc9b377d..b3cffbd1e0 100644
--- a/hw/misc/ivshmem.c
+++ b/hw/misc/ivshmem.c
@@ -492,7 +492,8 @@ static void process_msg_shmem(IVShmemState *s, int fd, 
Error **errp)
 
 /* mmap the region and map into the BAR2 */
 memory_region_init_ram_from_fd(&s->server_bar2, OBJECT(s),
-   "ivshmem.bar2", size, true, fd, &local_err);
+   "ivshmem.bar2", size, true, fd, 0,
+   &local_err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 7207025bd4..a8fc1c36db 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -902,6 +902,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
  * @size: size of the region.
  * @share: %true if memory must be mmaped with the MAP_SHARED flag
  * @fd: the fd to mmap.
+ * @offset: offset within the file referenced by fd
  * @errp: pointer to Error*, to store an error if it happens.
  *
  * Note that this function does not do anything to cause the data in the
@@ -913,6 +914,7 @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
 uint64_t size,
 bool share,
 int fd,
+ram_addr_t offset,
 Error **errp);
 #endif
 
diff --git 

[PATCH v7 07/21] multi-process: add co-routines to communicate with remote

2020-06-27 Thread elena . ufimtseva
From: Elena Ufimtseva 

process to avoid blocking the main loop during the message exchanges.
To be used by proxy device.

Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
---
 include/io/mpqemu-link.h | 16 +
 io/mpqemu-link.c | 78 
 2 files changed, 94 insertions(+)

diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
index 1542e8ed07..52aa89656c 100644
--- a/include/io/mpqemu-link.h
+++ b/include/io/mpqemu-link.h
@@ -17,6 +17,7 @@
 #include "qom/object.h"
 #include "qemu/thread.h"
 #include "io/channel.h"
+#include "io/channel-socket.h"
 
 #define REMOTE_MAX_FDS 8
 
@@ -30,6 +31,7 @@
  */
 typedef enum {
 INIT = 0,
+RET_MSG,
 MAX = INT_MAX,
 } MPQemuCmd;
 
@@ -67,6 +69,20 @@ typedef struct {
 uint8_t *data2;
 } MPQemuMsg;
 
+struct MPQemuRequest {
+MPQemuMsg *msg;
+QIOChannelSocket *sioc;
+Coroutine *co;
+bool finished;
+int error;
+long ret;
+};
+
+typedef struct MPQemuRequest MPQemuRequest;
+
+uint64_t mpqemu_msg_send_reply_co(MPQemuMsg *msg, QIOChannel *ioc,
+  Error **errp);
+
 void mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc);
 int mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc);
 
diff --git a/io/mpqemu-link.c b/io/mpqemu-link.c
index bfc542b5fd..c430b4d6a2 100644
--- a/io/mpqemu-link.c
+++ b/io/mpqemu-link.c
@@ -16,6 +16,8 @@
 #include "qapi/error.h"
 #include "qemu/iov.h"
 #include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "io/channel-socket.h"
 
 void mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc)
 {
@@ -118,6 +120,82 @@ int mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc)
 return 0;
 }
 
+/* Use in proxy only as it clobbers fd handlers. */
+static void coroutine_fn mpqemu_msg_send_co(void *data)
+{
+MPQemuRequest *req = (MPQemuRequest *)data;
+MPQemuMsg msg_reply = {0};
+long ret = -EINVAL;
+
+if (!req->sioc) {
+error_report("No channel available to send command %d",
+ req->msg->cmd);
+atomic_mb_set(&req->finished, true);
+req->error = -EINVAL;
+return;
+}
+
+req->co = qemu_coroutine_self();
+mpqemu_msg_send(req->msg, QIO_CHANNEL(req->sioc));
+
+yield_until_fd_readable(req->sioc->fd);
+
+ret = mpqemu_msg_recv(&msg_reply, QIO_CHANNEL(req->sioc));
+if (ret < 0) {
+error_report("ERROR: failed to get a reply for command %d, \
+ errno %s, ret is %ld",
+ req->msg->cmd, strerror(errno), ret);
+req->error = -errno;
+} else {
+if (!mpqemu_msg_valid(&msg_reply) || msg_reply.cmd != RET_MSG) {
+error_report("ERROR: Invalid reply received for command %d",
+ req->msg->cmd);
+req->error = -EINVAL;
+} else {
+req->ret = msg_reply.data1.u64;
+}
+}
+atomic_mb_set(&req->finished, true);
+}
+
+/*
+ * Create if needed and enter co-routine to send the message to the
+ * remote channel ioc and wait for the reply.
+ * Resturns the value from the reply message, sets the error on failure.
+ */
+
+uint64_t mpqemu_msg_send_reply_co(MPQemuMsg *msg, QIOChannel *ioc,
+  Error **errp)
+{
+MPQemuRequest req = {0};
+uint64_t ret = UINT64_MAX;
+
+req.sioc = QIO_CHANNEL_SOCKET(ioc);
+if (!req.sioc) {
+return ret;
+}
+
+req.msg = msg;
+req.ret = 0;
+req.finished = false;
+
+if (!req.co) {
+req.co = qemu_coroutine_create(mpqemu_msg_send_co, &req);
+}
+
+qemu_coroutine_enter(req.co);
+while (!req.finished) {
+aio_poll(qemu_get_aio_context(), false);
+}
+if (req.error) {
+error_setg(errp, "Error exchanging message with remote process, "\
+"socket %d, error %d", req.sioc->fd, req.error);
+}
+ret = req.ret;
+
+return ret;
+}
+
 bool mpqemu_msg_valid(MPQemuMsg *msg)
 {
 if (msg->cmd >= MAX && msg->cmd < 0) {
-- 
2.25.GIT




[PATCH v7 06/21] multi-process: define MPQemuMsg format and transmission functions

2020-06-27 Thread elena . ufimtseva
From: Elena Ufimtseva 

Defines MPQemuMsg, which is the message that is sent to the remote
process. This message is sent over QIOChannel and is used to
command the remote process to perform various tasks.

Also defined the helper functions to send and receive messages over the
QIOChannel

Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
Signed-off-by: Elena Ufimtseva 
---
 MAINTAINERS  |   2 +
 include/io/mpqemu-link.h |  75 +++
 io/Makefile.objs |   2 +
 io/mpqemu-link.c | 151 +++
 4 files changed, 230 insertions(+)
 create mode 100644 include/io/mpqemu-link.h
 create mode 100644 io/mpqemu-link.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 83aae5b441..50a5fc53d6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2947,6 +2947,8 @@ F: hw/pci-host/remote.c
 F: include/hw/pci-host/remote.h
 F: hw/i386/remote.c
 F: include/hw/i386/remote.h
+F: io/mpqemu-link.c
+F: include/io/mpqemu-link.h
 
 Build and test automation
 -
diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
new file mode 100644
index 00..1542e8ed07
--- /dev/null
+++ b/include/io/mpqemu-link.h
@@ -0,0 +1,75 @@
+/*
+ * Communication channel between QEMU and remote device process
+ *
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef MPQEMU_LINK_H
+#define MPQEMU_LINK_H
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "qom/object.h"
+#include "qemu/thread.h"
+#include "io/channel.h"
+
+#define REMOTE_MAX_FDS 8
+
+#define MPQEMU_MSG_HDR_SIZE offsetof(MPQemuMsg, data1.u64)
+
+/**
+ * MPQemuCmd:
+ *
+ * MPQemuCmd enum type to specify the command to be executed on the remote
+ * device.
+ */
+typedef enum {
+INIT = 0,
+MAX = INT_MAX,
+} MPQemuCmd;
+
+/**
+ * Maximum size of data2 field in the message to be transmitted.
+ */
+#define MPQEMU_MSG_DATA_MAX 256
+
+/**
+ * MPQemuMsg:
+ * @cmd: The remote command
+ * @bytestream: Indicates if the data to be shared is structured (data1)
+ *  or unstructured (data2)
+ * @size: Size of the data to be shared
+ * @data1: Structured data
+ * @fds: File descriptors to be shared with remote device
+ * @data2: Unstructured data
+ *
+ * MPQemuMsg Format of the message sent to the remote device from QEMU.
+ *
+ */
+typedef struct {
+int cmd;
+int bytestream;
+size_t size;
+
+union {
+uint64_t u64;
+} data1;
+
+int fds[REMOTE_MAX_FDS];
+int num_fds;
+
+/* Max size of data2 is MPQEMU_MSG_DATA_MAX */
+uint8_t *data2;
+} MPQemuMsg;
+
+void mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc);
+int mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc);
+
+bool mpqemu_msg_valid(MPQemuMsg *msg);
+
+#endif
diff --git a/io/Makefile.objs b/io/Makefile.objs
index 9a20fce4ed..5875ab0697 100644
--- a/io/Makefile.objs
+++ b/io/Makefile.objs
@@ -10,3 +10,5 @@ io-obj-y += channel-util.o
 io-obj-y += dns-resolver.o
 io-obj-y += net-listener.o
 io-obj-y += task.o
+
+io-obj-$(CONFIG_MPQEMU) += mpqemu-link.o
diff --git a/io/mpqemu-link.c b/io/mpqemu-link.c
new file mode 100644
index 00..bfc542b5fd
--- /dev/null
+++ b/io/mpqemu-link.c
@@ -0,0 +1,151 @@
+/*
+ * Communication channel between QEMU and remote device process
+ *
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "qemu/module.h"
+#include "io/mpqemu-link.h"
+#include "qapi/error.h"
+#include "qemu/iov.h"
+#include "qemu/error-report.h"
+
+void mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc)
+{
+Error *local_err = NULL;
+struct iovec send[2];
+int *fds = NULL;
+size_t nfds = 0;
+
+send[0].iov_base = msg;
+send[0].iov_len = MPQEMU_MSG_HDR_SIZE;
+
+send[1].iov_base = msg->bytestream ? msg->data2 : (void *)&msg->data1;
+send[1].iov_len = msg->size;
+
+if (msg->num_fds) {
+nfds = msg->num_fds;
+fds = msg->fds;
+}
+
+(void)qio_channel_writev_full_all(ioc, send, G_N_ELEMENTS(send), fds, nfds,
+  &local_err);
+if (local_err) {
+error_report_err(local_err);
+}
+}
+
+static int mpqemu_readv(QIOChannel *ioc, struct iovec *iov, int **fds,
+size_t *nfds, Error **errp)
+{
+size_t size, len;
+
+size = iov->iov_len;
+
+while (size > 0) {
+len = qio_channel_readv_full(ioc, iov, 1, fds, nfds, errp);
+
+if (len == QIO_CHANNEL_ERR_BLOCK) {
+if (qemu_in_coroutine()) {
+ 

[PATCH v7 11/21] multi-process: introduce proxy object

2020-06-27 Thread elena . ufimtseva
From: Elena Ufimtseva 

Defines a PCI Device proxy object as a child of TYPE_PCI_DEVICE.

Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
---
 MAINTAINERS|  2 ++
 hw/pci/Makefile.objs   |  1 +
 hw/pci/proxy.c | 70 ++
 include/hw/pci/proxy.h | 43 ++
 4 files changed, 116 insertions(+)
 create mode 100644 hw/pci/proxy.c
 create mode 100644 include/hw/pci/proxy.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 017c96eace..b48c3114c1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2952,6 +2952,8 @@ F: include/io/mpqemu-link.h
 F: hw/i386/remote-msg.c
 F: include/hw/i386/remote-memory.h
 F: hw/i386/remote-memory.c
+F: hw/pci/proxy.c
+F: include/hw/pci/proxy.h
 
 Build and test automation
 -
diff --git a/hw/pci/Makefile.objs b/hw/pci/Makefile.objs
index c78f2fb24b..515dda506c 100644
--- a/hw/pci/Makefile.objs
+++ b/hw/pci/Makefile.objs
@@ -12,3 +12,4 @@ common-obj-$(CONFIG_PCI_EXPRESS) += pcie_port.o pcie_host.o
 
 common-obj-$(call lnot,$(CONFIG_PCI)) += pci-stub.o
 common-obj-$(CONFIG_ALL) += pci-stub.o
+obj-$(CONFIG_MPQEMU) += proxy.o
diff --git a/hw/pci/proxy.c b/hw/pci/proxy.c
new file mode 100644
index 00..6d62399c52
--- /dev/null
+++ b/hw/pci/proxy.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/pci/proxy.h"
+#include "hw/pci/pci.h"
+#include "qapi/error.h"
+#include "io/channel-util.h"
+#include "hw/qdev-properties.h"
+#include "monitor/monitor.h"
+
+static void proxy_set_socket(PCIProxyDev *pdev, int fd, Error **errp)
+{
+pdev->com = qio_channel_new_fd(fd, errp);
+}
+
+static Property proxy_properties[] = {
+DEFINE_PROP_STRING("fd", PCIProxyDev, fd),
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
+{
+PCIProxyDev *dev = PCI_PROXY_DEV(device);
+int proxyfd;
+
+if (dev->fd) {
+proxyfd = monitor_fd_param(cur_mon, dev->fd, errp);
+if (proxyfd == -1) {
+error_prepend(errp, "proxy: unable to parse proxyfd: ");
+return;
+}
+proxy_set_socket(dev, proxyfd, errp);
+}
+}
+
+static void pci_proxy_dev_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+
+k->realize = pci_proxy_dev_realize;
+device_class_set_props(dc, proxy_properties);
+}
+
+static const TypeInfo pci_proxy_dev_type_info = {
+.name  = TYPE_PCI_PROXY_DEV,
+.parent= TYPE_PCI_DEVICE,
+.instance_size = sizeof(PCIProxyDev),
+.class_size= sizeof(PCIProxyDevClass),
+.class_init= pci_proxy_dev_class_init,
+.interfaces = (InterfaceInfo[]) {
+{ INTERFACE_CONVENTIONAL_PCI_DEVICE },
+{ },
+},
+};
+
+static void pci_proxy_dev_register_types(void)
+{
+type_register_static(&pci_proxy_dev_type_info);
+}
+
+type_init(pci_proxy_dev_register_types)
diff --git a/include/hw/pci/proxy.h b/include/hw/pci/proxy.h
new file mode 100644
index 00..c1c7142fa2
--- /dev/null
+++ b/include/hw/pci/proxy.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef PROXY_H
+#define PROXY_H
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/pci/pci.h"
+#include "io/channel.h"
+
+#define TYPE_PCI_PROXY_DEV "pci-proxy-dev"
+
+#define PCI_PROXY_DEV(obj) \
+OBJECT_CHECK(PCIProxyDev, (obj), TYPE_PCI_PROXY_DEV)
+
+#define PCI_PROXY_DEV_CLASS(klass) \
+OBJECT_CLASS_CHECK(PCIProxyDevClass, (klass), TYPE_PCI_PROXY_DEV)
+
+#define PCI_PROXY_DEV_GET_CLASS(obj) \
+OBJECT_GET_CLASS(PCIProxyDevClass, (obj), TYPE_PCI_PROXY_DEV)
+
+typedef struct PCIProxyDev {
+PCIDevice parent_dev;
+char *fd;
+QIOChannel *com;
+} PCIProxyDev;
+
+typedef struct PCIProxyDevClass {
+PCIDeviceClass parent_class;
+
+void (*realize)(PCIProxyDev *dev, Error **errp);
+
+char *command;
+} PCIProxyDevClass;
+
+#endif /* PROXY_H */
-- 
2.25.GIT




[PATCH v7 00/21] Initial support for multi-process qemu

2020-06-27 Thread elena . ufimtseva
From: Elena Ufimtseva 

Hello

This is the v7 of the patchset.
Thank you very much for the detailed feedback for v6. We appreciate your time.

We have addressed the latest comments and suggestions that were
provided on v6 patch series and incorporated to this patchset.

This is the list of changes for v7:
 - QEMU & remote process share the same binary.
   This allowed us to reduce the number of patches as well.

 - We introduced the machine type "remote" that drives the remote process
   initialization.

 - v7 now uses QIOChannel for communication and descriptors management.

 - The remote process uses the main loop instead of a separate loop.

 - Co-routines support in the QEMU Proxy-remote process communication
   The communication model based on co-routines needs some more work and
   we would like to hear your take on it.
   Stefan has shared some ideas how we can proceed and we will take this
   to the next version after additional discussion.
   We did not implement the protocol to listen and accept new connections.

There are other changes that were incorporated from the feedback we have
received on v6.

We posted the Proof Of Concept patches [2] before the BoF session in 2018.
Subsequently, we posted RFC v1 [3], RFC v2 [4], RFC v3 [5], RFC v4 [6],
v5 [7] and v6 [8] of the patch series.
Following people contributed to this patchset:

John G Johnson 
Jagannathan Raman 
Elena Ufimtseva 
Kanth Ghatraju 
Konrad Wilk 

Also we would like to thank QEMU community for your help, suggestions
and reviewing this large series of patches.

For the full concept writeup about QEMU multi-process, please refer to
docs/devel/qemu-multiprocess.rst. Also see docs/qemu-multiprocess.txt for
usage information.

We will post separate patchsets for the following improvements for
the experimental Qemu multi-process:
 - Live migration;
 - communication channel improvements;

We welcome all your ideas, concerns, and questions for this patchset.

[1]: 
http://events17.linuxfoundation.org/sites/events/files/slides/KVM%20FORUM%20multi-process.pdf
[1]: https://www.youtube.com/watch?v=Kq1-coHh7lg
[2]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg566538.html
[3]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg602285.html
[4]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg624877.html
[5]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg642000.html
[6]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg655118.html
[7]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg682429.html
[8]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg697484.html

Elena Ufimtseva (9):
  multi-process: add qio channel function to transmit
  multi-process: define MPQemuMsg format and transmission functions
  multi-process: add co-routines to communicate with remote
  multi-process: Initialize communication channel at the remote end
  multi-process: introduce proxy object
  multi-process: Forward PCI config space acceses to the remote process
  multi-process: heartbeat messages to remote
  multi-process: perform device reset in the remote process
  multi-process: add configure and usage information

Jagannathan Raman (11):
  memory: alloc RAM from file at offset
  multi-process: Add config option for multi-process QEMU
  multi-process: setup PCI host bridge for remote device
  multi-process: setup a machine object for remote device process
  multi-process: Initialize message handler in remote device
  multi-process: setup memory manager for remote device
  multi-process: Connect Proxy Object with device in the remote process
  multi-process: PCI BAR read/write handling for proxy & remote
endpoints
  multi-process: Synchronize remote memory
  multi-process: create IOHUB object to handle irq
  multi-process: Retrieve PCI info from remote process

John G Johnson (1):
  multi-process: add the concept description to
docs/devel/qemu-multiprocess

 MAINTAINERS  |  24 +
 backends/hostmem-memfd.c |   2 +-
 configure|  11 +
 docs/devel/index.rst |   1 +
 docs/devel/multi-process.rst | 957 +++
 docs/multi-process.rst   |  71 ++
 exec.c   |  11 +-
 hw/Makefile.objs |   1 +
 hw/i386/Makefile.objs|   3 +
 hw/i386/remote-memory.c  |  58 ++
 hw/i386/remote-msg.c | 301 +
 hw/i386/remote.c |  99 +++
 hw/misc/ivshmem.c|   3 +-
 hw/pci-host/Makefile.objs|   1 +
 hw/pci-host/remote.c |  63 ++
 hw/pci/Makefile.objs |   2 +
 hw/pci/memory-sync.c | 214 ++
 hw/pci/proxy.c   | 436 
 hw/remote/Makefile.objs  |   1 +
 hw/remote/iohub.c| 153 +
 include/exec/memory.h|   2 +
 include/exec/ram_addr.h 

[PATCH v7 12/21] multi-process: Connect Proxy Object with device in the remote process

2020-06-27 Thread elena . ufimtseva
From: Jagannathan Raman 

Send a message to the remote process to connect PCI device with the
corresponding Proxy object in QEMU

Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
---
 hw/i386/remote-msg.c | 39 +++
 hw/pci/proxy.c   | 28 
 include/hw/pci/proxy.h   |  1 +
 include/io/mpqemu-link.h |  1 +
 io/mpqemu-link.c |  8 
 5 files changed, 77 insertions(+)

diff --git a/hw/i386/remote-msg.c b/hw/i386/remote-msg.c
index 58e24ab2ad..68f50866bb 100644
--- a/hw/i386/remote-msg.c
+++ b/hw/i386/remote-msg.c
@@ -6,6 +6,11 @@
 #include "io/mpqemu-link.h"
 #include "qapi/error.h"
 #include "sysemu/runstate.h"
+#include "io/channel-util.h"
+#include "hw/pci/pci.h"
+
+static void process_connect_dev_msg(MPQemuMsg *msg, QIOChannel *com,
+Error **errp);
 
 gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition cond,
 gpointer opaque)
@@ -34,6 +39,9 @@ gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition 
cond,
 }
 
 switch (msg.cmd) {
+case CONNECT_DEV:
+process_connect_dev_msg(&msg, ioc, &local_err);
+break;
 default:
 error_setg(&local_err, "Unknown command (%d) received from proxy \
in remote process pid=%d", msg.cmd, getpid());
@@ -50,3 +58,34 @@ gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition 
cond,
 
 return TRUE;
 }
+
+static void process_connect_dev_msg(MPQemuMsg *msg, QIOChannel *com,
+Error **errp)
+{
+char *devid = (char *)msg->data2;
+QIOChannel *dioc = NULL;
+DeviceState *dev = NULL;
+MPQemuMsg ret = { 0 };
+int rc = 0;
+
+g_assert(devid && (devid[msg->size - 1] == '\0'));
+
+dev = qdev_find_recursive(sysbus_get_default(), devid);
+if (!dev || !object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
+rc = 0xff;
+goto exit;
+}
+
+dioc = qio_channel_new_fd(msg->fds[0], errp);
+
+qio_channel_add_watch(dioc, G_IO_IN | G_IO_HUP, mpqemu_process_msg,
+  (void *)dev, NULL);
+
+exit:
+ret.cmd = RET_MSG;
+ret.bytestream = 0;
+ret.data1.u64 = rc;
+ret.size = sizeof(ret.data1);
+
+mpqemu_msg_send(&ret, com);
+}
diff --git a/hw/pci/proxy.c b/hw/pci/proxy.c
index 6d62399c52..16649ed0ec 100644
--- a/hw/pci/proxy.c
+++ b/hw/pci/proxy.c
@@ -15,10 +15,38 @@
 #include "io/channel-util.h"
 #include "hw/qdev-properties.h"
 #include "monitor/monitor.h"
+#include "io/mpqemu-link.h"
 
 static void proxy_set_socket(PCIProxyDev *pdev, int fd, Error **errp)
 {
+DeviceState *dev = DEVICE(pdev);
+MPQemuMsg msg = { 0 };
+int fds[2];
+Error *local_err = NULL;
+
 pdev->com = qio_channel_new_fd(fd, errp);
+
+if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds)) {
+error_setg(errp, "Failed to create proxy channel with fd %d", fd);
+return;
+}
+
+msg.cmd = CONNECT_DEV;
+msg.bytestream = 1;
+msg.data2 = (uint8_t *)dev->id;
+msg.size = strlen(dev->id) + 1;
+msg.num_fds = 1;
+msg.fds[0] = fds[1];
+
+(void)mpqemu_msg_send_reply_co(&msg, pdev->com, &local_err);
+if (local_err) {
+error_setg(errp, "Failed to send DEV_CONNECT to the remote process");
+close(fds[0]);
+} else {
+pdev->dev = qio_channel_new_fd(fds[0], errp);
+}
+
+close(fds[1]);
 }
 
 static Property proxy_properties[] = {
diff --git a/include/hw/pci/proxy.h b/include/hw/pci/proxy.h
index c1c7142fa2..72dd7e0944 100644
--- a/include/hw/pci/proxy.h
+++ b/include/hw/pci/proxy.h
@@ -30,6 +30,7 @@ typedef struct PCIProxyDev {
 PCIDevice parent_dev;
 char *fd;
 QIOChannel *com;
+QIOChannel *dev;
 } PCIProxyDev;
 
 typedef struct PCIProxyDevClass {
diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
index c6d2b6bf8b..d620806c17 100644
--- a/include/io/mpqemu-link.h
+++ b/include/io/mpqemu-link.h
@@ -36,6 +36,7 @@
 typedef enum {
 INIT = 0,
 SYNC_SYSMEM,
+CONNECT_DEV,
 RET_MSG,
 MAX = INT_MAX,
 } MPQemuCmd;
diff --git a/io/mpqemu-link.c b/io/mpqemu-link.c
index 5887c8c6c0..54df3b254e 100644
--- a/io/mpqemu-link.c
+++ b/io/mpqemu-link.c
@@ -234,6 +234,14 @@ bool mpqemu_msg_valid(MPQemuMsg *msg)
 return false;
 }
 break;
+case CONNECT_DEV:
+if ((msg->num_fds != 1) ||
+(msg->fds[0] == -1) ||
+(msg->fds[0] == -1) ||
+!msg->bytestream) {
+return false;
+}
+break;
 default:
 break;
 }
-- 
2.25.GIT




[PATCH v7 13/21] multi-process: Forward PCI config space acceses to the remote process

2020-06-27 Thread elena . ufimtseva
From: Elena Ufimtseva 

The Proxy Object sends the PCI config space accesses as messages
to the remote process over the communication channel

TODO:
Investigate if the local PCI config writes can be dropped.
Without the proxy local PCI config space writes for the device,
the driver in the guest times out on the probing.
We have tried to only refer to the remote for the PCI config writes,
but the driver timeout in the guest forced as to left this
as it is (removing local PCI config only).

Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
---
 hw/i386/remote-msg.c | 54 
 hw/pci/proxy.c   | 54 
 include/io/mpqemu-link.h |  8 ++
 io/mpqemu-link.c | 14 +++
 4 files changed, 130 insertions(+)

diff --git a/hw/i386/remote-msg.c b/hw/i386/remote-msg.c
index 68f50866bb..aa5780d521 100644
--- a/hw/i386/remote-msg.c
+++ b/hw/i386/remote-msg.c
@@ -11,10 +11,16 @@
 
 static void process_connect_dev_msg(MPQemuMsg *msg, QIOChannel *com,
 Error **errp);
+static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
+ MPQemuMsg *msg);
+static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
+MPQemuMsg *msg);
 
 gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition cond,
 gpointer opaque)
 {
+DeviceState *dev = (DeviceState *)opaque;
+PCIDevice *pci_dev = PCI_DEVICE(dev);
 Error *local_err = NULL;
 MPQemuMsg msg = { 0 };
 
@@ -42,6 +48,12 @@ gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition 
cond,
 case CONNECT_DEV:
 process_connect_dev_msg(&msg, ioc, &local_err);
 break;
+case PCI_CONFIG_WRITE:
+process_config_write(ioc, pci_dev, &msg);
+break;
+case PCI_CONFIG_READ:
+process_config_read(ioc, pci_dev, &msg);
+break;
 default:
 error_setg(&local_err, "Unknown command (%d) received from proxy \
in remote process pid=%d", msg.cmd, getpid());
@@ -89,3 +101,45 @@ exit:
 
 mpqemu_msg_send(&ret, com);
 }
+
+static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
+ MPQemuMsg *msg)
+{
+struct conf_data_msg *conf = (struct conf_data_msg *)msg->data2;
+MPQemuMsg ret = { 0 };
+
+if (conf->addr >= PCI_CFG_SPACE_EXP_SIZE) {
+error_report("Bad address received when writing PCI config, pid %d",
+ getpid());
+ret.data1.u64 = UINT64_MAX;
+} else {
+pci_default_write_config(dev, conf->addr, conf->val, conf->l);
+}
+
+ret.cmd = RET_MSG;
+ret.bytestream = 0;
+ret.size = sizeof(ret.data1);
+
+mpqemu_msg_send(&ret, ioc);
+}
+
+static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
+MPQemuMsg *msg)
+{
+struct conf_data_msg *conf = (struct conf_data_msg *)msg->data2;
+MPQemuMsg ret = { 0 };
+
+if (conf->addr >= PCI_CFG_SPACE_EXP_SIZE) {
+error_report("Bad address received when reading PCI config, pid %d",
+ getpid());
+ret.data1.u64 = UINT64_MAX;
+} else {
+ret.data1.u64 = pci_default_read_config(dev, conf->addr, conf->l);
+}
+
+ret.cmd = RET_MSG;
+ret.bytestream = 0;
+ret.size = sizeof(ret.data1);
+
+mpqemu_msg_send(&ret, ioc);
+}
diff --git a/hw/pci/proxy.c b/hw/pci/proxy.c
index 16649ed0ec..8934070a20 100644
--- a/hw/pci/proxy.c
+++ b/hw/pci/proxy.c
@@ -16,6 +16,7 @@
 #include "hw/qdev-properties.h"
 #include "monitor/monitor.h"
 #include "io/mpqemu-link.h"
+#include "qemu/error-report.h"
 
 static void proxy_set_socket(PCIProxyDev *pdev, int fd, Error **errp)
 {
@@ -69,12 +70,65 @@ static void pci_proxy_dev_realize(PCIDevice *device, Error 
**errp)
 }
 }
 
+static int config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val,
+  int l, unsigned int op)
+{
+struct conf_data_msg conf_data;
+MPQemuMsg msg = { 0 };
+long ret = -EINVAL;
+Error *local_err = NULL;
+
+conf_data.addr = addr;
+conf_data.val = (op == PCI_CONFIG_WRITE) ? *val : 0;
+conf_data.l = l;
+
+msg.data2 = (uint8_t *)&conf_data;
+
+msg.size = sizeof(conf_data);
+msg.cmd = op;
+msg.bytestream = 1;
+
+ret = mpqemu_msg_send_reply_co(&msg, pdev->dev, &local_err);
+if (local_err) {
+error_report("Failed to exchange PCI_CONFIG message with remote");
+}
+if (op == PCI_CONFIG_READ) {
+*val = (uint32_t)ret;
+}
+
+return ret;
+}
+
+static uint32_t pci_proxy_read_config(PCIDevice *d, uint32_t addr, int len)
+{
+uint32_t val;
+
+(void)config_op

[PATCH v7 18/21] multi-process: heartbeat messages to remote

2020-06-27 Thread elena . ufimtseva
From: Elena Ufimtseva 

In order to detect remote processes which are hung, the
proxy periodically sends heartbeat messages to confirm if
the remote process is alive. The remote process responds
to this heartbeat message to confirm it is alive.

Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
Signed-off-by: Elena Ufimtseva 
---
 hw/i386/remote-msg.c | 14 ++
 hw/pci/proxy.c   | 58 
 include/hw/pci/proxy.h   |  2 ++
 include/io/mpqemu-link.h |  1 +
 io/mpqemu-link.c |  1 +
 5 files changed, 76 insertions(+)

diff --git a/hw/i386/remote-msg.c b/hw/i386/remote-msg.c
index 9379ee6442..919bddc1d5 100644
--- a/hw/i386/remote-msg.c
+++ b/hw/i386/remote-msg.c
@@ -22,6 +22,7 @@ static void process_bar_write(QIOChannel *ioc, MPQemuMsg 
*msg, Error **errp);
 static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
 static void process_get_pci_info_msg(QIOChannel *ioc, MPQemuMsg *msg,
  PCIDevice *pci_dev);
+static void process_proxy_ping_msg(QIOChannel *ioc);
 
 gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition cond,
 gpointer opaque)
@@ -76,6 +77,9 @@ gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition 
cond,
 case GET_PCI_INFO:
 process_get_pci_info_msg(ioc, &msg, pci_dev);
 break;
+case PROXY_PING:
+process_proxy_ping_msg(ioc);
+break;
 default:
 error_setg(&local_err, "Unknown command (%d) received from proxy \
in remote process pid=%d", msg.cmd, getpid());
@@ -269,3 +273,13 @@ static void process_get_pci_info_msg(QIOChannel *ioc, 
MPQemuMsg *msg,
 
 mpqemu_msg_send(&ret, ioc);
 }
+
+static void process_proxy_ping_msg(QIOChannel *ioc)
+{
+MPQemuMsg ret = { 0 };
+
+ret.cmd = RET_MSG;
+ret.size = sizeof(ret.data1);
+
+mpqemu_msg_send(&ret, ioc);
+}
diff --git a/hw/pci/proxy.c b/hw/pci/proxy.c
index 449341e459..e2e9a13287 100644
--- a/hw/pci/proxy.c
+++ b/hw/pci/proxy.c
@@ -24,6 +24,8 @@
 #include "util/event_notifier-posix.c"
 
 static void probe_pci_info(PCIDevice *dev);
+static void start_hb_timer(PCIProxyDev *dev);
+static void pci_proxy_dev_exit(PCIDevice *pdev);
 
 static void proxy_set_socket(PCIProxyDev *pdev, int fd, Error **errp)
 {
@@ -132,6 +134,8 @@ static void pci_proxy_dev_realize(PCIDevice *device, Error 
**errp)
 setup_irqfd(dev);
 
 probe_pci_info(PCI_DEVICE(dev));
+
+start_hb_timer(dev);
 }
 
 static int config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val,
@@ -192,6 +196,7 @@ static void pci_proxy_dev_class_init(ObjectClass *klass, 
void *data)
 k->realize = pci_proxy_dev_realize;
 k->config_read = pci_proxy_read_config;
 k->config_write = pci_proxy_write_config;
+k->exit = pci_proxy_dev_exit;
 
 device_class_set_props(dc, proxy_properties);
 }
@@ -356,3 +361,56 @@ static void probe_pci_info(PCIDevice *dev)
 }
 }
 }
+
+static void hb_msg(PCIProxyDev *dev)
+{
+DeviceState *ds = DEVICE(dev);
+MPQemuMsg msg = { 0 };
+long ret = -EINVAL;
+Error *local_err = NULL;
+
+msg.cmd = PROXY_PING;
+msg.bytestream = 0;
+msg.size = 0;
+
+ret = mpqemu_msg_send_reply_co(&msg, dev->com, &local_err);
+if (local_err) {
+error_report("Lost contact with remote device %s, error code %ld",
+ ds->id, ret);
+}
+}
+
+#define NOP_INTERVAL 1000
+
+static void remote_ping(void *opaque)
+{
+PCIProxyDev *dev = opaque;
+
+hb_msg(dev);
+
+timer_mod(dev->hb_timer,
+  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + NOP_INTERVAL);
+}
+
+static void start_hb_timer(PCIProxyDev *dev)
+{
+dev->hb_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
+ remote_ping,
+ dev);
+
+timer_mod(dev->hb_timer,
+  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + NOP_INTERVAL);
+}
+
+static void stop_hb_timer(PCIProxyDev *dev)
+{
+timer_del(dev->hb_timer);
+timer_free(dev->hb_timer);
+}
+
+static void pci_proxy_dev_exit(PCIDevice *pdev)
+{
+PCIProxyDev *dev = PCI_PROXY_DEV(pdev);
+
+stop_hb_timer(dev);
+}
diff --git a/include/hw/pci/proxy.h b/include/hw/pci/proxy.h
index e6f076ae95..037740309d 100644
--- a/include/hw/pci/proxy.h
+++ b/include/hw/pci/proxy.h
@@ -53,6 +53,8 @@ struct PCIProxyDev {
 EventNotifier intr;
 EventNotifier resample;
 
+QEMUTimer *hb_timer;
+
 ProxyMemoryRegion region[PCI_NUM_REGIONS];
 };
 
diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
index 4b96cb8ccb..676d7eb3ef 100644
--- a/include/io/mpqemu-link.h
+++ b/include/io/mpqemu-link.h
@@ -44,6 +44,7 @@ typedef enum {
 BAR_READ,
 SET_IRQFD,
 GET_PCI_INFO,
+PROXY_PING,
 MAX = INT_MAX,
 } MPQemuCmd;
 
diff --git a/io/mpqemu-link.c b/io/mpqemu-link.c
index d09b2

[PATCH v7 16/21] multi-process: create IOHUB object to handle irq

2020-06-27 Thread elena . ufimtseva
From: Jagannathan Raman 

IOHUB object is added to manage PCI IRQs. It uses KVM_IRQFD
ioctl to create irqfd to injecting PCI interrupts to the guest.
IOHUB object forwards the irqfd to the remote process. Remote process
uses this fd to directly send interrupts to the guest, bypassing QEMU.

Signed-off-by: John G Johnson 
Signed-off-by: Jagannathan Raman 
Signed-off-by: Elena Ufimtseva 
---
 MAINTAINERS   |   2 +
 hw/Makefile.objs  |   1 +
 hw/i386/remote-msg.c  |   4 +
 hw/i386/remote.c  |  15 
 hw/pci/proxy.c|  52 +
 hw/remote/Makefile.objs   |   1 +
 hw/remote/iohub.c | 153 ++
 include/hw/i386/remote.h  |   2 +
 include/hw/pci/pci_ids.h  |   3 +
 include/hw/pci/proxy.h|   8 ++
 include/hw/remote/iohub.h |  50 +
 include/io/mpqemu-link.h  |   6 ++
 io/mpqemu-link.c  |   1 +
 13 files changed, 298 insertions(+)
 create mode 100644 hw/remote/Makefile.objs
 create mode 100644 hw/remote/iohub.c
 create mode 100644 include/hw/remote/iohub.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 38d605445e..f9ede7e094 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2956,6 +2956,8 @@ F: hw/pci/proxy.c
 F: include/hw/pci/proxy.h
 F: hw/pci/memory-sync.c
 F: include/hw/pci/memory-sync.h
+F: hw/remote/iohub.c
+F: include/hw/remote/iohub.h
 
 Build and test automation
 -
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 4cbe5e4e57..8caf659de0 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -40,6 +40,7 @@ devices-dirs-$(CONFIG_MEM_DEVICE) += mem/
 devices-dirs-$(CONFIG_NUBUS) += nubus/
 devices-dirs-y += semihosting/
 devices-dirs-y += smbios/
+devices-dirs-y += remote/
 endif
 
 common-obj-y += $(devices-dirs-y)
diff --git a/hw/i386/remote-msg.c b/hw/i386/remote-msg.c
index 48b153eaae..67fee4bb57 100644
--- a/hw/i386/remote-msg.c
+++ b/hw/i386/remote-msg.c
@@ -10,6 +10,7 @@
 #include "hw/pci/pci.h"
 #include "exec/memattrs.h"
 #include "hw/i386/remote-memory.h"
+#include "hw/remote/iohub.h"
 
 static void process_connect_dev_msg(MPQemuMsg *msg, QIOChannel *com,
 Error **errp);
@@ -67,6 +68,9 @@ gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition 
cond,
 case SYNC_SYSMEM:
 remote_sysmem_reconfig(&msg, &local_err);
 break;
+case SET_IRQFD:
+process_set_irqfd_msg(pci_dev, &msg);
+break;
 default:
 error_setg(&local_err, "Unknown command (%d) received from proxy \
in remote process pid=%d", msg.cmd, getpid());
diff --git a/hw/i386/remote.c b/hw/i386/remote.c
index 5342e884ad..8e74a6f1af 100644
--- a/hw/i386/remote.c
+++ b/hw/i386/remote.c
@@ -17,12 +17,16 @@
 #include "qapi/error.h"
 #include "io/channel-util.h"
 #include "io/channel.h"
+#include "hw/pci/pci_host.h"
+#include "hw/remote/iohub.h"
 
 static void remote_machine_init(MachineState *machine)
 {
 MemoryRegion *system_memory, *system_io, *pci_memory;
 RemMachineState *s = REMOTE_MACHINE(machine);
 RemotePCIHost *rem_host;
+PCIHostState *pci_host;
+PCIDevice *pci_dev;
 
 system_memory = get_system_memory();
 system_io = get_system_io();
@@ -42,6 +46,17 @@ static void remote_machine_init(MachineState *machine)
 memory_region_add_subregion_overlap(system_memory, 0x0, pci_memory, -1);
 
 qdev_realize(DEVICE(rem_host), sysbus_get_default(), &error_fatal);
+
+pci_host = PCI_HOST_BRIDGE(rem_host);
+pci_dev = pci_create_simple_multifunction(pci_host->bus,
+  PCI_DEVFN(REMOTE_IOHUB_DEV,
+REMOTE_IOHUB_FUNC),
+  true, TYPE_REMOTE_IOHUB_DEVICE);
+
+s->iohub = REMOTE_IOHUB_DEVICE(pci_dev);
+
+pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
+ s->iohub, REMOTE_IOHUB_NB_PIRQS);
 }
 
 static void remote_set_socket(Object *obj, const char *str, Error **errp)
diff --git a/hw/pci/proxy.c b/hw/pci/proxy.c
index 5ecbdd2dcf..9d8559b6d4 100644
--- a/hw/pci/proxy.c
+++ b/hw/pci/proxy.c
@@ -19,6 +19,9 @@
 #include "qemu/error-report.h"
 #include "hw/pci/memory-sync.h"
 #include "qom/object.h"
+#include "qemu/event_notifier.h"
+#include "sysemu/kvm.h"
+#include "util/event_notifier-posix.c"
 
 static void proxy_set_socket(PCIProxyDev *pdev, int fd, Error **errp)
 {
@@ -57,6 +60,53 @@ static Property proxy_properties[] = {
 DEFINE_PROP_END_OF_LIST(),
 };
 
+static void proxy_intx_update(PCIDevice *pci_dev)
+{
+PCIProxyDev *dev = PCI_PROXY_DEV(pci_dev);
+PCIINTxRoute route;
+int pin = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1;
+
+if (dev->irqfd.fd) 

[PATCH v7 03/21] multi-process: setup PCI host bridge for remote device

2020-06-27 Thread elena . ufimtseva
From: Jagannathan Raman 

PCI host bridge is setup for the remote device process. It is
implemented using remote-pcihost object. It is an extension of the PCI
host bridge setup by QEMU.
Remote-pcihost configures a PCI bus which could be used by the remote
PCI device to latch on to.

Signed-off-by: Jagannathan Raman 
Signed-off-by: John G Johnson 
Signed-off-by: Elena Ufimtseva 
---
 MAINTAINERS  |  8 +
 hw/pci-host/Makefile.objs|  1 +
 hw/pci-host/remote.c | 63 
 include/hw/pci-host/remote.h | 34 +++
 4 files changed, 106 insertions(+)
 create mode 100644 hw/pci-host/remote.c
 create mode 100644 include/hw/pci-host/remote.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 1b40446c73..e46f1960bf 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2938,6 +2938,14 @@ S: Maintained
 F: hw/semihosting/
 F: include/hw/semihosting/
 
+Multi-process QEMU
+M: Jagannathan Raman 
+M: Elena Ufimtseva 
+M: John G Johnson 
+S: Maintained
+F: hw/pci-host/remote.c
+F: include/hw/pci-host/remote.h
+
 Build and test automation
 -
 Build and test automation
diff --git a/hw/pci-host/Makefile.objs b/hw/pci-host/Makefile.objs
index e422e0aca0..daf900710d 100644
--- a/hw/pci-host/Makefile.objs
+++ b/hw/pci-host/Makefile.objs
@@ -18,6 +18,7 @@ common-obj-$(CONFIG_XEN_IGD_PASSTHROUGH) += xen_igd_pt.o
 common-obj-$(CONFIG_PCI_EXPRESS_Q35) += q35.o
 common-obj-$(CONFIG_PCI_EXPRESS_GENERIC_BRIDGE) += gpex.o
 common-obj-$(CONFIG_PCI_EXPRESS_XILINX) += xilinx-pcie.o
+common-obj-$(CONFIG_MPQEMU) += remote.o
 
 common-obj-$(CONFIG_PCI_EXPRESS_DESIGNWARE) += designware.o
 obj-$(CONFIG_POWERNV) += pnv_phb4.o pnv_phb4_pec.o
diff --git a/hw/pci-host/remote.c b/hw/pci-host/remote.c
new file mode 100644
index 00..5ea9af4154
--- /dev/null
+++ b/hw/pci-host/remote.c
@@ -0,0 +1,63 @@
+/*
+ * Remote PCI host device
+ *
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/pci/pci.h"
+#include "hw/pci/pci_host.h"
+#include "hw/pci/pcie_host.h"
+#include "hw/qdev-properties.h"
+#include "hw/pci-host/remote.h"
+#include "exec/memory.h"
+
+static const char *remote_pcihost_root_bus_path(PCIHostState *host_bridge,
+PCIBus *rootbus)
+{
+return ":00";
+}
+
+static void remote_pcihost_realize(DeviceState *dev, Error **errp)
+{
+char *busname = g_strdup_printf("remote-pci-%ld", (unsigned long)getpid());
+PCIHostState *pci = PCI_HOST_BRIDGE(dev);
+RemotePCIHost *s = REMOTE_HOST_DEVICE(dev);
+
+pci->bus = pci_root_bus_new(DEVICE(s), busname,
+s->mr_pci_mem, s->mr_sys_io,
+0, TYPE_PCIE_BUS);
+}
+
+static void remote_pcihost_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass);
+
+hc->root_bus_path = remote_pcihost_root_bus_path;
+dc->realize = remote_pcihost_realize;
+
+dc->user_creatable = false;
+set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
+dc->fw_name = "pci";
+}
+
+static const TypeInfo remote_pcihost_info = {
+.name = TYPE_REMOTE_HOST_DEVICE,
+.parent = TYPE_PCIE_HOST_BRIDGE,
+.instance_size = sizeof(RemotePCIHost),
+.class_init = remote_pcihost_class_init,
+};
+
+static void remote_pcihost_register(void)
+{
+type_register_static(&remote_pcihost_info);
+}
+
+type_init(remote_pcihost_register)
diff --git a/include/hw/pci-host/remote.h b/include/hw/pci-host/remote.h
new file mode 100644
index 00..3df1b53c17
--- /dev/null
+++ b/include/hw/pci-host/remote.h
@@ -0,0 +1,34 @@
+/*
+ * PCI Host for remote device
+ *
+ * Copyright © 2018, 2020 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef REMOTE_PCIHOST_H
+#define REMOTE_PCIHOST_H
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "exec/memory.h"
+#include "hw/pci/pcie_host.h"
+
+#define TYPE_REMOTE_HOST_DEVICE "remote-pcihost"
+#define REMOTE_HOST_DEVICE(obj) \
+OBJECT_CHECK(RemotePCIHost, (obj), TYPE_REMOTE_HOST_DEVICE)
+
+typedef struct RemotePCIHost {
+/*< private >*/
+PCIExpressHost parent_obj;
+/*< public >*/
+
+MemoryRegion *mr_pci_mem;
+MemoryRegion *mr_sys_mem;
+MemoryRegion *mr_sys_io;
+} RemotePCIHost;
+
+#endif
-- 
2.25.GIT




  1   2   3   4   >