from:"Claudio Imbrenda"

Re: [PATCH 3/5] qemu-options: Remove the deprecated -async-teardown option

2024-01-12 Thread Claudio Imbrenda

On Fri, 12 Jan 2024 11:00:57 +0100
Thomas Huth  wrote:

> It's been marked as deprecated since QEMU 8.1 (and was only available
> since QEMU 8.0 anyway), so it should be fine to remove this now.
> 
> Signed-off-by: Thomas Huth 

Reviewed-by: Claudio Imbrenda 

> ---
>  docs/about/deprecated.rst   |  5 -
>  docs/about/removed-features.rst |  5 +
>  system/vl.c |  6 --
>  qemu-options.hx | 10 --
>  4 files changed, 5 insertions(+), 21 deletions(-)
> 
> diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
> index dff4c76f1b..80eacd40ba 100644
> --- a/docs/about/deprecated.rst
> +++ b/docs/about/deprecated.rst
> @@ -63,11 +63,6 @@ as short-form boolean values, and passed to plugins as 
> ``arg_name=on``.
>  However, short-form booleans are deprecated and full explicit ``arg_name=on``
>  form is preferred.
>  
> -``-async-teardown`` (since 8.1)
> -'''
> -
> -Use ``-run-with async-teardown=on`` instead.
> -
>  ``-chroot`` (since 8.1)
>  '''
>  
> diff --git a/docs/about/removed-features.rst b/docs/about/removed-features.rst
> index ae728b6130..43f64a26ba 100644
> --- a/docs/about/removed-features.rst
> +++ b/docs/about/removed-features.rst
> @@ -472,6 +472,11 @@ Use ``-machine hpet=off`` instead.
>  The ``-no-acpi`` setting has been turned into a machine property.
>  Use ``-machine acpi=off`` instead.
>  
> +``-async-teardown`` (removed in 9.0)
> +
> +
> +Use ``-run-with async-teardown=on`` instead.
> +
>  
>  QEMU Machine Protocol (QMP) commands
>  
> diff --git a/system/vl.c b/system/vl.c
> index 7e258889f3..924356f864 100644
> --- a/system/vl.c
> +++ b/system/vl.c
> @@ -3600,12 +3600,6 @@ void qemu_init(int argc, char **argv)
>  case QEMU_OPTION_daemonize:
>  os_set_daemonize(true);
>  break;
> -#if defined(CONFIG_LINUX)
> -/* deprecated */
> -case QEMU_OPTION_asyncteardown:
> -init_async_teardown();
> -break;
> -#endif
>  case QEMU_OPTION_run_with: {
>  const char *str;
>  opts = qemu_opts_parse_noisily(qemu_find_opts("run-with"),
> diff --git a/qemu-options.hx b/qemu-options.hx
> index dafecf47d6..10c952ba3f 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -4975,16 +4975,6 @@ HXCOMM Internal use
>  DEF("qtest", HAS_ARG, QEMU_OPTION_qtest, "", QEMU_ARCH_ALL)
>  DEF("qtest-log", HAS_ARG, QEMU_OPTION_qtest_log, "", QEMU_ARCH_ALL)
>  
> -#ifdef __linux__
> -DEF("async-teardown", 0, QEMU_OPTION_asyncteardown,
> -"-async-teardown enable asynchronous teardown\n",
> -QEMU_ARCH_ALL)
> -SRST
> -``-async-teardown``
> -This option is deprecated and should no longer be used. The new option
> -``-run-with async-teardown=on`` is a replacement.
> -ERST
> -#endif
>  #ifdef CONFIG_POSIX
>  DEF("run-with", HAS_ARG, QEMU_OPTION_run_with,
>  "-run-with [async-teardown=on|off][,chroot=dir]\n"

Re: [PATCH v2] target/s390x/kvm/pv: Provide some more useful information if decryption fails

2024-01-11 Thread Claudio Imbrenda

On Wed, 10 Jan 2024 15:29:16 +0100
Thomas Huth  wrote:

> It's a common scenario to copy guest images from one host to another
> to run the guest on the other machine. This (of course) does not work
> with "secure exection" guests since they are encrypted with one certain

*execution

with that fixed:

Reviewed-by: Claudio Imbrenda 

> host key. However, if you still (accidentally) do it, you only get a
> very user-unfriendly error message that looks like this:
> 
>  qemu-system-s390x: KVM PV command 2 (KVM_PV_SET_SEC_PARMS) failed:
>   header rc 108 rrc 5 IOCTL rc: -22
> 
> Let's provide at least a somewhat nicer hint to the users so that they
> are able to figure out what might have gone wrong.
> 
> Buglink: https://issues.redhat.com/browse/RHEL-18212
> Signed-off-by: Thomas Huth 
> ---
>  v2: Print the error in s390_machine_protect() instead of doing it
>  in s390_pv_set_sec_parms(), report the text via Error **errp
> 
>  hw/s390x/ipl.h |  2 +-
>  target/s390x/kvm/pv.h  |  2 +-
>  hw/s390x/ipl.c |  5 ++---
>  hw/s390x/s390-virtio-ccw.c |  5 -
>  target/s390x/kvm/pv.c  | 25 -
>  5 files changed, 28 insertions(+), 11 deletions(-)
> 
> diff --git a/hw/s390x/ipl.h b/hw/s390x/ipl.h
> index 7fc86e7905..57cd125769 100644
> --- a/hw/s390x/ipl.h
> +++ b/hw/s390x/ipl.h
> @@ -107,7 +107,7 @@ typedef union IplParameterBlock IplParameterBlock;
>  
>  int s390_ipl_set_loadparm(uint8_t *loadparm);
>  void s390_ipl_update_diag308(IplParameterBlock *iplb);
> -int s390_ipl_prepare_pv_header(void);
> +int s390_ipl_prepare_pv_header(Error **errp);
>  int s390_ipl_pv_unpack(void);
>  void s390_ipl_prepare_cpu(S390CPU *cpu);
>  IplParameterBlock *s390_ipl_get_iplb(void);
> diff --git a/target/s390x/kvm/pv.h b/target/s390x/kvm/pv.h
> index 7b935e2246..fca373a826 100644
> --- a/target/s390x/kvm/pv.h
> +++ b/target/s390x/kvm/pv.h
> @@ -42,7 +42,7 @@ int s390_pv_query_info(void);
>  int s390_pv_vm_enable(void);
>  void s390_pv_vm_disable(void);
>  bool s390_pv_vm_try_disable_async(S390CcwMachineState *ms);
> -int s390_pv_set_sec_parms(uint64_t origin, uint64_t length);
> +int s390_pv_set_sec_parms(uint64_t origin, uint64_t length, Error **errp);
>  int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak);
>  void s390_pv_prep_reset(void);
>  int s390_pv_verify(void);
> diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c
> index 76110e8f58..e934bf89d1 100644
> --- a/hw/s390x/ipl.c
> +++ b/hw/s390x/ipl.c
> @@ -702,7 +702,7 @@ static void s390_ipl_prepare_qipl(S390CPU *cpu)
>  cpu_physical_memory_unmap(addr, len, 1, len);
>  }
>  
> -int s390_ipl_prepare_pv_header(void)
> +int s390_ipl_prepare_pv_header(Error **errp)
>  {
>  IplParameterBlock *ipib = s390_ipl_get_iplb_pv();
>  IPLBlockPV *ipib_pv = &ipib->pv;
> @@ -711,8 +711,7 @@ int s390_ipl_prepare_pv_header(void)
>  
>  cpu_physical_memory_read(ipib_pv->pv_header_addr, hdr,
>   ipib_pv->pv_header_len);
> -rc = s390_pv_set_sec_parms((uintptr_t)hdr,
> -   ipib_pv->pv_header_len);
> +rc = s390_pv_set_sec_parms((uintptr_t)hdr, ipib_pv->pv_header_len, errp);
>  g_free(hdr);
>  return rc;
>  }
> diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
> index 1169e20b94..eaf61d3640 100644
> --- a/hw/s390x/s390-virtio-ccw.c
> +++ b/hw/s390x/s390-virtio-ccw.c
> @@ -391,7 +391,7 @@ static int s390_machine_protect(S390CcwMachineState *ms)
>  }
>  
>  /* Set SE header and unpack */
> -rc = s390_ipl_prepare_pv_header();
> +rc = s390_ipl_prepare_pv_header(&local_err);
>  if (rc) {
>  goto out_err;
>  }
> @@ -410,6 +410,9 @@ static int s390_machine_protect(S390CcwMachineState *ms)
>  return rc;
>  
>  out_err:
> +if (local_err) {
> +error_report_err(local_err);
> +}
>  s390_machine_unprotect(ms);
>  return rc;
>  }
> diff --git a/target/s390x/kvm/pv.c b/target/s390x/kvm/pv.c
> index 6a69be7e5c..7ca7faec73 100644
> --- a/target/s390x/kvm/pv.c
> +++ b/target/s390x/kvm/pv.c
> @@ -29,7 +29,8 @@ static bool info_valid;
>  static struct kvm_s390_pv_info_vm info_vm;
>  static struct kvm_s390_pv_info_dump info_dump;
>  
> -static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, void *data)
> +static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, void *data,
> + int *pvrc)
>  {
>  struct kvm_pv_cmd pv_cmd = {
>  .cmd = cmd,
> @@ -46,6 +47,9 @@ static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, 
> void *data)
>   "IOCTL rc: %d", cmd, cmdname, pv_

Re: [PATCH] target/s390x/kvm/pv: Provide some more useful information if decryption fails

2024-01-09 Thread Claudio Imbrenda

On Tue,  9 Jan 2024 15:30:38 +0100
Thomas Huth  wrote:

> It's a common scenario to copy guest images from one host to another
> to run the guest on the other machine. This (of course) does not work
> with "secure exection" guests since they are encrypted with one certain

"secure execution"

> host key. However, if you still (accidentally) do it, you only get a
> very user-unfriendly error message that looks like this:
> 
>  qemu-system-s390x: KVM PV command 2 (KVM_PV_SET_SEC_PARMS) failed:
>   header rc 108 rrc 5 IOCTL rc: -22
> 
> Let's provide at least a somewhat nicer hint to the users so that they
> are able to figure out what might have gone wrong.
> 
> Buglink: https://issues.redhat.com/browse/RHEL-18212
> Signed-off-by: Thomas Huth 

Reviewed-by: Claudio Imbrenda 

> ---
>  target/s390x/kvm/pv.c | 20 
>  1 file changed, 16 insertions(+), 4 deletions(-)
> 
> diff --git a/target/s390x/kvm/pv.c b/target/s390x/kvm/pv.c
> index 6a69be7e5c..2833a255fa 100644
> --- a/target/s390x/kvm/pv.c
> +++ b/target/s390x/kvm/pv.c
> @@ -29,7 +29,8 @@ static bool info_valid;
>  static struct kvm_s390_pv_info_vm info_vm;
>  static struct kvm_s390_pv_info_dump info_dump;
>  
> -static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, void *data)
> +static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, void *data,
> + int *pvrc)
>  {
>  struct kvm_pv_cmd pv_cmd = {
>  .cmd = cmd,
> @@ -46,6 +47,9 @@ static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, 
> void *data)
>   "IOCTL rc: %d", cmd, cmdname, pv_cmd.rc, pv_cmd.rrc,
>   rc);
>  }
> +if (pvrc) {
> +*pvrc = pv_cmd.rc;
> +}
>  return rc;
>  }
>  
> @@ -53,12 +57,13 @@ static int __s390_pv_cmd(uint32_t cmd, const char 
> *cmdname, void *data)
>   * This macro lets us pass the command as a string to the function so
>   * we can print it on an error.
>   */
> -#define s390_pv_cmd(cmd, data) __s390_pv_cmd(cmd, #cmd, data)
> +#define s390_pv_cmd(cmd, data) __s390_pv_cmd(cmd, #cmd, data, NULL)
> +#define s390_pv_cmd_pvrc(cmd, data, pvrc) __s390_pv_cmd(cmd, #cmd, data, 
> pvrc)
>  #define s390_pv_cmd_exit(cmd, data)\
>  {  \
>  int rc;\
> \
> -rc = __s390_pv_cmd(cmd, #cmd, data);\
> +rc = __s390_pv_cmd(cmd, #cmd, data, NULL); \
>  if (rc) {  \
>  exit(1);   \
>  }  \
> @@ -144,12 +149,19 @@ bool s390_pv_vm_try_disable_async(S390CcwMachineState 
> *ms)
>  
>  int s390_pv_set_sec_parms(uint64_t origin, uint64_t length)
>  {
> +int ret, pvrc;
>  struct kvm_s390_pv_sec_parm args = {
>  .origin = origin,
>  .length = length,
>  };
>  
> -return s390_pv_cmd(KVM_PV_SET_SEC_PARMS, &args);
> +ret = s390_pv_cmd_pvrc(KVM_PV_SET_SEC_PARMS, &args, &pvrc);
> +if (ret && pvrc == 0x108) {
> +error_report("Can't set secure parameters, please check whether "
> + "the image is correctly encrypted for this host");
> +}
> +
> +return ret;
>  }
>  
>  /*

Re: [PATCH v2 3/3] target/s390x/arch_dump: Add arch cleanup function for PV dumps

2023-11-09 Thread Claudio Imbrenda

On Thu,  9 Nov 2023 12:04:43 +
Janosch Frank  wrote:

> PV dumps block vcpu runs until dump end is reached. If there's an
> error between PV dump init and PV dump end the vm will never be able
> to run again. One example of such an error is insufficient disk space
> for the dump file.
> 
> Let's add a cleanup function that tries to do a dump end. The dump
> completion data is discarded but there's no point in writing it to a
> file anyway if there's a possibility that other PV dump data is
> missing.
> 
> Signed-off-by: Janosch Frank 

Reviewed-by: Claudio Imbrenda 

> ---
>  target/s390x/arch_dump.c | 17 +
>  1 file changed, 17 insertions(+)
> 
> diff --git a/target/s390x/arch_dump.c b/target/s390x/arch_dump.c
> index bdb0bfa0e7..7e8a1b4fc0 100644
> --- a/target/s390x/arch_dump.c
> +++ b/target/s390x/arch_dump.c
> @@ -433,6 +433,22 @@ static int arch_sections_write(DumpState *s, uint8_t 
> *buff)
>  return 0;
>  }
>  
> +static void arch_cleanup(DumpState *s)
> +{
> +g_autofree uint8_t *buff = NULL;
> +int rc;
> +
> +if (!pv_dump_initialized) {
> +return;
> +}
> +
> +buff = g_malloc(kvm_s390_pv_dmp_get_size_completion_data());
> +rc = kvm_s390_dump_completion_data(buff);
> +if (!rc) {
> +pv_dump_initialized = false;
> +}
> +}
> +
>  int cpu_get_dump_info(ArchDumpInfo *info,
>const struct GuestPhysBlockList *guest_phys_blocks)
>  {
> @@ -448,6 +464,7 @@ int cpu_get_dump_info(ArchDumpInfo *info,
>  info->arch_sections_add_fn = *arch_sections_add;
>  info->arch_sections_write_hdr_fn = *arch_sections_write_hdr;
>  info->arch_sections_write_fn = *arch_sections_write;
> +info->arch_cleanup_fn = *arch_cleanup;
>  }
>  return 0;
>  }

Re: [PATCH 2/4] target/s390x/dump: Remove unneeded dump info function pointer init

2023-11-07 Thread Claudio Imbrenda

On Tue,  7 Nov 2023 14:20:46 +
Janosch Frank  wrote:

> dump_state_prepare() now sets the function pointers to NULL so we only
> need to touch them if we're going to use them.
> 
> Signed-off-by: Janosch Frank 

I would merge this and the previous patch

> ---
>  target/s390x/arch_dump.c | 4 
>  1 file changed, 4 deletions(-)
> 
> diff --git a/target/s390x/arch_dump.c b/target/s390x/arch_dump.c
> index 51a2116515..bdb0bfa0e7 100644
> --- a/target/s390x/arch_dump.c
> +++ b/target/s390x/arch_dump.c
> @@ -448,10 +448,6 @@ int cpu_get_dump_info(ArchDumpInfo *info,
>  info->arch_sections_add_fn = *arch_sections_add;
>  info->arch_sections_write_hdr_fn = *arch_sections_write_hdr;
>  info->arch_sections_write_fn = *arch_sections_write;
> -} else {
> -info->arch_sections_add_fn = NULL;
> -info->arch_sections_write_hdr_fn = NULL;
> -info->arch_sections_write_fn = NULL;
>  }
>  return 0;
>  }

Re: [PATCH] MAINTAINERS: Fix a couple s390 paths

2023-10-20 Thread Claudio Imbrenda

On Fri, 20 Oct 2023 16:15:09 +0200
Eric Farman  wrote:

> These are simple typos, since the directories don't exist but the
> files themselves do in hw/s390x/
> 
> Fixes: 56e3483402 ("MAINTAINERS: split out s390x sections")
> Signed-off-by: Eric Farman 

Reviewed-by: Claudio Imbrenda 

> ---
>  MAINTAINERS | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 9bd4fe378d..ac71eff7fa 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2574,7 +2574,7 @@ M: Halil Pasic 
>  M: Christian Borntraeger 
>  S: Supported
>  F: hw/s390x/storage-keys.h
> -F: hw/390x/s390-skeys*.c
> +F: hw/s390x/s390-skeys*.c
>  L: qemu-s3...@nongnu.org
>  
>  S390 storage attribute device
> @@ -2582,7 +2582,7 @@ M: Halil Pasic 
>  M: Christian Borntraeger 
>  S: Supported
>  F: hw/s390x/storage-attributes.h
> -F: hw/s390/s390-stattrib*.c
> +F: hw/s390x/s390-stattrib*.c
>  L: qemu-s3...@nongnu.org
>  
>  S390 floating interrupt controller

Re: util/async-teardown.c: is it really needed for --disable-system build?

2023-08-14 Thread Claudio Imbrenda

On Mon, 14 Aug 2023 10:12:35 +0300
Michael Tokarev  wrote:

> 14.08.2023 10:01, Claudio Imbrenda wrote:
> 
> > I think we could guard the offending item with CONFIG_SOFTMMU for now,
> > to immediately fix the issues you raised, and do the refactoring you
> > proposed later (e.g. next cycle).  
> 
> I don't think rushing for the last-minute fix is necessary in this case.

yes and no

it's a bug (which I introduced), and the quick fix seems to be
easy enough, so why not?

> It has real build problem on ia64 only which does not work for several
> releases anyway, and the linking of unnecessary pieces happened for a
> long time too.
> 
> /mjt

Re: util/async-teardown.c: is it really needed for --disable-system build?

2023-08-14 Thread Claudio Imbrenda

On Sat, 12 Aug 2023 12:48:14 +0300
Michael Tokarev  wrote:

> 12.08.2023 12:38, Michael Tokarev wrote:
> ...
> > It smells like, at the very least, os-posix.c should be split. We shouldn't 
> > include
> > a ton of qemu-system functionality (like very specific option parsing) into 
> > qemu-nbd
> > for example.
> > 
> > How about splitting os-posix.c into a few files in util/ (not in the root 
> > dir), and
> > adding them to util_ss in case of posix-os?  Ditto for os-win32.c, I guess, 
> > but I
> > haven't looked at this.
> > 
> > And for the question in $subj, this one needs to be guarded by 
> > CONFIG_SOFTMMU.  
> 
> Or maybe better yet, put the softmmu-specific functions (one very good 
> example here
> is os_parse_cmd_args() function - it clearly belongs to softmmu/, it should 
> never
> have been in global os-foo.c but in some softmmu-os-foo.c instead).  This way,
> async-teardown.c is moved to softmmu/ too, maybe os-linux-async-teardown.c.
> 
> /mjt

I think we could guard the offending item with CONFIG_SOFTMMU for now,
to immediately fix the issues you raised, and do the refactoring you
proposed later (e.g. next cycle).

what do you think?

Re: [PATCH] os-posix: Allow 'chroot' via '-run-with' and deprecate the old '-chroot' option

2023-06-30 Thread Claudio Imbrenda

On Fri, 30 Jun 2023 17:01:12 +0200
Thomas Huth  wrote:

> We recently introduced "-run-with" for options that influence the
> runtime behavior of QEMU. This option has the big advantage that it
> can group related options (so that it is easier for the users to spot
> them) and that the options become introspectable via QMP this way.
> So let's start moving more switches into this option group, starting
> with "-chroot" now.
> 
> Signed-off-by: Thomas Huth 

Reviewed-by: Claudio Imbrenda 

> ---
>  docs/about/deprecated.rst |  5 +
>  os-posix.c| 35 ++-
>  util/async-teardown.c | 21 -
>  qemu-options.hx   | 18 +-
>  4 files changed, 52 insertions(+), 27 deletions(-)
> 
> diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
> index 0743459862..1cf53b86ce 100644
> --- a/docs/about/deprecated.rst
> +++ b/docs/about/deprecated.rst
> @@ -116,6 +116,11 @@ Use "whpx" (on Windows) or "hvf" (on macOS) instead.
>  
>  Use ``-run-with async-teardown=on`` instead.
>  
> +``-chroot`` (since 8.1)
> +'''
> +
> +Use ``-run-with chroot=dir`` instead.
> +
>  ``-singlestep`` (since 8.1)
>  '''
>  
> diff --git a/os-posix.c b/os-posix.c
> index 90ea71725f..0ae1fb2347 100644
> --- a/os-posix.c
> +++ b/os-posix.c
> @@ -38,6 +38,7 @@
>  #include "qemu/cutils.h"
>  #include "qemu/config-file.h"
>  #include "qemu/option.h"
> +#include "qemu/module.h"
>  
>  #ifdef CONFIG_LINUX
>  #include 
> @@ -148,6 +149,7 @@ int os_parse_cmd_args(int index, const char *optarg)
>  }
>  break;
>  case QEMU_OPTION_chroot:
> +warn_report("option is deprecated, use '-run-with chroot=...' 
> instead");
>  chroot_dir = optarg;
>  break;
>  case QEMU_OPTION_daemonize:
> @@ -158,18 +160,25 @@ int os_parse_cmd_args(int index, const char *optarg)
>  case QEMU_OPTION_asyncteardown:
>  init_async_teardown();
>  break;
> +#endif
>  case QEMU_OPTION_run_with: {
> +const char *str;
>  QemuOpts *opts = qemu_opts_parse_noisily(qemu_find_opts("run-with"),
>   optarg, false);
>  if (!opts) {
>  exit(1);
>  }
> +#if defined(CONFIG_LINUX)
>  if (qemu_opt_get_bool(opts, "async-teardown", false)) {
>  init_async_teardown();
>  }
> +#endif
> +str = qemu_opt_get(opts, "chroot");
> +if (str) {
> +chroot_dir = str;
> +}
>  break;
>  }
> -#endif
>  default:
>  return -1;
>  }
> @@ -348,3 +357,27 @@ int os_mlock(void)
>  return -ENOSYS;
>  #endif
>  }
> +
> +static QemuOptsList qemu_run_with_opts = {
> +.name = "run-with",
> +.head = QTAILQ_HEAD_INITIALIZER(qemu_run_with_opts.head),
> +.desc = {
> +#if defined(CONFIG_LINUX)
> +{
> +.name = "async-teardown",
> +.type = QEMU_OPT_BOOL,
> +},
> +#endif
> +{
> +.name = "chroot",
> +.type = QEMU_OPT_STRING,
> +},
> +{ /* end of list */ }
> +},
> +};
> +
> +static void register_teardown(void)
> +{
> +qemu_add_opts(&qemu_run_with_opts);
> +}
> +opts_init(register_teardown);
> diff --git a/util/async-teardown.c b/util/async-teardown.c
> index 3ab19c8740..62cdeb0f20 100644
> --- a/util/async-teardown.c
> +++ b/util/async-teardown.c
> @@ -12,9 +12,6 @@
>   */
>  
>  #include "qemu/osdep.h"
> -#include "qemu/config-file.h"
> -#include "qemu/option.h"
> -#include "qemu/module.h"
>  #include 
>  #include 
>  #include 
> @@ -147,21 +144,3 @@ void init_async_teardown(void)
>  clone(async_teardown_fn, new_stack_for_clone(), CLONE_VM, NULL);
>  sigprocmask(SIG_SETMASK, &old_signals, NULL);
>  }
> -
> -static QemuOptsList qemu_run_with_opts = {
> -.name = "run-with",
> -.head = QTAILQ_HEAD_INITIALIZER(qemu_run_with_opts.head),
> -.desc = {
> -{
> -.name = "async-teardown",
> -.type = QEMU_OPT_BOOL,
> -},
> -{ /* end of list */ }
> -},
> -};
> -
> -static void register_teardown(void)
> -{
> -qemu_add_opts(&qemu_run_with_opts);
> -}
> -opts_init(register_teardown);
> diff --git a/qemu-options.hx b/qemu-options.hx
> index b57489d7ca..f49

Re: [PATCH v3 7/7] pc-bios/s390-ccw: Don't use __bss_start with the "larl" instruction

2023-06-29 Thread Claudio Imbrenda

On Thu, 29 Jun 2023 13:12:26 +0200
Thomas Huth  wrote:

> On 29/06/2023 12.58, Claudio Imbrenda wrote:
> > On Thu, 29 Jun 2023 12:48:21 +0200
> > Thomas Huth  wrote:
> >   
> >> start.S currently cannot be compiled with Clang 16 and binutils 2.40:
> >>
> >>   ld: start.o(.text+0x8): misaligned symbol `__bss_start' (0xc1e5) for
> >>   relocation R_390_PC32DBL
> >>
> >> According to the built-in linker script of ld, the symbol __bss_start
> >> can actually point *before* the .bss section and does not need to have
> >> any alignment, so in certain situations (like when using the internal
> >> assembler of Clang), the __bss_start symbol can indeed be unaligned
> >> and thus it is not suitable for being used with the "larl" instruction
> >> that needs an address that is at least aligned to halfwords.
> >> The problem went unnoticed so far since binutils <= 2.39 did not
> >> check the alignment, but starting with binutils 2.40, such unaligned
> >> addresses are now refused.
> >>
> >> Fix it by loading the address indirectly instead.  
> > 
> > what are the advantages of this solution compared to your previous one
> > (i.e. align .bss) ?  
> 
> __bss_start is supposed to point to an address that is before all bss-like 
> segments. There are also segments like .sbss and .bss.plt on other 
> architectures, see https://bugzilla.redhat.com/show_bug.cgi?id=2216662#c11 .
> Seems like we don't have them on s390x yet, so currently my previous patch 
> is fine, too. But in case there will ever be an extension to the s390x ABI 
> that introduces such additional segments, we have to switch back to 
> __bss_start again. So it sounds slightly more future-proof to me to keep 
> __bss_start here, even if we need a slightly more complex startup code here 
> now.

fair enough

Reviewed-by: Claudio Imbrenda 

> 
>   Thomas
> 
>

Re: [PATCH v3 1/7] s390-ccw: Getting rid of ulong

2023-06-29 Thread Claudio Imbrenda

On Thu, 29 Jun 2023 12:48:15 +0200
Thomas Huth  wrote:

> From: Juan Quintela 
> 
> Any good reason why this still exists?
> I can understand u* and __u* to be linux kernel like, but ulong?

shorter code? ¯\_(ツ)_/¯

> 
> Signed-off-by: Juan Quintela 
> Message-Id: <20230510143925.4094-4-quint...@redhat.com>
> Reviewed-by: Thomas Huth 

Reviewed-by: Claudio Imbrenda 

> Signed-off-by: Thomas Huth 
> ---
>  pc-bios/s390-ccw/helper.h|  2 +-
>  pc-bios/s390-ccw/s390-ccw.h  |  7 +++
>  pc-bios/s390-ccw/virtio-scsi.h   |  2 +-
>  pc-bios/s390-ccw/virtio.h|  4 ++--
>  pc-bios/s390-ccw/virtio-blkdev.c | 12 ++--
>  pc-bios/s390-ccw/virtio-scsi.c   |  4 ++--
>  pc-bios/s390-ccw/virtio.c| 12 ++--
>  7 files changed, 21 insertions(+), 22 deletions(-)
> 
> diff --git a/pc-bios/s390-ccw/helper.h b/pc-bios/s390-ccw/helper.h
> index 3d0731c4c6..8e3dfcb6d6 100644
> --- a/pc-bios/s390-ccw/helper.h
> +++ b/pc-bios/s390-ccw/helper.h
> @@ -38,7 +38,7 @@ static inline void yield(void)
>  
>  static inline void sleep(unsigned int seconds)
>  {
> -ulong target = get_time_seconds() + seconds;
> +unsigned long target = get_time_seconds() + seconds;
>  
>  while (get_time_seconds() < target) {
>  yield();
> diff --git a/pc-bios/s390-ccw/s390-ccw.h b/pc-bios/s390-ccw/s390-ccw.h
> index b88e0550ab..f849fba74b 100644
> --- a/pc-bios/s390-ccw/s390-ccw.h
> +++ b/pc-bios/s390-ccw/s390-ccw.h
> @@ -17,7 +17,6 @@ typedef unsigned char  u8;
>  typedef unsigned short u16;
>  typedef unsigned int   u32;
>  typedef unsigned long long u64;
> -typedef unsigned long  ulong;
>  typedef unsigned char  __u8;
>  typedef unsigned short __u16;
>  typedef unsigned int   __u32;
> @@ -67,11 +66,11 @@ void sclp_get_loadparm_ascii(char *loadparm);
>  int sclp_read(char *str, size_t count);
>  
>  /* virtio.c */
> -unsigned long virtio_load_direct(ulong rec_list1, ulong rec_list2,
> - ulong subchan_id, void *load_addr);
> +unsigned long virtio_load_direct(unsigned long rec_list1, unsigned long 
> rec_list2,
> + unsigned long subchan_id, void *load_addr);
>  bool virtio_is_supported(SubChannelId schid);
>  int virtio_blk_setup_device(SubChannelId schid);
> -int virtio_read(ulong sector, void *load_addr);
> +int virtio_read(unsigned long sector, void *load_addr);
>  
>  /* bootmap.c */
>  void zipl_load(void);
> diff --git a/pc-bios/s390-ccw/virtio-scsi.h b/pc-bios/s390-ccw/virtio-scsi.h
> index e6b6cd4815..c5612e16a2 100644
> --- a/pc-bios/s390-ccw/virtio-scsi.h
> +++ b/pc-bios/s390-ccw/virtio-scsi.h
> @@ -68,7 +68,7 @@ static inline bool virtio_scsi_response_ok(const 
> VirtioScsiCmdResp *r)
>  }
>  
>  int virtio_scsi_read_many(VDev *vdev,
> -  ulong sector, void *load_addr, int sec_num);
> +  unsigned long sector, void *load_addr, int 
> sec_num);
>  int virtio_scsi_setup_device(SubChannelId schid);
>  
>  #endif /* VIRTIO_SCSI_H */
> diff --git a/pc-bios/s390-ccw/virtio.h b/pc-bios/s390-ccw/virtio.h
> index e657d381ec..85bd9d1695 100644
> --- a/pc-bios/s390-ccw/virtio.h
> +++ b/pc-bios/s390-ccw/virtio.h
> @@ -190,14 +190,14 @@ int virtio_get_block_size(void);
>  uint8_t virtio_get_heads(void);
>  uint8_t virtio_get_sectors(void);
>  uint64_t virtio_get_blocks(void);
> -int virtio_read_many(ulong sector, void *load_addr, int sec_num);
> +int virtio_read_many(unsigned long sector, void *load_addr, int sec_num);
>  
>  #define VIRTIO_SECTOR_SIZE 512
>  #define VIRTIO_ISO_BLOCK_SIZE 2048
>  #define VIRTIO_SCSI_BLOCK_SIZE 512
>  #define VIRTIO_DASD_DEFAULT_BLOCK_SIZE 4096
>  
> -static inline ulong virtio_sector_adjust(ulong sector)
> +static inline unsigned long virtio_sector_adjust(unsigned long sector)
>  {
>  return sector * (virtio_get_block_size() / VIRTIO_SECTOR_SIZE);
>  }
> diff --git a/pc-bios/s390-ccw/virtio-blkdev.c 
> b/pc-bios/s390-ccw/virtio-blkdev.c
> index 794f99b42c..a81207b52e 100644
> --- a/pc-bios/s390-ccw/virtio-blkdev.c
> +++ b/pc-bios/s390-ccw/virtio-blkdev.c
> @@ -16,7 +16,7 @@
>  #define VIRTIO_BLK_F_GEOMETRY   (1 << 4)
>  #define VIRTIO_BLK_F_BLK_SIZE   (1 << 6)
>  
> -static int virtio_blk_read_many(VDev *vdev, ulong sector, void *load_addr,
> +static int virtio_blk_read_many(VDev *vdev, unsigned long sector, void 
> *load_addr,
>  int sec_num)
>  {
>  VirtioBlkOuthdr out_hdr;
> @@ -49,7 +49,7 @@ static int virtio_blk_read_many(VDev *vdev, ulong sector, 
> void *load_addr,
>  return status;
>  }
>  
> -int

Re: [PATCH v3 7/7] pc-bios/s390-ccw: Don't use __bss_start with the "larl" instruction

2023-06-29 Thread Claudio Imbrenda

On Thu, 29 Jun 2023 12:48:21 +0200
Thomas Huth  wrote:

> start.S currently cannot be compiled with Clang 16 and binutils 2.40:
> 
>  ld: start.o(.text+0x8): misaligned symbol `__bss_start' (0xc1e5) for
>  relocation R_390_PC32DBL
> 
> According to the built-in linker script of ld, the symbol __bss_start
> can actually point *before* the .bss section and does not need to have
> any alignment, so in certain situations (like when using the internal
> assembler of Clang), the __bss_start symbol can indeed be unaligned
> and thus it is not suitable for being used with the "larl" instruction
> that needs an address that is at least aligned to halfwords.
> The problem went unnoticed so far since binutils <= 2.39 did not
> check the alignment, but starting with binutils 2.40, such unaligned
> addresses are now refused.
> 
> Fix it by loading the address indirectly instead.

what are the advantages of this solution compared to your previous one
(i.e. align .bss) ?

> 
> Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2216662
> Reported-by: Miroslav Rezanina 
> Suggested-by:  Andreas Krebbel 
> Signed-off-by: Thomas Huth 
> ---
>  pc-bios/s390-ccw/start.S | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/pc-bios/s390-ccw/start.S b/pc-bios/s390-ccw/start.S
> index 429a2b30a1..061b06591c 100644
> --- a/pc-bios/s390-ccw/start.S
> +++ b/pc-bios/s390-ccw/start.S
> @@ -19,7 +19,8 @@ _start:
>  larl%r15,stack + STACK_SIZE - STACK_FRAME_SIZE   /* Set up stack */
>  
>  /* clear bss */
> -larl%r2,__bss_start
> +larl%r2,bss_start_literal   /* __bss_start might be unaligned ... */
> +lg  %r2,0(%r2)  /* ... so load it indirectly */
>  larl%r3,_end
>  slgr%r3,%r2/* get sizeof bss */
>  ltgr%r3,%r3/* bss empty? */
> @@ -45,7 +46,6 @@ done:
>  memsetxc:
>  xc  0(1,%r1),0(%r1)
>  
> -
>  /*
>   * void disabled_wait(void)
>   *
> @@ -113,6 +113,8 @@ io_new_code:
>  br  %r14
>  
>  .align  8
> +bss_start_literal:
> +.quad   __bss_start
>  disabled_wait_psw:
>  .quad   0x000200018000,0x
>  enabled_wait_psw:

Re: [PATCH] pc-bios/s390-ccw: Get rid of the __u* types

2023-06-27 Thread Claudio Imbrenda

On Tue, 27 Jun 2023 13:41:01 +0200
Thomas Huth  wrote:

> Using types starting with double underscores should be avoided since these
> names are marked as reserved by the C standard. The corresponding Linux
> kernel header file has also been changed accordingly a long time ago:
> 
>  
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/diff/drivers/s390/cio/cio.h?id=cd6b4f27b9bb2a
> 
> So we should get rid of the __u* in the s390-ccw bios now finally, too.
> 
> Signed-off-by: Thomas Huth 

Reviewed-by: Claudio Imbrenda 

> ---
>  Based-on: <20230510143925.4094-4-quint...@redhat.com>
> 
>  pc-bios/s390-ccw/cio.h  | 232 ++--
>  pc-bios/s390-ccw/s390-ccw.h |   4 -
>  2 files changed, 116 insertions(+), 120 deletions(-)
> 
> diff --git a/pc-bios/s390-ccw/s390-ccw.h b/pc-bios/s390-ccw/s390-ccw.h
> index efeb449572..c977a52b50 100644
> --- a/pc-bios/s390-ccw/s390-ccw.h
> +++ b/pc-bios/s390-ccw/s390-ccw.h
> @@ -17,10 +17,6 @@ typedef unsigned char  u8;
>  typedef unsigned short u16;
>  typedef unsigned int   u32;
>  typedef unsigned long long u64;
> -typedef unsigned char  __u8;
> -typedef unsigned short __u16;
> -typedef unsigned int   __u32;
> -typedef unsigned long long __u64;
>  
>  #define true 1
>  #define false 0
> diff --git a/pc-bios/s390-ccw/cio.h b/pc-bios/s390-ccw/cio.h
> index 88a88adfd2..8b18153deb 100644
> --- a/pc-bios/s390-ccw/cio.h
> +++ b/pc-bios/s390-ccw/cio.h
> @@ -17,32 +17,32 @@
>   * path management control word
>   */
>  struct pmcw {
> -__u32 intparm;  /* interruption parameter */
> -__u32 qf:1; /* qdio facility */
> -__u32 w:1;
> -__u32 isc:3;/* interruption subclass */
> -__u32 res5:3;   /* reserved zeros */
> -__u32 ena:1;/* enabled */
> -__u32 lm:2; /* limit mode */
> -__u32 mme:2;/* measurement-mode enable */
> -__u32 mp:1; /* multipath mode */
> -__u32 tf:1; /* timing facility */
> -__u32 dnv:1;/* device number valid */
> -__u32 dev:16;   /* device number */
> -__u8  lpm;  /* logical path mask */
> -__u8  pnom; /* path not operational mask */
> -__u8  lpum; /* last path used mask */
> -__u8  pim;  /* path installed mask */
> -__u16 mbi;  /* measurement-block index */
> -__u8  pom;  /* path operational mask */
> -__u8  pam;  /* path available mask */
> -__u8  chpid[8]; /* CHPID 0-7 (if available) */
> -__u32 unused1:8;/* reserved zeros */
> -__u32 st:3; /* subchannel type */
> -__u32 unused2:18;   /* reserved zeros */
> -__u32 mbfc:1;   /* measurement block format control */
> -__u32 xmwme:1;  /* extended measurement word mode enable */
> -__u32 csense:1; /* concurrent sense; can be enabled ...*/
> +u32 intparm;/* interruption parameter */
> +u32 qf:1;   /* qdio facility */
> +u32 w:1;
> +u32 isc:3;  /* interruption subclass */
> +u32 res5:3; /* reserved zeros */
> +u32 ena:1;  /* enabled */
> +u32 lm:2;   /* limit mode */
> +u32 mme:2;  /* measurement-mode enable */
> +u32 mp:1;   /* multipath mode */
> +u32 tf:1;   /* timing facility */
> +u32 dnv:1;  /* device number valid */
> +u32 dev:16; /* device number */
> +u8  lpm;/* logical path mask */
> +u8  pnom;   /* path not operational mask */
> +u8  lpum;   /* last path used mask */
> +u8  pim;/* path installed mask */
> +u16 mbi;/* measurement-block index */
> +u8  pom;/* path operational mask */
> +u8  pam;/* path available mask */
> +u8  chpid[8];   /* CHPID 0-7 (if available) */
> +u32 unused1:8;  /* reserved zeros */
> +u32 st:3;   /* subchannel type */
> +u32 unused2:18; /* reserved zeros */
> +u32 mbfc:1; /* measurement block format control */
> +u32 xmwme:1;/* extended measurement word mode enable */
> +u32 csense:1;   /* concurrent sense; can be enabled ...*/
>  /*  ... per MSCH, however, if facility */
>  /*  ... is not installed, this results */
>  /*  ... in an operand exception.   */
> @@ -50,24 +50,24 @@ struct pmcw {
>  
>  /* Target SCHIB configuration. */
>  struct schib_config {
> -__u64 mba;
> -__u32 intparm;
> -__u16 mbi;
> -__u32 isc:3;
> -__u32 ena:1;
> -__

Re: [PATCH v2 3/4] pc-bios/s390-ccw: Move the stack array into start.S

2023-06-27 Thread Claudio Imbrenda

On Tue, 27 Jun 2023 09:47:02 +0200
Thomas Huth  wrote:

> The stack array is only referenced from the start-up code (which is
> shared between the s390-ccw.img and the s390-netboot.img), but it is
> currently declared twice, once in main.c and once in netmain.c.
> It makes more sense to declare this in start.S instead - which will
> also be helpful in the next patch, since we need to mention the .bss
> section in start.S in that patch.
> 
> While we're at it, let's also drop the huge alignment of the stack,
> since there is no technical requirement for aligning it to page
> boundaries.
> 
> Signed-off-by: Thomas Huth 

Reviewed-by: Claudio Imbrenda 

> ---
>  pc-bios/s390-ccw/s390-ccw.h | 1 -
>  pc-bios/s390-ccw/main.c | 1 -
>  pc-bios/s390-ccw/netmain.c  | 1 -
>  pc-bios/s390-ccw/start.S| 6 ++
>  4 files changed, 6 insertions(+), 3 deletions(-)
> 
> diff --git a/pc-bios/s390-ccw/s390-ccw.h b/pc-bios/s390-ccw/s390-ccw.h
> index b88e0550ab..91afcbbca9 100644
> --- a/pc-bios/s390-ccw/s390-ccw.h
> +++ b/pc-bios/s390-ccw/s390-ccw.h
> @@ -55,7 +55,6 @@ void consume_io_int(void);
>  /* main.c */
>  void write_subsystem_identification(void);
>  void write_iplb_location(void);
> -extern char stack[PAGE_SIZE * 8] __attribute__((__aligned__(PAGE_SIZE)));
>  unsigned int get_loadparm_index(void);
>  void main(void);
>  
> diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c
> index a2def83e82..5506798098 100644
> --- a/pc-bios/s390-ccw/main.c
> +++ b/pc-bios/s390-ccw/main.c
> @@ -17,7 +17,6 @@
>  #include "virtio-scsi.h"
>  #include "dasd-ipl.h"
>  
> -char stack[PAGE_SIZE * 8] __attribute__((__aligned__(PAGE_SIZE)));
>  static SubChannelId blk_schid = { .one = 1 };
>  static char loadparm_str[LOADPARM_LEN + 1];
>  QemuIplParameters qipl;
> diff --git a/pc-bios/s390-ccw/netmain.c b/pc-bios/s390-ccw/netmain.c
> index 056e93a818..5cd619b2d6 100644
> --- a/pc-bios/s390-ccw/netmain.c
> +++ b/pc-bios/s390-ccw/netmain.c
> @@ -50,7 +50,6 @@ void write_iplb_location(void) {}
>  /* STSI 3.2.2 offset of first vmdb + offset of uuid inside vmdb */
>  #define STSI322_VMDB_UUID_OFFSET ((8 + 12) * 4)
>  
> -char stack[PAGE_SIZE * 8] __attribute__((aligned(PAGE_SIZE)));
>  IplParameterBlock iplb __attribute__((aligned(PAGE_SIZE)));
>  static char cfgbuf[2048];
>  
> diff --git a/pc-bios/s390-ccw/start.S b/pc-bios/s390-ccw/start.S
> index 29b0a9ece0..111dea261b 100644
> --- a/pc-bios/s390-ccw/start.S
> +++ b/pc-bios/s390-ccw/start.S
> @@ -120,3 +120,9 @@ external_new_mask:
>  .quad   0x00018000
>  io_new_mask:
>  .quad   0x00018000
> +
> +.bss
> +.align  8
> +stack:
> +.space  STACK_SIZE
> +.size   stack,STACK_SIZE

Re: [PATCH v2 1/4] pc-bios/s390-ccw: Fix indentation in start.S

2023-06-27 Thread Claudio Imbrenda

On Tue, 27 Jun 2023 09:47:00 +0200
Thomas Huth  wrote:

> start.S is currently indented with a mixture of spaces and tabs, which
> is quite ugly. QEMU coding style says indentation should be 4 spaces,
> and this is also what we are using in the assembler files in the
> tests/tcg/s390x/ folder already, so let's adjust start.S accordingly.
> 
> Reviewed-by: Cédric Le Goater 
> Signed-off-by: Thomas Huth 

Reviewed-by: Claudio Imbrenda 

> ---
>  pc-bios/s390-ccw/start.S | 136 +++
>  1 file changed, 68 insertions(+), 68 deletions(-)
> 
> diff --git a/pc-bios/s390-ccw/start.S b/pc-bios/s390-ccw/start.S
> index 6072906df4..d29de09cc6 100644
> --- a/pc-bios/s390-ccw/start.S
> +++ b/pc-bios/s390-ccw/start.S
> @@ -10,37 +10,37 @@
>   * directory.
>   */
>  
> -.globl _start
> +.globl _start
>  _start:
>  
> - larl   %r15, stack + 0x8000 /* Set up stack */
> +larl%r15,stack + 0x8000 /* Set up stack */
>  
> - /* clear bss */
> - larl %r2, __bss_start
> - larl %r3, _end
> - slgr %r3, %r2   /* get sizeof bss */
> - ltgr%r3,%r3 /* bss empty? */
> - jz  done
> - aghi%r3,-1
> - srlg%r4,%r3,8   /* how many 256 byte chunks? */
> - ltgr%r4,%r4
> - lgr %r1,%r2
> - jz  remainder
> +/* clear bss */
> +larl%r2,__bss_start
> +larl%r3,_end
> +slgr%r3,%r2/* get sizeof bss */
> +ltgr%r3,%r3/* bss empty? */
> +jz  done
> +aghi%r3,-1
> +srlg%r4,%r3,8  /* how many 256 byte chunks? */
> +ltgr%r4,%r4
> +lgr %r1,%r2
> +jz  remainder
>  loop:
> - xc  0(256,%r1),0(%r1)
> - la  %r1,256(%r1)
> - brctg   %r4,loop
> +xc  0(256,%r1),0(%r1)
> +la  %r1,256(%r1)
> +brctg   %r4,loop
>  remainder:
> - larl%r2,memsetxc
> - ex  %r3,0(%r2)
> +larl%r2,memsetxc
> +ex  %r3,0(%r2)
>  done:
> -/* set up a pgm exception disabled wait psw */
> -larl %r2, disabled_wait_psw
> -mvc  0x01d0(16), 0(%r2)
> -j  main  /* And call C */
> +/* set up a pgm exception disabled wait psw */
> +larl%r2,disabled_wait_psw
> +mvc 0x01d0(16),0(%r2)
> +j   main   /* And call C */
>  
>  memsetxc:
> - xc  0(1,%r1),0(%r1)
> +xc  0(1,%r1),0(%r1)
>  
>  
>  /*
> @@ -48,11 +48,11 @@ memsetxc:
>   *
>   * stops the current guest cpu.
>   */
> - .globl disabled_wait
> +.globl disabled_wait
>  disabled_wait:
> - larl%r1,disabled_wait_psw
> - lpswe   0(%r1)
> -1:   j   1b
> +larl%r1,disabled_wait_psw
> +lpswe   0(%r1)
> +1:  j   1b
>  
>  
>  /*
> @@ -60,61 +60,61 @@ disabled_wait:
>   *
>   * eats one sclp interrupt
>   */
> -.globl consume_sclp_int
> +.globl consume_sclp_int
>  consume_sclp_int:
> -/* enable service interrupts in cr0 */
> -stctg   %c0,%c0,0(%r15)
> -oi  6(%r15),0x2
> -lctlg   %c0,%c0,0(%r15)
> -/* prepare external call handler */
> -larl %r1, external_new_code
> -stg %r1, 0x1b8
> -larl %r1, external_new_mask
> -mvc 0x1b0(8),0(%r1)
> -/* load enabled wait PSW */
> -larl %r1, enabled_wait_psw
> -lpswe 0(%r1)
> +/* enable service interrupts in cr0 */
> +stctg   %c0,%c0,0(%r15)
> +oi  6(%r15),0x2
> +lctlg   %c0,%c0,0(%r15)
> +/* prepare external call handler */
> +larl%r1,external_new_code
> +stg %r1,0x1b8
> +larl%r1,external_new_mask
> +mvc 0x1b0(8),0(%r1)
> +/* load enabled wait PSW */
> +larl%r1,enabled_wait_psw
> +lpswe   0(%r1)
>  
>  /*
>   * void consume_io_int(void)
>   *
>   * eats one I/O interrupt
>   */
> -.globl consume_io_int
> +.globl consume_io_int
>  consume_io_int:
> -/* enable I/O interrupts in cr6 */
> -stctg %c6,%c6,0(%r15)
> -oi4(%r15), 0xff
> -lctlg %c6,%c6,0(%r15)
> -/* prepare i/o call handler */
> -larl  %r1, io_new_code
> -stg   %r1, 0x1f8
> -larl  %r1, io_new_mask
> -mvc   0x1f0(8),0(%r1)
> -/* load enabled wait PSW */
> -larl  %r1, enabled_wait_psw
> -lpswe 0(%r1)
> +/* enable I/O interrupts in cr6 */
> +stctg   %c6,%c6,0(%r15)
> +oi  4(%r15), 0xff
> +lctlg   %c6,%c6,0(%r15)
> +/* prepare i/o call handler */
> +larl%r1,io_new_cod

Re: [PATCH v2 4/4] pc-bios/s390-ccw: Don't use __bss_start with the "larl" instruction

2023-06-27 Thread Claudio Imbrenda

On Tue, 27 Jun 2023 09:47:03 +0200
Thomas Huth  wrote:

> start.S currently cannot be compiled with Clang 16 and binutils 2.40:
> 
>  ld: start.o(.text+0x8): misaligned symbol `__bss_start' (0xc1e5) for
>  relocation R_390_PC32DBL
> 
> According to the built-in linker script of ld, the symbol __bss_start
> can actually point *before* the .bss section and does not need to have
> any alignment, so in certain situations (like when using the internal
> assembler of Clang), the __bss_start symbol can indeed be unaligned
> and thus it is not suitable for being used with the "larl" instruction
> that needs an address that is at least aligned to halfwords.
> The problem went unnoticed so far since binutils <= 2.39 did not
> check the alignment, but starting with binutils 2.40, such unaligned
> addresses are now refused.
> 
> Fix it by using the real start address of the .bss section instead.
> 
> Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2216662
> Reported-by: Miroslav Rezanina 
> Suggested-by: Nick Clifton 
> Signed-off-by: Thomas Huth 
> ---
>  pc-bios/s390-ccw/start.S | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/pc-bios/s390-ccw/start.S b/pc-bios/s390-ccw/start.S
> index 111dea261b..a63c4e3ff2 100644
> --- a/pc-bios/s390-ccw/start.S
> +++ b/pc-bios/s390-ccw/start.S
> @@ -18,7 +18,7 @@ _start:
>  larl%r15,stack + STACK_SIZE - 160   /* Set up stack */
>  
>  /* clear bss */
> -larl%r2,__bss_start
> +larl%r2,.bss
>  larl    %r3,_end

since we are here, do you have guarantees that _end is always correctly
aligned?

if so:

Reviewed-by: Claudio Imbrenda 

>  slgr%r3,%r2/* get sizeof bss */
>  ltgr%r3,%r3/* bss empty? */

Re: [PATCH v2 2/4] pc-bios/s390-ccw: Provide space for initial stack frame in start.S

2023-06-27 Thread Claudio Imbrenda

On Tue, 27 Jun 2023 09:47:01 +0200
Thomas Huth  wrote:

> Providing the space of a stack frame is the duty of the caller,
> so we should reserve 160 bytes before jumping into the main function.
> Otherwise the main() function might write past the stack array.
> 
> While we're at it, add a proper STACK_SIZE macro for the stack size
> instead of using magic numbers (this is also required for the following
> patch).
> 
> Reviewed-by: Christian Borntraeger 
> Reviewed-by: Cédric Le Goater 
> Signed-off-by: Thomas Huth 


with Marc's suggestion applied:

Reviewed-by: Claudio Imbrenda 

> ---
>  pc-bios/s390-ccw/start.S | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/pc-bios/s390-ccw/start.S b/pc-bios/s390-ccw/start.S
> index d29de09cc6..29b0a9ece0 100644
> --- a/pc-bios/s390-ccw/start.S
> +++ b/pc-bios/s390-ccw/start.S
> @@ -10,10 +10,12 @@
>   * directory.
>   */
>  
> +#define STACK_SIZE 0x8000
> +
>  .globl _start
>  _start:
>  
> -larl%r15,stack + 0x8000 /* Set up stack */
> +larl%r15,stack + STACK_SIZE - 160   /* Set up stack */
>  
>  /* clear bss */
>  larl%r2,__bss_start

Re: [PATCH v3 4/6] util/osdep: Introduce qemu_close_range()

2023-06-19 Thread Claudio Imbrenda

On Sat, 17 Jun 2023 13:36:19 +0800
Bin Meng  wrote:

> This introduces a new QEMU API qemu_close_range() that closes all
> open file descriptors from first to last (included).
> 
> This API will try a more efficient call to close_range(), or a walk
> through /proc/self/fd, whenever these are possible; otherwise it
> falls back to a plain close loop.
> 
> Co-developed-by: Zhangjin Wu 
> Signed-off-by: Bin Meng 
> 
> ---
> 
> Changes in v3:
> - fix win32 build failure
> 
> Changes in v2:
> - new patch: "util/osdep: Introduce qemu_close_range()"
> 
>  include/qemu/osdep.h |  1 +
>  util/osdep.c | 48 
>  2 files changed, 49 insertions(+)
> 
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index cc61b00ba9..e22434ce10 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -560,6 +560,7 @@ int qemu_open_old(const char *name, int flags, ...);
>  int qemu_open(const char *name, int flags, Error **errp);
>  int qemu_create(const char *name, int flags, mode_t mode, Error **errp);
>  int qemu_close(int fd);
> +int qemu_close_range(unsigned int first, unsigned int last);
>  int qemu_unlink(const char *name);
>  #ifndef _WIN32
>  int qemu_dup_flags(int fd, int flags);
> diff --git a/util/osdep.c b/util/osdep.c
> index e996c4744a..91275e70f8 100644
> --- a/util/osdep.c
> +++ b/util/osdep.c
> @@ -30,6 +30,7 @@
>  #include "qemu/mprotect.h"
>  #include "qemu/hw-version.h"
>  #include "monitor/monitor.h"
> +#include <dirent.h>
>  
>  static const char *hw_version = QEMU_HW_VERSION;
>  
> @@ -411,6 +412,53 @@ int qemu_close(int fd)
>  return close(fd);
>  }
>  
> +int qemu_close_range(unsigned int first, unsigned int last)
> +{
> +DIR *dir = NULL;
> +
> +#ifdef CONFIG_CLOSE_RANGE
> +int r = close_range(first, last, 0);
> +if (!r) {
> +/* Success, no need to try other ways. */
> +return 0;
> +}
> +#endif
> +
> +#ifdef __linux__
> +dir = opendir("/proc/self/fd");
> +#endif
> +if (!dir) {
> +/*
> + * If /proc is not mounted or /proc/self/fd is not supported,
> + * try close() from first to last.
> + */
> +for (int i = first; i <= last; i++) {
> +close(i);

will this compile on windows?

> +}
> +
> +return 0;
> +}
> +
> +#ifndef _WIN32
> +/* Avoid closing the directory */
> +int dfd = dirfd(dir);
> +
> +for (struct dirent *de = readdir(dir); de; de = readdir(dir)) {
> +int fd = atoi(de->d_name);
> +if (fd < first || fd > last) {
> +/* Exclude the fds outside the target range */
> +continue;
> +}
> +if (fd != dfd) {
> +close(fd);
> +}
> +}
> +closedir(dir);
> +#endif /* _WIN32 */
> +
> +return 0;
> +}
> +
>  /*
>   * Delete a file from the filesystem, unless the filename is /dev/fdset/...
>   *

Re: [PATCH v3 3/6] util/async-teardown: Fall back to close fds one by one

2023-06-19 Thread Claudio Imbrenda

On Sat, 17 Jun 2023 13:36:18 +0800
Bin Meng  wrote:

> When opening /proc/self/fd fails, the current code just returns directly,
> but we can fall back to closing fds one by one.
> 
> Signed-off-by: Bin Meng 
> 
> ---
> 
> (no changes since v2)
> 
> Changes in v2:
> - new patch: "util/async-teardown: Fall back to close fds one by one"
> 
>  util/async-teardown.c | 6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/util/async-teardown.c b/util/async-teardown.c
> index 3ab19c8740..7e0177a8da 100644
> --- a/util/async-teardown.c
> +++ b/util/async-teardown.c
> @@ -48,7 +48,11 @@ static void close_all_open_fd(void)
>  
>  dir = opendir("/proc/self/fd");
>  if (!dir) {
> -/* If /proc is not mounted, there is nothing that can be done. */
> +/* If /proc is not mounted, close fds one by one. */
> +int open_max = sysconf(_SC_OPEN_MAX), i;
> +for (i = 0; i < open_max; i++) {
> +close(i);
> +}
>  return;
>  }
>  /* Avoid closing the directory. */

a few patches later, you replace the whole close_all_open_fd() with a
generic version, I don't see a point in changing the code here.

this patch is useless, just drop it

Re: [PATCH v3 6/6] net: tap: Use qemu_close_range() to close fds

2023-06-19 Thread Claudio Imbrenda

On Sat, 17 Jun 2023 13:36:21 +0800
Bin Meng  wrote:

> From: Zhangjin Wu 
> 
> The current code, which uses a brute-force traversal of all file
> descriptors, does not scale on a system where the maximum number of
> file descriptors is set to a very large value (e.g. in a Docker
> container of the Manjaro distribution it is set to 1073741816). QEMU
> just looks frozen during start-up.
> 
> The close-on-exec flag (O_CLOEXEC) was introduced since Linux kernel
> 2.6.23, FreeBSD 8.3, OpenBSD 5.0, Solaris 11. While it's true QEMU
> doesn't need to manually close the fds for child process as the proper
> O_CLOEXEC flag should have been set properly on files with its own
> codes, QEMU uses a huge number of 3rd party libraries and we don't
> trust them to reliably be using O_CLOEXEC on everything they open.
> 
> Modern Linux and BSDs have the close_range() call we can use to do the
> job, and on Linux we have one more way to walk through /proc/self/fd
> to complete the task efficiently, which is what qemu_close_range() does.
> 
> Reported-by: Zhangjin Wu 
> Co-developed-by: Bin Meng 
> Signed-off-by: Zhangjin Wu 
> Signed-off-by: Bin Meng 
> 
> ---
> 
> (no changes since v2)
> 
> Changes in v2:
> - Change to use qemu_close_range() to close fds for child process efficiently
> - v1 link: 
> https://lore.kernel.org/qemu-devel/20230406112041.798585-1-bm...@tinylab.org/
> 
>  net/tap.c | 23 +++
>  1 file changed, 11 insertions(+), 12 deletions(-)
> 
> diff --git a/net/tap.c b/net/tap.c
> index 1bf085d422..d482fabdff 100644
> --- a/net/tap.c
> +++ b/net/tap.c
> @@ -446,13 +446,13 @@ static void launch_script(const char *setup_script, 
> const char *ifname,
>  return;
>  }
>  if (pid == 0) {
> -int open_max = sysconf(_SC_OPEN_MAX), i;
> +unsigned int last_fd = sysconf(_SC_OPEN_MAX) - 1;
> +
> +/* skip stdin, stdout and stderr */
> +qemu_close_range(3, fd - 1);
> +/* skip the currently used fd */
> +qemu_close_range(fd + 1, last_fd);
>  
> -for (i = 3; i < open_max; i++) {
> -if (i != fd) {
> -close(i);
> -}
> -}
>  parg = args;
>  *parg++ = (char *)setup_script;
>  *parg++ = (char *)ifname;
> @@ -536,16 +536,15 @@ static int net_bridge_run_helper(const char *helper, 
> const char *bridge,
>  return -1;
>  }
>  if (pid == 0) {
> -int open_max = sysconf(_SC_OPEN_MAX), i;
> +unsigned int last_fd = sysconf(_SC_OPEN_MAX) - 1, fd = sv[1];

please put fd on its own line

>  char *fd_buf = NULL;
>  char *br_buf = NULL;
>  char *helper_cmd = NULL;
>  
> -for (i = 3; i < open_max; i++) {
> -if (i != sv[1]) {
> -close(i);
> -}
> -}
> +/* skip stdin, stdout and stderr */
> +qemu_close_range(3, fd - 1);
> +/* skip the currently used fd */
> +qemu_close_range(fd + 1, last_fd);
>  
>  fd_buf = g_strdup_printf("%s%d", "--fd=", sv[1]);
>

Re: [PATCH v3 5/6] util/async-teardown: Use qemu_close_range() to close fds

2023-06-19 Thread Claudio Imbrenda

On Sat, 17 Jun 2023 13:36:20 +0800
Bin Meng  wrote:

> From: Zhangjin Wu 
> 
> Based on the old close_all_open_fd() of util/async-teardown.c, a new
> generic qemu_close_range() has been added in osdep.c.
> 
> Now, let's switch over to use the generic qemu_close_range().
> 
> Signed-off-by: Zhangjin Wu 
> Signed-off-by: Bin Meng 
> 
> ---
> 
> Changes in v3:
> - limit the last_fd of qemu_close_range() to sysconf(_SC_OPEN_MAX)
> 
> Changes in v2:
> - new patch: "util/async-teardown: Use qemu_close_range() to close fds"
> 
>  util/async-teardown.c | 42 ++
>  1 file changed, 2 insertions(+), 40 deletions(-)
> 
> diff --git a/util/async-teardown.c b/util/async-teardown.c
> index 7e0177a8da..e102912f3f 100644
> --- a/util/async-teardown.c
> +++ b/util/async-teardown.c
> @@ -29,44 +29,6 @@
>  
>  static pid_t the_ppid;
>  
> -/*
> - * Close all open file descriptors.
> - */
> -static void close_all_open_fd(void)
> -{
> -struct dirent *de;
> -int fd, dfd;
> -DIR *dir;
> -
> -#ifdef CONFIG_CLOSE_RANGE
> -int r = close_range(0, ~0U, 0);
> -if (!r) {
> -/* Success, no need to try other ways. */
> -return;
> -}
> -#endif
> -
> -dir = opendir("/proc/self/fd");
> -if (!dir) {
> -/* If /proc is not mounted, close fds one by one. */
> -int open_max = sysconf(_SC_OPEN_MAX), i;
> -for (i = 0; i < open_max; i++) {
> -close(i);
> -}
> -return;
> -}
> -/* Avoid closing the directory. */
> -dfd = dirfd(dir);
> -
> -for (de = readdir(dir); de; de = readdir(dir)) {
> -fd = atoi(de->d_name);
> -if (fd != dfd) {
> -close(fd);
> -}
> -}
> -closedir(dir);
> -}
> -
>  static void hup_handler(int signal)
>  {
>  /* Check every second if this process has been reparented. */
> @@ -84,6 +46,7 @@ static int async_teardown_fn(void *arg)
>  struct sigaction sa = { .sa_handler = hup_handler };
>  sigset_t hup_signal;
>  char name[16];
> +int open_max = sysconf(_SC_OPEN_MAX);
>  
>  /* Set a meaningful name for this process. */
>  snprintf(name, 16, "cleanup/%d", the_ppid);
> @@ -92,9 +55,8 @@ static int async_teardown_fn(void *arg)
>  /*
>   * Close all file descriptors that might have been inherited from the
>   * main qemu process when doing clone, needed to make libvirt happy.
> - * Not using close_range for increased compatibility with older kernels.
>   */
> -close_all_open_fd();
> +qemu_close_range(0, open_max - 1);

I think it would look easier to read if you just put the sysconf() call
here and avoid an extra variable

>  
>  /* Set up a handler for SIGHUP and unblock SIGHUP. */
>  sigaction(SIGHUP, &sa, NULL);

[PATCH v2 0/1] s390x/pv: Fix spurious warning with asynchronous teardown

2023-05-10 Thread Claudio Imbrenda

Kernel commit 292a7d6fca33 ("KVM: s390: pv: fix asynchronous teardown
for small VMs") causes the KVM_PV_ASYNC_CLEANUP_PREPARE ioctl to fail
if the VM is not larger than 2GiB. QEMU would attempt it and fail,
print an error message, and then proceed with a normal teardown.

Avoid attempting to use asynchronous teardown altogether when the VM is
not larger than 2 GiB. This will avoid triggering the error message and
also avoid pointless overhead; normal teardown is fast enough for small
VMs.

v1->v2:
* Use 2GiB as threshold, the same as the kernel [thomas]
* Pass the machine state to s390_pv_vm_try_disable_async instead of
  using qdev_get_machine() [thomas]
* Update and improve patch description and comments

Claudio Imbrenda (1):
  s390x/pv: Fix spurious warning with asynchronous teardown

 hw/s390x/pv.c  | 10 --
 hw/s390x/s390-virtio-ccw.c |  2 +-
 include/hw/s390x/pv.h  |  2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)

-- 
2.40.1

[PATCH v2 1/1] s390x/pv: Fix spurious warning with asynchronous teardown

2023-05-10 Thread Claudio Imbrenda

Kernel commit 292a7d6fca33 ("KVM: s390: pv: fix asynchronous teardown
for small VMs") causes the KVM_PV_ASYNC_CLEANUP_PREPARE ioctl to fail
if the VM is not larger than 2GiB. QEMU would attempt it and fail,
print an error message, and then proceed with a normal teardown.

Avoid attempting to use asynchronous teardown altogether when the VM is
not larger than 2 GiB. This will avoid triggering the error message and
also avoid pointless overhead; normal teardown is fast enough for small
VMs.

Reported-by: Marc Hartmayer 
Fixes: c3a073c610 ("s390x/pv: Add support for asynchronous teardown for reboot")
Link: https://lore.kernel.org/all/20230421085036.52511-2-imbre...@linux.ibm.com/
Signed-off-by: Claudio Imbrenda 
---
 hw/s390x/pv.c  | 10 --
 hw/s390x/s390-virtio-ccw.c |  2 +-
 include/hw/s390x/pv.h  |  2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c
index 49ea38236c..b63f3784c6 100644
--- a/hw/s390x/pv.c
+++ b/hw/s390x/pv.c
@@ -13,6 +13,7 @@
 
 #include 
 
+#include "qemu/units.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "sysemu/kvm.h"
@@ -115,7 +116,7 @@ static void *s390_pv_do_unprot_async_fn(void *p)
  return NULL;
 }
 
-bool s390_pv_vm_try_disable_async(void)
+bool s390_pv_vm_try_disable_async(S390CcwMachineState *ms)
 {
 /*
  * t is only needed to create the thread; once qemu_thread_create
@@ -123,7 +124,12 @@ bool s390_pv_vm_try_disable_async(void)
  */
 QemuThread t;
 
-if (!kvm_check_extension(kvm_state, KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) 
{
+/*
+ * If the feature is not present or if the VM is not larger than 2 GiB,
+ * KVM_PV_ASYNC_CLEANUP_PREPARE will fail; no point in attempting it.
+ */
+if ((MACHINE(ms)->maxram_size <= 2 * GiB) ||
+!kvm_check_extension(kvm_state, KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) 
{
 return false;
 }
 if (s390_pv_cmd(KVM_PV_ASYNC_CLEANUP_PREPARE, NULL) != 0) {
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index e6f2c62625..2516b89b32 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -330,7 +330,7 @@ static inline void s390_do_cpu_ipl(CPUState *cs, 
run_on_cpu_data arg)
 
 static void s390_machine_unprotect(S390CcwMachineState *ms)
 {
-if (!s390_pv_vm_try_disable_async()) {
+if (!s390_pv_vm_try_disable_async(ms)) {
 s390_pv_vm_disable();
 }
 ms->pv = false;
diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h
index 966306a9db..6766557fb7 100644
--- a/include/hw/s390x/pv.h
+++ b/include/hw/s390x/pv.h
@@ -41,7 +41,7 @@ static inline bool s390_is_pv(void)
 int s390_pv_query_info(void);
 int s390_pv_vm_enable(void);
 void s390_pv_vm_disable(void);
-bool s390_pv_vm_try_disable_async(void);
+bool s390_pv_vm_try_disable_async(S390CcwMachineState *ms);
 int s390_pv_set_sec_parms(uint64_t origin, uint64_t length);
 int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak);
 void s390_pv_prep_reset(void);
-- 
2.40.1

Re: [PATCH v1 1/1] s390x/pv: Fix spurious warning with asynchronous teardown

2023-05-10 Thread Claudio Imbrenda

On Wed, 10 May 2023 08:47:08 +0200
Thomas Huth  wrote:

> On 09/05/2023 18.27, Claudio Imbrenda wrote:
> > When rebooting a small VM using asynchronous teardown, a spurious
> > warning is emitted when the KVM_PV_ASYNC_CLEANUP_PREPARE ioctl fails.  
> 
> Why does the _PREPARE fail in that case? Why 4GiB and not more or less? This 

because of kernel commit 292a7d6fca33df70ca4b8e9b0d0e74adf87582dc, which
fixes problems in case the VM is small (<2GiB)

> sounds racy... what if you have a faster or slower machine?

why racy?

2 or 4GiB is still very fast, and at some point you have to draw a line.
I could make it 2GiB, which is the limit at which _PREPARE will fail,
but since I'm touching this code anyway, I would like to also avoid
unnecessary overhead, instead of "just fixing" the warning.

I can set the limit to 2GiB if you think that is cleaner.

> 
> > Avoid using asynchronous teardown altogether when the VM is small
> > enough; the cutoff is set at 4GiB. This will avoid triggering the
> > warning and also avoid pointless overhead; normal teardown is fast
> > enough for small VMs.
> > 
> > Reported-by: Marc Hartmayer 
> > Fixes: c3a073c610 ("s390x/pv: Add support for asynchronous teardown for 
> > reboot")
> > Signed-off-by: Claudio Imbrenda 
> > ---
> >   hw/s390x/pv.c | 6 +-
> >   1 file changed, 5 insertions(+), 1 deletion(-)
> > 
> > diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c
> > index 49ea38236c..17c5556319 100644
> > --- a/hw/s390x/pv.c
> > +++ b/hw/s390x/pv.c
> > @@ -13,6 +13,7 @@
> >   
> >   #include 
> >   
> > +#include "qemu/units.h"
> >   #include "qapi/error.h"
> >   #include "qemu/error-report.h"
> >   #include "sysemu/kvm.h"
> > @@ -117,13 +118,16 @@ static void *s390_pv_do_unprot_async_fn(void *p)
> >   
> >   bool s390_pv_vm_try_disable_async(void)
> >   {
> > +MachineState *machine = MACHINE(qdev_get_machine());  
> 
> The calling function (s390_machine_unprotect()) already has a 
> S390CcwMachineState as parameter ... so you could pass along that value to 
> avoid the qdev_get_machine() here.

yes, I was thinking about that and decided against it to avoid changing
interfaces; I'll fix it in the next version

> 
> >   /*
> >* t is only needed to create the thread; once qemu_thread_create
> >* returns, it can safely be discarded.
> >*/
> >   QemuThread t;
> >   
> > -if (!kvm_check_extension(kvm_state, 
> > KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) {
> > +/* Avoid the overhead of asynchronous teardown for small machines */
> > +if ((machine->maxram_size < 4 * GiB) ||
> > +!kvm_check_extension(kvm_state, 
> > KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) {
> >   return false;
> >   }
> >   if (s390_pv_cmd(KVM_PV_ASYNC_CLEANUP_PREPARE, NULL) != 0) {  
> 
>   Thomas
>

[PATCH v1 1/1] s390x/pv: Fix spurious warning with asynchronous teardown

2023-05-09 Thread Claudio Imbrenda

When rebooting a small VM using asynchronous teardown, a spurious
warning is emitted when the KVM_PV_ASYNC_CLEANUP_PREPARE ioctl fails.

Avoid using asynchronous teardown altogether when the VM is small
enough; the cutoff is set at 4GiB. This will avoid triggering the
warning and also avoid pointless overhead; normal teardown is fast
enough for small VMs.

Reported-by: Marc Hartmayer 
Fixes: c3a073c610 ("s390x/pv: Add support for asynchronous teardown for reboot")
Signed-off-by: Claudio Imbrenda 
---
 hw/s390x/pv.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c
index 49ea38236c..17c5556319 100644
--- a/hw/s390x/pv.c
+++ b/hw/s390x/pv.c
@@ -13,6 +13,7 @@
 
 #include 
 
+#include "qemu/units.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "sysemu/kvm.h"
@@ -117,13 +118,16 @@ static void *s390_pv_do_unprot_async_fn(void *p)
 
 bool s390_pv_vm_try_disable_async(void)
 {
+MachineState *machine = MACHINE(qdev_get_machine());
 /*
  * t is only needed to create the thread; once qemu_thread_create
  * returns, it can safely be discarded.
  */
 QemuThread t;
 
-if (!kvm_check_extension(kvm_state, KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) 
{
+/* Avoid the overhead of asynchronous teardown for small machines */
+if ((machine->maxram_size < 4 * GiB) ||
+!kvm_check_extension(kvm_state, KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) 
{
 return false;
 }
 if (s390_pv_cmd(KVM_PV_ASYNC_CLEANUP_PREPARE, NULL) != 0) {
-- 
2.40.1

[PATCH v7 1/1] util/async-teardown: wire up query-command-line-options

2023-05-05 Thread Claudio Imbrenda

Add new -run-with option with an async-teardown=on|off parameter. It is
visible in the output of query-command-line-options QMP command, so it
can be discovered and used by libvirt.

The option -async-teardown is now redundant, deprecate it.

Reported-by: Boris Fiuczynski 
Fixes: c891c24b1a ("os-posix: asynchronous teardown for shutdown on Linux")
Signed-off-by: Claudio Imbrenda 
---
 docs/about/deprecated.rst |  5 +
 os-posix.c| 13 +
 qemu-options.hx   | 34 +++---
 util/async-teardown.c | 21 +
 4 files changed, 62 insertions(+), 11 deletions(-)

diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 1ca9dc33d6..0986db9a86 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -111,6 +111,11 @@ Use ``-machine acpi=off`` instead.
 The HAXM project has been retired (see https://github.com/intel/haxm#status).
 Use "whpx" (on Windows) or "hvf" (on macOS) instead.
 
+``-async-teardown`` (since 8.1)
+,,,
+
+Use ``-run-with async-teardown=on`` instead.
+
 
 QEMU Machine Protocol (QMP) commands
 
diff --git a/os-posix.c b/os-posix.c
index 5adc69f560..ef910aaf94 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -36,6 +36,8 @@
 #include "qemu/log.h"
 #include "sysemu/runstate.h"
 #include "qemu/cutils.h"
+#include "qemu/config-file.h"
+#include "qemu/option.h"
 
 #ifdef CONFIG_LINUX
 #include 
@@ -152,9 +154,20 @@ int os_parse_cmd_args(int index, const char *optarg)
 daemonize = 1;
 break;
 #if defined(CONFIG_LINUX)
+/* deprecated */
 case QEMU_OPTION_asyncteardown:
 init_async_teardown();
 break;
+case QEMU_OPTION_run_with:
+QemuOpts *opts = qemu_opts_parse_noisily(qemu_find_opts("run-with"),
+ optarg, false);
+if (!opts) {
+exit(1);
+}
+if (qemu_opt_get_bool(opts, "async-teardown", false)) {
+init_async_teardown();
+}
+break;
 #endif
 default:
 return -1;
diff --git a/qemu-options.hx b/qemu-options.hx
index b5efa648ba..24a1d63bbe 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4799,20 +4799,32 @@ DEF("qtest-log", HAS_ARG, QEMU_OPTION_qtest_log, "", 
QEMU_ARCH_ALL)
 DEF("async-teardown", 0, QEMU_OPTION_asyncteardown,
 "-async-teardown enable asynchronous teardown\n",
 QEMU_ARCH_ALL)
-#endif
 SRST
 ``-async-teardown``
-Enable asynchronous teardown. A new process called "cleanup/<PID>"
-will be created at startup sharing the address space with the main qemu
-process, using clone. It will wait for the main qemu process to
-terminate completely, and then exit.
-This allows qemu to terminate very quickly even if the guest was
-huge, leaving the teardown of the address space to the cleanup
-process. Since the cleanup process shares the same cgroups as the
-main qemu process, accounting is performed correctly. This only
-works if the cleanup process is not forcefully killed with SIGKILL
-before the main qemu process has terminated completely.
+This option is deprecated and should no longer be used. The new option
+``-run-with async-teardown=on`` is a replacement.
 ERST
+DEF("run-with", HAS_ARG, QEMU_OPTION_run_with,
+"-run-with async-teardown[=on|off]\n"
+"misc QEMU process lifecycle options\n"
+"async-teardown=on enables asynchronous teardown\n",
+QEMU_ARCH_ALL)
+SRST
+``-run-with``
+Set QEMU process lifecycle options.
+
+``async-teardown=on`` enables asynchronous teardown. A new process called
+"cleanup/<PID>" will be created at startup sharing the address
+space with the main QEMU process, using clone. It will wait for the
+main QEMU process to terminate completely, and then exit. This allows
+QEMU to terminate very quickly even if the guest was huge, leaving the
+teardown of the address space to the cleanup process. Since the cleanup
+process shares the same cgroups as the main QEMU process, accounting is
+performed correctly. This only works if the cleanup process is not
+forcefully killed with SIGKILL before the main QEMU process has
+terminated completely.
+ERST
+#endif
 
 DEF("msg", HAS_ARG, QEMU_OPTION_msg,
 "-msg [timestamp[=on|off]][,guest-name=[on|off]]\n"
diff --git a/util/async-teardown.c b/util/async-teardown.c
index 62cdeb0f20..3ab19c8740 100644
--- a/util/async-teardown.c
+++ b/util/async-teardown.c
@@ -12,6 +12,9 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/config-file.h"
+#include "qemu/option.h"
+#include "qemu/module.h"
 #includ

[PATCH v7 0/1] util/async-teardown: appear in query-command-line-options

2023-05-05 Thread Claudio Imbrenda

Add new -run-with option with an async-teardown=on|off parameter. It
is visible in the output of query-command-line-options QMP command, so
it can be discovered and used by libvirt.

The option -async-teardown is now redundant, deprecate it.

v6->v7
* move variable declaration inside #ifdef to avoid compile time errors
  on *BSD due to unused variables. [thomas]

v5->v6
* deprecate the old -async-teardown option instead of removing it,
  since it has now appeared in 2 QEMU releases
* use -run-with as a grab bag commandline option for the async-teardown
  boolean parameter [paolo,markus,thomas]

v4->v5
* reword commit message [Markus]
* document the removal of the -async-teardown commandline option in
  docs/about/removed-features.rst [Markus]

v3->v4
* completely remove the useless -async-teardown option, since it was
  not wired up properly and it had no users [thomas]
* QEMU should be always uppercase in text and documentation [thomas]
* if the new -teardown option fails to parse, exit immediately instead
  of returning an error [thomas]

v2->v3
* add a new teardown option with an async parameter [Markus]
* reworded documentation of existing -async-teardown option so that it
  points to the new teardown option

v1->v2
* remove the unneeded .implied_opt_name initializer [Thomas]

Claudio Imbrenda (1):
  util/async-teardown: wire up query-command-line-options

 docs/about/deprecated.rst |  5 +
 os-posix.c| 13 +
 qemu-options.hx   | 34 +++---
 util/async-teardown.c | 21 +
 4 files changed, 62 insertions(+), 11 deletions(-)

-- 
2.40.1

Re: [PATCH v6 1/1] util/async-teardown: wire up query-command-line-options

2023-04-28 Thread Claudio Imbrenda

On Fri, 28 Apr 2023 14:16:42 +0200
Thomas Huth  wrote:

> On 28/04/2023 13.12, Claudio Imbrenda wrote:
> > Add new -run-with option with an async-teardown=on|off parameter. It is
> > visible in the output of query-command-line-options QMP command, so it
> > can be discovered and used by libvirt.
> > 
> > The option -async-teardown is now redundant, deprecate it.
> > 
> > Reported-by: Boris Fiuczynski 
> > Fixes: c891c24b1a ("os-posix: asynchronous teardown for shutdown on Linux")
> > Signed-off-by: Claudio Imbrenda 
> > ---
> >   docs/about/deprecated.rst |  5 +
> >   os-posix.c| 15 +++
> >   qemu-options.hx   | 34 +++---
> >   util/async-teardown.c | 21 +
> >   4 files changed, 64 insertions(+), 11 deletions(-)
> > 
> > diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
> > index 1ca9dc33d6..0986db9a86 100644
> > --- a/docs/about/deprecated.rst
> > +++ b/docs/about/deprecated.rst
> > @@ -111,6 +111,11 @@ Use ``-machine acpi=off`` instead.
> >   The HAXM project has been retired (see 
> > https://github.com/intel/haxm#status).
> >   Use "whpx" (on Windows) or "hvf" (on macOS) instead.
> >   
> > +``-async-teardown`` (since 8.1)
> > +,,,
> > +
> > +Use ``-run-with async-teardown=on`` instead.
> > +
> >   
> >   QEMU Machine Protocol (QMP) commands
> >   
> > diff --git a/os-posix.c b/os-posix.c
> > index 5adc69f560..117ad2bdc1 100644
> > --- a/os-posix.c
> > +++ b/os-posix.c
> > @@ -36,6 +36,8 @@
> >   #include "qemu/log.h"
> >   #include "sysemu/runstate.h"
> >   #include "qemu/cutils.h"
> > +#include "qemu/config-file.h"
> > +#include "qemu/option.h"
> >   
> >   #ifdef CONFIG_LINUX
> >   #include 
> > @@ -132,6 +134,8 @@ static bool os_parse_runas_uid_gid(const char *optarg)
> >*/
> >   int os_parse_cmd_args(int index, const char *optarg)
> >   {
> > +QemuOpts *opts;  
> 
> Fails to compile on FreeBSD:
> 
> ../src/os-posix.c:137:15: error: unused variable 'opts' 
> [-Werror,-Wunused-variable]
>  QemuOpts *opts;
>^
> 1 error generated.
> 
> Apart from that, the patch looks fine to me.

oops, I'll move the variable inside the ifdef

> 
>   Thomas
>

[PATCH v6 1/1] util/async-teardown: wire up query-command-line-options

2023-04-28 Thread Claudio Imbrenda

Add new -run-with option with an async-teardown=on|off parameter. It is
visible in the output of query-command-line-options QMP command, so it
can be discovered and used by libvirt.

The option -async-teardown is now redundant, deprecate it.

Reported-by: Boris Fiuczynski 
Fixes: c891c24b1a ("os-posix: asynchronous teardown for shutdown on Linux")
Signed-off-by: Claudio Imbrenda 
---
 docs/about/deprecated.rst |  5 +
 os-posix.c| 15 +++
 qemu-options.hx   | 34 +++---
 util/async-teardown.c | 21 +
 4 files changed, 64 insertions(+), 11 deletions(-)

diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 1ca9dc33d6..0986db9a86 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -111,6 +111,11 @@ Use ``-machine acpi=off`` instead.
 The HAXM project has been retired (see https://github.com/intel/haxm#status).
 Use "whpx" (on Windows) or "hvf" (on macOS) instead.
 
+``-async-teardown`` (since 8.1)
+,,,
+
+Use ``-run-with async-teardown=on`` instead.
+
 
 QEMU Machine Protocol (QMP) commands
 
diff --git a/os-posix.c b/os-posix.c
index 5adc69f560..117ad2bdc1 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -36,6 +36,8 @@
 #include "qemu/log.h"
 #include "sysemu/runstate.h"
 #include "qemu/cutils.h"
+#include "qemu/config-file.h"
+#include "qemu/option.h"
 
 #ifdef CONFIG_LINUX
 #include 
@@ -132,6 +134,8 @@ static bool os_parse_runas_uid_gid(const char *optarg)
  */
 int os_parse_cmd_args(int index, const char *optarg)
 {
+QemuOpts *opts;
+
 switch (index) {
 case QEMU_OPTION_runas:
 user_pwd = getpwnam(optarg);
@@ -152,9 +156,20 @@ int os_parse_cmd_args(int index, const char *optarg)
 daemonize = 1;
 break;
 #if defined(CONFIG_LINUX)
+/* deprecated */
 case QEMU_OPTION_asyncteardown:
 init_async_teardown();
 break;
+case QEMU_OPTION_run_with:
+opts = qemu_opts_parse_noisily(qemu_find_opts("run-with"),
+   optarg, false);
+if (!opts) {
+exit(1);
+}
+if (qemu_opt_get_bool(opts, "async-teardown", false)) {
+init_async_teardown();
+}
+break;
 #endif
 default:
 return -1;
diff --git a/qemu-options.hx b/qemu-options.hx
index b5efa648ba..24a1d63bbe 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4799,20 +4799,32 @@ DEF("qtest-log", HAS_ARG, QEMU_OPTION_qtest_log, "", 
QEMU_ARCH_ALL)
 DEF("async-teardown", 0, QEMU_OPTION_asyncteardown,
 "-async-teardown enable asynchronous teardown\n",
 QEMU_ARCH_ALL)
-#endif
 SRST
 ``-async-teardown``
-Enable asynchronous teardown. A new process called "cleanup/<QEMU_PID>"
-will be created at startup sharing the address space with the main qemu
-process, using clone. It will wait for the main qemu process to
-terminate completely, and then exit.
-This allows qemu to terminate very quickly even if the guest was
-huge, leaving the teardown of the address space to the cleanup
-process. Since the cleanup process shares the same cgroups as the
-main qemu process, accounting is performed correctly. This only
-works if the cleanup process is not forcefully killed with SIGKILL
-before the main qemu process has terminated completely.
+This option is deprecated and should no longer be used. The new option
+``-run-with async-teardown=on`` is a replacement.
 ERST
+DEF("run-with", HAS_ARG, QEMU_OPTION_run_with,
+"-run-with async-teardown[=on|off]\n"
+"misc QEMU process lifecycle options\n"
+"async-teardown=on enables asynchronous teardown\n",
+QEMU_ARCH_ALL)
+SRST
+``-run-with``
+Set QEMU process lifecycle options.
+
+``async-teardown=on`` enables asynchronous teardown. A new process called
+"cleanup/<QEMU_PID>" will be created at startup sharing the address
+space with the main QEMU process, using clone. It will wait for the
+main QEMU process to terminate completely, and then exit. This allows
+QEMU to terminate very quickly even if the guest was huge, leaving the
+teardown of the address space to the cleanup process. Since the cleanup
+process shares the same cgroups as the main QEMU process, accounting is
+performed correctly. This only works if the cleanup process is not
+forcefully killed with SIGKILL before the main QEMU process has
+terminated completely.
+ERST
+#endif
 
 DEF("msg", HAS_ARG, QEMU_OPTION_msg,
 "-msg [timestamp[=on|off]][,guest-name=[on|off]]\n"
diff --git a/util/async-teardown.c b/util/async-teardown.c
index 62cdeb0f20..3ab19c8740 100644
--- a/util/

[PATCH v6 0/1] util/async-teardown: appear in query-command-line-options

2023-04-28 Thread Claudio Imbrenda

Add new -run-with option with an async-teardown=on|off parameter. It
is visible in the output of query-command-line-options QMP command, so
it can be discovered and used by libvirt.

The option -async-teardown is now redundant, deprecate it.

v5->v6
* deprecate the old -async-teardown option instead of removing it,
  since it has now appeared in 2 QEMU releases
* use -run-with as a grab bag commandline option for the async-teardown
  boolean parameter [paolo,markus,thomas]

v4->v5
* reword commit message [Markus]
* document the removal of the -async-teardown commandline option in
  docs/about/removed-features.rst [Markus]

v3->v4
* completely remove the useless -async-teardown option, since it was
  not wired up properly and it had no users [thomas]
* QEMU should be always uppercase in text and documentation [thomas]
* if the new -teardown option fails to parse, exit immediately instead
  of returning an error [thomas]

v2->v3
* add a new teardown option with an async parameter [Markus]
* reworded documentation of existing -async-teardown option so that it
  points to the new teardown option

v1->v2
* remove the unneeded .implied_opt_name initializer [Thomas]

Claudio Imbrenda (1):
  util/async-teardown: wire up query-command-line-options

 docs/about/deprecated.rst |  5 +
 os-posix.c| 15 +++
 qemu-options.hx   | 34 +++---
 util/async-teardown.c | 21 +
 4 files changed, 64 insertions(+), 11 deletions(-)

-- 
2.40.0

[PATCH v5 0/1] util/async-teardown: appear in query-command-line-options

2023-03-27 Thread Claudio Imbrenda

Add new -teardown option with an async=on|off parameter. It is visible
in the output of query-command-line-options QMP command, so it can be
discovered and used by libvirt.

The option -async-teardown is now redundant. We'd normally deprecate it
and remove it after a grace period, but it was introduced only in the
previous version and it had no users, since it was not visible in the
query-command-line-options QMP command. Drop it.


v4->v5
* reword commit message [Markus]
* document the removal of the -async-teardown commandline option in
  docs/about/removed-features.rst [Markus]

v3->v4
* completely remove the useless -async-teardown option, since it was
  not wired up properly and it had no users [thomas]
* QEMU should be always uppercase in text and documentation [thomas]
* if the new -teardown option fails to parse, exit immediately instead
  of returning an error [thomas]

v2->v3
* add a new teardown option with an async parameter [Markus]
* reworded documentation of existing -async-teardown option so that it
  points to the new teardown option

v1->v2
* remove the unneeded .implied_opt_name initializer [Thomas]

Claudio Imbrenda (1):
  util/async-teardown: wire up query-command-line-options

 docs/about/removed-features.rst |  5 +
 os-posix.c  | 15 +--
 qemu-options.hx | 33 +++--
 util/async-teardown.c   | 21 +
 4 files changed, 58 insertions(+), 16 deletions(-)

-- 
2.39.2

[PATCH v5 1/1] util/async-teardown: wire up query-command-line-options

2023-03-27 Thread Claudio Imbrenda

Add new -teardown option with an async=on|off parameter. It is visible
in the output of query-command-line-options QMP command, so it can be
discovered and used by libvirt.

The option -async-teardown is now redundant. We'd normally deprecate it
and remove it after a grace period, but it was introduced only in the
previous version and it had no users, since it was not visible in the
query-command-line-options QMP command. Drop it.

Reported-by: Boris Fiuczynski 
Fixes: c891c24b1a ("os-posix: asynchronous teardown for shutdown on Linux")
Signed-off-by: Claudio Imbrenda 
---
 docs/about/removed-features.rst |  5 +
 os-posix.c  | 15 +--
 qemu-options.hx | 33 +++--
 util/async-teardown.c   | 21 +
 4 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/docs/about/removed-features.rst b/docs/about/removed-features.rst
index 5b258b446b..6d89f69be9 100644
--- a/docs/about/removed-features.rst
+++ b/docs/about/removed-features.rst
@@ -416,6 +416,11 @@ Input parameters that take a size value should only use a 
size suffix
 the value is hexadecimal.  That is, '0x20M' should be written either as
 '32M' or as '0x200'.
 
+``-async-teardown`` (removed in 8.0)
+
+
+Use ``-teardown async=on`` instead.
+
 ``-chardev`` backend aliases ``tty`` and ``parport`` (removed in 8.0)
 '
 
diff --git a/os-posix.c b/os-posix.c
index 5adc69f560..c1ca7b1cb3 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -36,6 +36,8 @@
 #include "qemu/log.h"
 #include "sysemu/runstate.h"
 #include "qemu/cutils.h"
+#include "qemu/config-file.h"
+#include "qemu/option.h"
 
 #ifdef CONFIG_LINUX
 #include 
@@ -132,6 +134,8 @@ static bool os_parse_runas_uid_gid(const char *optarg)
  */
 int os_parse_cmd_args(int index, const char *optarg)
 {
+QemuOpts *opts;
+
 switch (index) {
 case QEMU_OPTION_runas:
 user_pwd = getpwnam(optarg);
@@ -152,8 +156,15 @@ int os_parse_cmd_args(int index, const char *optarg)
 daemonize = 1;
 break;
 #if defined(CONFIG_LINUX)
-case QEMU_OPTION_asyncteardown:
-init_async_teardown();
+case QEMU_OPTION_teardown:
+opts = qemu_opts_parse_noisily(qemu_find_opts("teardown"),
+   optarg, false);
+if (!opts) {
+exit(1);
+}
+if (qemu_opt_get_bool(opts, "async", false)) {
+init_async_teardown();
+}
 break;
 #endif
 default:
diff --git a/qemu-options.hx b/qemu-options.hx
index d42f60fb91..6a69b84f3c 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4763,23 +4763,28 @@ DEF("qtest", HAS_ARG, QEMU_OPTION_qtest, "", 
QEMU_ARCH_ALL)
 DEF("qtest-log", HAS_ARG, QEMU_OPTION_qtest_log, "", QEMU_ARCH_ALL)
 
 #ifdef __linux__
-DEF("async-teardown", 0, QEMU_OPTION_asyncteardown,
-"-async-teardown enable asynchronous teardown\n",
+DEF("teardown", HAS_ARG, QEMU_OPTION_teardown,
+"-teardown async[=on|off]\n"
+"process teardown options\n"
+"async=on enables asynchronous teardown\n"
+   ,
 QEMU_ARCH_ALL)
-#endif
 SRST
-``-async-teardown``
-Enable asynchronous teardown. A new process called "cleanup/<QEMU_PID>"
-will be created at startup sharing the address space with the main qemu
-process, using clone. It will wait for the main qemu process to
-terminate completely, and then exit.
-This allows qemu to terminate very quickly even if the guest was
-huge, leaving the teardown of the address space to the cleanup
-process. Since the cleanup process shares the same cgroups as the
-main qemu process, accounting is performed correctly. This only
-works if the cleanup process is not forcefully killed with SIGKILL
-before the main qemu process has terminated completely.
+``-teardown``
+Set process teardown options.
+
+``async=on`` enables asynchronous teardown. A new process called
+"cleanup/<QEMU_PID>" will be created at startup sharing the address
+space with the main QEMU process, using clone. It will wait for the
+main QEMU process to terminate completely, and then exit. This allows
+QEMU to terminate very quickly even if the guest was huge, leaving the
+teardown of the address space to the cleanup process. Since the cleanup
+process shares the same cgroups as the main QEMU process, accounting is
+performed correctly. This only works if the cleanup process is not
+forcefully killed with SIGKILL before the main QEMU process has
+terminated completely.
 ERST
+#endif
 
 DEF("msg", HAS_ARG, QEMU_OPTION_msg,
 "-msg [timestamp[=on|off]][,guest-na

[PATCH v4 0/1] util/async-teardown: wire up query-command-line-options

2023-03-27 Thread Claudio Imbrenda

The recently introduced -async-teardown commandline option was not
wired up properly and did not show up in the output of the QMP command
query-command-line-options. This means that libvirt will have no way to
discover whether the feature is supported.

This patch fixes the issue by adding a new -teardown commandline option
with an async=on|off parameter, correctly wired up so that it appears
in the output of query-command-line-options.

v3->v4
* completely remove the useless -async-teardown option, since it was
  not wired up properly and it had no users [thomas]
* QEMU should be always uppercase in text and documentation [thomas]
* if the new -teardown option fails to parse, exit immediately instead
  of returning an error [thomas]

v2->v3
* add a new teardown option with an async parameter [Markus]
* reworded documentation of existing -async-teardown option so that it
  points to the new teardown option

v1->v2
* remove the unneeded .implied_opt_name initializer [Thomas]

Claudio Imbrenda (1):
  util/async-teardown: wire up query-command-line-options

 os-posix.c| 15 +--
 qemu-options.hx   | 33 +++--
 util/async-teardown.c | 21 +
 3 files changed, 53 insertions(+), 16 deletions(-)

-- 
2.39.2

[PATCH v4 1/1] util/async-teardown: wire up query-command-line-options

2023-03-27 Thread Claudio Imbrenda

The recently introduced -async-teardown commandline option was not
wired up properly and did not show up in the output of the QMP command
query-command-line-options. This means that libvirt had no way to
discover whether the feature was supported.

This patch fixes the issue by replacing the -async-teardown option with
a new -teardown option with a new async=on|off parameter.
The new option is correctly wired up so that it appears in the output
of query-command-line-options.

Reported-by: Boris Fiuczynski 
Fixes: c891c24b1a ("os-posix: asynchronous teardown for shutdown on Linux")
Signed-off-by: Claudio Imbrenda 
---
 os-posix.c| 15 +--
 qemu-options.hx   | 33 +++--
 util/async-teardown.c | 21 +
 3 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/os-posix.c b/os-posix.c
index 5adc69f560..c1ca7b1cb3 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -36,6 +36,8 @@
 #include "qemu/log.h"
 #include "sysemu/runstate.h"
 #include "qemu/cutils.h"
+#include "qemu/config-file.h"
+#include "qemu/option.h"
 
 #ifdef CONFIG_LINUX
 #include 
@@ -132,6 +134,8 @@ static bool os_parse_runas_uid_gid(const char *optarg)
  */
 int os_parse_cmd_args(int index, const char *optarg)
 {
+QemuOpts *opts;
+
 switch (index) {
 case QEMU_OPTION_runas:
 user_pwd = getpwnam(optarg);
@@ -152,8 +156,15 @@ int os_parse_cmd_args(int index, const char *optarg)
 daemonize = 1;
 break;
 #if defined(CONFIG_LINUX)
-case QEMU_OPTION_asyncteardown:
-init_async_teardown();
+case QEMU_OPTION_teardown:
+opts = qemu_opts_parse_noisily(qemu_find_opts("teardown"),
+   optarg, false);
+if (!opts) {
+exit(1);
+}
+if (qemu_opt_get_bool(opts, "async", false)) {
+init_async_teardown();
+}
 break;
 #endif
 default:
diff --git a/qemu-options.hx b/qemu-options.hx
index d42f60fb91..6a69b84f3c 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4763,23 +4763,28 @@ DEF("qtest", HAS_ARG, QEMU_OPTION_qtest, "", 
QEMU_ARCH_ALL)
 DEF("qtest-log", HAS_ARG, QEMU_OPTION_qtest_log, "", QEMU_ARCH_ALL)
 
 #ifdef __linux__
-DEF("async-teardown", 0, QEMU_OPTION_asyncteardown,
-"-async-teardown enable asynchronous teardown\n",
+DEF("teardown", HAS_ARG, QEMU_OPTION_teardown,
+"-teardown async[=on|off]\n"
+"process teardown options\n"
+"async=on enables asynchronous teardown\n"
+   ,
 QEMU_ARCH_ALL)
-#endif
 SRST
-``-async-teardown``
-Enable asynchronous teardown. A new process called "cleanup/<QEMU_PID>"
-will be created at startup sharing the address space with the main qemu
-process, using clone. It will wait for the main qemu process to
-terminate completely, and then exit.
-This allows qemu to terminate very quickly even if the guest was
-huge, leaving the teardown of the address space to the cleanup
-process. Since the cleanup process shares the same cgroups as the
-main qemu process, accounting is performed correctly. This only
-works if the cleanup process is not forcefully killed with SIGKILL
-before the main qemu process has terminated completely.
+``-teardown``
+Set process teardown options.
+
+``async=on`` enables asynchronous teardown. A new process called
+"cleanup/<QEMU_PID>" will be created at startup sharing the address
+space with the main QEMU process, using clone. It will wait for the
+main QEMU process to terminate completely, and then exit. This allows
+QEMU to terminate very quickly even if the guest was huge, leaving the
+teardown of the address space to the cleanup process. Since the cleanup
+process shares the same cgroups as the main QEMU process, accounting is
+performed correctly. This only works if the cleanup process is not
+forcefully killed with SIGKILL before the main QEMU process has
+terminated completely.
 ERST
+#endif
 
 DEF("msg", HAS_ARG, QEMU_OPTION_msg,
 "-msg [timestamp[=on|off]][,guest-name=[on|off]]\n"
diff --git a/util/async-teardown.c b/util/async-teardown.c
index 62cdeb0f20..4a5dbce958 100644
--- a/util/async-teardown.c
+++ b/util/async-teardown.c
@@ -12,6 +12,9 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/config-file.h"
+#include "qemu/option.h"
+#include "qemu/module.h"
 #include 
 #include 
 #include 
@@ -144,3 +147,21 @@ void init_async_teardown(void)
 clone(async_teardown_fn, new_stack_for_clone(), CLONE_VM, NULL);
 sigprocmask(SIG_SETMASK, &old_signals, NULL);
 }
+
+static QemuOptsList qemu_teardown_opts = {
+.name = "teardown",
+.head = QTAILQ_HEAD_INITIALIZER(qemu_teardow

Re: [PATCH v3 1/1] util/async-teardown: wire up query-command-line-options

2023-03-24 Thread Claudio Imbrenda

On Fri, 24 Mar 2023 18:56:06 +0100
Thomas Huth  wrote:

> On 24/03/2023 18.45, Claudio Imbrenda wrote:
> > The recently introduced -async-teardown commandline option was not
> > wired up properly and did not show up in the output of the QMP command
> > query-command-line-options. This means that libvirt will have no way to
> > discover whether the feature is supported.
> > 
> > This patch fixes the issue by correctly wiring up the commandline
> > option so that it appears in the output of query-command-line-options.
> > 
> > Reported-by: Boris Fiuczynski 
> > Fixes: c891c24b1a ("os-posix: asynchronous teardown for shutdown on Linux")
> > Signed-off-by: Claudio Imbrenda 
> > ---
> >   os-posix.c| 14 ++
> >   qemu-options.hx   | 35 ---
> >   util/async-teardown.c | 21 +
> >   3 files changed, 59 insertions(+), 11 deletions(-)
> > 
> > diff --git a/os-posix.c b/os-posix.c
> > index 5adc69f560..48acd7acf5 100644
> > --- a/os-posix.c
> > +++ b/os-posix.c
> > @@ -36,6 +36,8 @@
> >   #include "qemu/log.h"
> >   #include "sysemu/runstate.h"
> >   #include "qemu/cutils.h"
> > +#include "qemu/config-file.h"
> > +#include "qemu/option.h"
> >   
> >   #ifdef CONFIG_LINUX
> >   #include 
> > @@ -132,6 +134,8 @@ static bool os_parse_runas_uid_gid(const char *optarg)
> >*/
> >   int os_parse_cmd_args(int index, const char *optarg)
> >   {
> > +QemuOpts *opts;
> > +
> >   switch (index) {
> >   case QEMU_OPTION_runas:
> >   user_pwd = getpwnam(optarg);
> > @@ -155,6 +159,16 @@ int os_parse_cmd_args(int index, const char *optarg)
> >   case QEMU_OPTION_asyncteardown:
> >   init_async_teardown();
> >   break;
> > +case QEMU_OPTION_teardown:
> > +opts = qemu_opts_parse_noisily(qemu_find_opts("teardown"),
> > +   optarg, false);
> > +if (!opts) {
> > +return -1;
> > +}
> > +if (qemu_opt_get_bool(opts, "async", false)) {
> > +init_async_teardown();
> > +}
> > +break;
> >   #endif
> >   default:
> >   return -1;
> > diff --git a/qemu-options.hx b/qemu-options.hx
> > index d42f60fb91..8582980b12 100644
> > --- a/qemu-options.hx
> > +++ b/qemu-options.hx
> > @@ -4766,20 +4766,33 @@ DEF("qtest-log", HAS_ARG, QEMU_OPTION_qtest_log, 
> > "", QEMU_ARCH_ALL)
> >   DEF("async-teardown", 0, QEMU_OPTION_asyncteardown,
> >   "-async-teardown enable asynchronous teardown\n",
> >   QEMU_ARCH_ALL)
> > -#endif
> >   SRST
> >   ``-async-teardown``
> > -Enable asynchronous teardown. A new process called "cleanup/<QEMU_PID>"
> > -will be created at startup sharing the address space with the main qemu
> > -process, using clone. It will wait for the main qemu process to
> > -terminate completely, and then exit.
> > -This allows qemu to terminate very quickly even if the guest was
> > -huge, leaving the teardown of the address space to the cleanup
> > -process. Since the cleanup process shares the same cgroups as the
> > -main qemu process, accounting is performed correctly. This only
> > -works if the cleanup process is not forcefully killed with SIGKILL
> > -before the main qemu process has terminated completely.
> > +Equivalent to -teardown async=on  
> 
> We should avoid providing multiple ways of doing the same thing to the 
> users if there is no real benefit. So I'd vote for either removing the 
> "-async-teardown" option here directly (since it just has been introduced in 
> 7.2 and there are no known users out there yet), or at least deprecate it 
> (put an entry in docs/about/deprecated.rst), so we can remove it again in 

both are fine for me (although I have a slight preference for removing
it altogether)

> two releases.
> 
>   Thomas
>

[PATCH v3 1/1] util/async-teardown: wire up query-command-line-options

2023-03-24 Thread Claudio Imbrenda

The recently introduced -async-teardown commandline option was not
wired up properly and did not show up in the output of the QMP command
query-command-line-options. This means that libvirt will have no way to
discover whether the feature is supported.

This patch fixes the issue by correctly wiring up the commandline
option so that it appears in the output of query-command-line-options.

Reported-by: Boris Fiuczynski 
Fixes: c891c24b1a ("os-posix: asynchronous teardown for shutdown on Linux")
Signed-off-by: Claudio Imbrenda 
---
 os-posix.c| 14 ++
 qemu-options.hx   | 35 ---
 util/async-teardown.c | 21 +
 3 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/os-posix.c b/os-posix.c
index 5adc69f560..48acd7acf5 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -36,6 +36,8 @@
 #include "qemu/log.h"
 #include "sysemu/runstate.h"
 #include "qemu/cutils.h"
+#include "qemu/config-file.h"
+#include "qemu/option.h"
 
 #ifdef CONFIG_LINUX
 #include 
@@ -132,6 +134,8 @@ static bool os_parse_runas_uid_gid(const char *optarg)
  */
 int os_parse_cmd_args(int index, const char *optarg)
 {
+QemuOpts *opts;
+
 switch (index) {
 case QEMU_OPTION_runas:
 user_pwd = getpwnam(optarg);
@@ -155,6 +159,16 @@ int os_parse_cmd_args(int index, const char *optarg)
 case QEMU_OPTION_asyncteardown:
 init_async_teardown();
 break;
+case QEMU_OPTION_teardown:
+opts = qemu_opts_parse_noisily(qemu_find_opts("teardown"),
+   optarg, false);
+if (!opts) {
+return -1;
+}
+if (qemu_opt_get_bool(opts, "async", false)) {
+init_async_teardown();
+}
+break;
 #endif
 default:
 return -1;
diff --git a/qemu-options.hx b/qemu-options.hx
index d42f60fb91..8582980b12 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4766,20 +4766,33 @@ DEF("qtest-log", HAS_ARG, QEMU_OPTION_qtest_log, "", 
QEMU_ARCH_ALL)
 DEF("async-teardown", 0, QEMU_OPTION_asyncteardown,
 "-async-teardown enable asynchronous teardown\n",
 QEMU_ARCH_ALL)
-#endif
 SRST
 ``-async-teardown``
-Enable asynchronous teardown. A new process called "cleanup/<QEMU_PID>"
-will be created at startup sharing the address space with the main qemu
-process, using clone. It will wait for the main qemu process to
-terminate completely, and then exit.
-This allows qemu to terminate very quickly even if the guest was
-huge, leaving the teardown of the address space to the cleanup
-process. Since the cleanup process shares the same cgroups as the
-main qemu process, accounting is performed correctly. This only
-works if the cleanup process is not forcefully killed with SIGKILL
-before the main qemu process has terminated completely.
+Equivalent to -teardown async=on
+ERST
+
+DEF("teardown", HAS_ARG, QEMU_OPTION_teardown,
+"-teardown async[=on|off]\n"
+"process teardown options\n"
+"async=on enables asynchronous teardown\n"
+   ,
+QEMU_ARCH_ALL)
+SRST
+``-teardown``
+Set process teardown options.
+
+``async=on`` enables asynchronous teardown.  A new process called
+"cleanup/<QEMU_PID>" will be created at startup sharing the address
+space with the main qemu process, using clone.  It will wait for the
+main qemu process to terminate completely, and then exit.  This allows
+qemu to terminate very quickly even if the guest was huge, leaving the
+teardown of the address space to the cleanup process.  Since the cleanup
+process shares the same cgroups as the main qemu process, accounting is
+performed correctly.  This only works if the cleanup process is not
+forcefully killed with SIGKILL before the main qemu process has
+terminated completely.
 ERST
+#endif
 
 DEF("msg", HAS_ARG, QEMU_OPTION_msg,
 "-msg [timestamp[=on|off]][,guest-name=[on|off]]\n"
diff --git a/util/async-teardown.c b/util/async-teardown.c
index 62cdeb0f20..4a5dbce958 100644
--- a/util/async-teardown.c
+++ b/util/async-teardown.c
@@ -12,6 +12,9 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/config-file.h"
+#include "qemu/option.h"
+#include "qemu/module.h"
 #include 
 #include 
 #include 
@@ -144,3 +147,21 @@ void init_async_teardown(void)
 clone(async_teardown_fn, new_stack_for_clone(), CLONE_VM, NULL);
 sigprocmask(SIG_SETMASK, &old_signals, NULL);
 }
+
+static QemuOptsList qemu_teardown_opts = {
+.name = "teardown",
+.head = QTAILQ_HEAD_INITIALIZER(qemu_teardown_opts.head),
+.desc = {
+{
+.name = "async",
+.type = QEMU_OPT_BOOL,
+},
+{ /* end of list */ }
+},
+};
+
+static void register_teardown(void)
+{
+qemu_add_opts(&qemu_teardown_opts);
+}
+opts_init(register_teardown);
-- 
2.39.2

[PATCH v3 0/1] util/async-teardown: wire up query-command-line-options

2023-03-24 Thread Claudio Imbrenda

The recently introduced -async-teardown commandline option was not
wired up properly and did not show up in the output of the QMP command
query-command-line-options. This means that libvirt will have no way to
discover whether the feature is supported.

This patch fixes the issue by adding a new -teardown commandline option
with an async=on|off parameter, correctly wired up so that it appears
in the output of query-command-line-options.

v2->v3
* add a new teardown option with an async parameter [Markus]
* reworded documentation of existing -async-teardown option so that it
  points to the new teardown option

v1->v2
* remove the unneeded .implied_opt_name initializer [Thomas]

Claudio Imbrenda (1):
  util/async-teardown: wire up query-command-line-options

 os-posix.c| 14 ++
 qemu-options.hx   | 35 ---
 util/async-teardown.c | 21 +
 3 files changed, 59 insertions(+), 11 deletions(-)

-- 
2.39.2

Re: [PATCH v2 1/1] util/async-teardown: wire up query-command-line-options

2023-03-20 Thread Claudio Imbrenda

On Mon, 20 Mar 2023 17:05:07 +0100
Markus Armbruster  wrote:

> Thomas Huth  writes:
> 
> > On 20/03/2023 16.31, Markus Armbruster wrote:  
> >> Claudio Imbrenda  writes:
> >>   
> >>> The recently introduced -async-teardown commandline option was not
> >>> wired up properly and did not show up in the output of the QMP command
> >>> query-command-line-options. This means that libvirt will have no way to
> >>> discover whether the feature is supported.  
> >> 
> >> There was nothing improper in its wiring.  The issue is that
> >> query-command-line-options is junk.  See my recent post
> >> 
> >>  Subject: query-command-line-options (was: [PATCH 1/7] qemu: 
> >> capabilities: Introduce QEMU_CAPS_MACHINE_ACPI)
> >>  Date: Tue, 07 Mar 2023 10:40:23 +0100
> >>  Message-ID: <87jzzsc320.fsf...@pond.sub.org>
> >>   
> >>> This patch fixes the issue by correctly wiring up the commandline
> >>> option so that it appears in the output of query-command-line-options.
> >>>
> >>> Reported-by: Boris Fiuczynski 
> >>> Fixes: c891c24b1a ("os-posix: asynchronous teardown for shutdown on 
> >>> Linux")
> >>> Signed-off-by: Claudio Imbrenda 
> >>> ---
> >>>   util/async-teardown.c | 17 +
> >>>   1 file changed, 17 insertions(+)
> >>>
> >>> diff --git a/util/async-teardown.c b/util/async-teardown.c
> >>> index 62cdeb0f20..c9b9a3cdb2 100644
> >>> --- a/util/async-teardown.c
> >>> +++ b/util/async-teardown.c
> >>> @@ -12,6 +12,9 @@
> >>>*/
> >>>   
> >>>   #include "qemu/osdep.h"
> >>> +#include "qemu/config-file.h"
> >>> +#include "qemu/option.h"
> >>> +#include "qemu/module.h"
> >>>   #include 
> >>>   #include 
> >>>   #include 
> >>> @@ -144,3 +147,17 @@ void init_async_teardown(void)
> >>>   clone(async_teardown_fn, new_stack_for_clone(), CLONE_VM, NULL);
> >>>   sigprocmask(SIG_SETMASK, _signals, NULL);
> >>>   }
> >>> +
> >>> +static QemuOptsList qemu_async_teardown_opts = {
> >>> +.name = "async-teardown",
> >>> +.head = QTAILQ_HEAD_INITIALIZER(qemu_async_teardown_opts.head),
> >>> +.desc = {
> >>> +{ /* end of list */ }
> >>> +},
> >>> +};
> >>> +
> >>> +static void register_async_teardown(void)
> >>> +{
> >>> +qemu_add_opts(_async_teardown_opts);
> >>> +}
> >>> +opts_init(register_async_teardown);  
> >> 
> >> Now it *is* improperly wired up :)
> >> 
> >> You're defining new QemuOpts config group "async-teardown" with
> >> arbitrary option parameters, but don't actually use it for parsing or
> >> recording the option.  I figure because you can't: there is no option
> >> argument to parse and record, which is what QemuOpts is designed to do.
> >> 
> >> If you need the feature to be visible in query-command-line-options, you
> >> should make it an option parameter (a KEY, not a GROUP), preferably of
> >> an existing group / option.  
> >
> > Would it make sense to add it e.g. to "-action" instead, i.e. something 
> > like 
> > "-action teardown=async" ?  
> 
> I believe the new parameter "teardown" would be visible in
> query-command-line-options.
> 
> How well does it fit -action?

I guess it can be shoehorned in. generally action is about stuff that
happens in/to the guest, while in this case it's about how qemu will
perform the teardown of its address space once it terminates.

the important parts are: this is an OS-specific option (Linux), and it
needs to be parsed and enabled before sandboxing (otherwise clone(2)
might not work)

[PATCH v2 0/1] util/async-teardown: wire up query-command-line-options

2023-03-20 Thread Claudio Imbrenda

The recently introduced -async-teardown commandline option was not
wired up properly and did not show up in the output of the QMP command
query-command-line-options. This means that libvirt will have no way to
discover whether the feature is supported.

This patch fixes the issue by correctly wiring up the commandline
option so that it appears in the output of query-command-line-options.

v1->v2
* remove the unneeded .implied_opt_name initializer [Thomas]

Claudio Imbrenda (1):
  util/async-teardown: wire up query-command-line-options

 util/async-teardown.c | 17 +
 1 file changed, 17 insertions(+)

-- 
2.39.2

[PATCH v2 1/1] util/async-teardown: wire up query-command-line-options

2023-03-20 Thread Claudio Imbrenda

The recently introduced -async-teardown commandline option was not
wired up properly and did not show up in the output of the QMP command
query-command-line-options. This means that libvirt will have no way to
discover whether the feature is supported.

This patch fixes the issue by correctly wiring up the commandline
option so that it appears in the output of query-command-line-options.

Reported-by: Boris Fiuczynski 
Fixes: c891c24b1a ("os-posix: asynchronous teardown for shutdown on Linux")
Signed-off-by: Claudio Imbrenda 
---
 util/async-teardown.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/util/async-teardown.c b/util/async-teardown.c
index 62cdeb0f20..c9b9a3cdb2 100644
--- a/util/async-teardown.c
+++ b/util/async-teardown.c
@@ -12,6 +12,9 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/config-file.h"
+#include "qemu/option.h"
+#include "qemu/module.h"
 #include 
 #include 
 #include 
@@ -144,3 +147,17 @@ void init_async_teardown(void)
 clone(async_teardown_fn, new_stack_for_clone(), CLONE_VM, NULL);
 sigprocmask(SIG_SETMASK, &old_signals, NULL);
 }
+
+static QemuOptsList qemu_async_teardown_opts = {
+.name = "async-teardown",
+.head = QTAILQ_HEAD_INITIALIZER(qemu_async_teardown_opts.head),
+.desc = {
+{ /* end of list */ }
+},
+};
+
+static void register_async_teardown(void)
+{
+qemu_add_opts(&qemu_async_teardown_opts);
+}
+opts_init(register_async_teardown);
-- 
2.39.2

Re: [PATCH v1 1/1] util/async-teardown: wire up query-command-line-options

2023-03-20 Thread Claudio Imbrenda

On Mon, 20 Mar 2023 09:56:05 +0100
Thomas Huth  wrote:

> On 20/03/2023 08.48, Claudio Imbrenda wrote:
> > The recently introduced -async-teardown commandline option was not
> > wired up properly and did not show up in the output of the QMP command
> > query-command-line-options. This means that libvirt will have no way to
> > discover whether the feature is supported.
> > 
> > This patch fixes the issue by correctly wiring up the commandline
> > option so that it appears in the output of query-command-line-options.
> > 
> > Reported-by: Boris Fiuczynski 
> > Fixes: c891c24b1a ("os-posix: asynchronous teardown for shutdown on Linux")
> > Signed-off-by: Claudio Imbrenda 
> > ---
> >   util/async-teardown.c | 18 ++
> >   1 file changed, 18 insertions(+)
> > 
> > diff --git a/util/async-teardown.c b/util/async-teardown.c
> > index 62cdeb0f20..9a2e7bc146 100644
> > --- a/util/async-teardown.c
> > +++ b/util/async-teardown.c
> > @@ -12,6 +12,9 @@
> >*/
> >   
> >   #include "qemu/osdep.h"
> > +#include "qemu/config-file.h"
> > +#include "qemu/option.h"
> > +#include "qemu/module.h"
> >   #include 
> >   #include 
> >   #include 
> > @@ -144,3 +147,18 @@ void init_async_teardown(void)
> >   clone(async_teardown_fn, new_stack_for_clone(), CLONE_VM, NULL);
> >   sigprocmask(SIG_SETMASK, _signals, NULL);
> >   }
> > +
> > +static QemuOptsList qemu_async_teardown_opts = {
> > +.name = "async-teardown",
> > +.implied_opt_name = "enable",  
> 
> Are you sure about the "implied_opt_name" field? If I got that right, it's 

yeah that should not be there, I'll fix and send a v2

> used for options where you can omit the part before the "=" in the first 
> paramter, e.g.
> 
>   -netdev user
> 
> is the same as:
> 
>   -netdev type=user
> 
> ... but as far as I can see, there is no way to use
> 
>   -async-teardown enable=off
> 
> at the command line?
> 
>   Thomas
> 
> 
> > +.head = QTAILQ_HEAD_INITIALIZER(qemu_async_teardown_opts.head),
> > +.desc = {
> > +{ /* end of list */ }
> > +},
> > +};
> > +
> > +static void register_async_teardown(void)
> > +{
> > +qemu_add_opts(_async_teardown_opts);
> > +}
> > +opts_init(register_async_teardown);  
>

[PATCH v1 1/1] util/async-teardown: wire up query-command-line-options

2023-03-20 Thread Claudio Imbrenda

The recently introduced -async-teardown commandline option was not
wired up properly and did not show up in the output of the QMP command
query-command-line-options. This means that libvirt will have no way to
discover whether the feature is supported.

This patch fixes the issue by correctly wiring up the commandline
option so that it appears in the output of query-command-line-options.

Reported-by: Boris Fiuczynski 
Fixes: c891c24b1a ("os-posix: asynchronous teardown for shutdown on Linux")
Signed-off-by: Claudio Imbrenda 
---
 util/async-teardown.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/util/async-teardown.c b/util/async-teardown.c
index 62cdeb0f20..9a2e7bc146 100644
--- a/util/async-teardown.c
+++ b/util/async-teardown.c
@@ -12,6 +12,9 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/config-file.h"
+#include "qemu/option.h"
+#include "qemu/module.h"
 #include 
 #include 
 #include 
@@ -144,3 +147,18 @@ void init_async_teardown(void)
 clone(async_teardown_fn, new_stack_for_clone(), CLONE_VM, NULL);
 sigprocmask(SIG_SETMASK, &old_signals, NULL);
 }
+
+static QemuOptsList qemu_async_teardown_opts = {
+.name = "async-teardown",
+.implied_opt_name = "enable",
+.head = QTAILQ_HEAD_INITIALIZER(qemu_async_teardown_opts.head),
+.desc = {
+{ /* end of list */ }
+},
+};
+
+static void register_async_teardown(void)
+{
+qemu_add_opts(&qemu_async_teardown_opts);
+}
+opts_init(register_async_teardown);
-- 
2.39.2

[PATCH v3 0/2] s390x/pv: Add support for asynchronous teardown for reboot

2023-02-14 Thread Claudio Imbrenda

The first patch is just a minimal header update to compile the second
patch; it can be safely discarded once the Linux headers are updated to
6.2.

The second patch adds support for asynchronous teardown of protected
guests when rebooting. The existing guest is prepared for asynchronous
teardown, the rebooted guest will be able to continue immediately, while a
background thread actually performs the necessary cleanup.

v2->v3:
* improve description of header updates
* allocate the QemuThread on the stack to avoid leak, and explain why

v1->v2:
* remove useless snprintf and pass the name of the thread directly
* make the name of the thread more understandable

Claudio Imbrenda (2):
  Linux header update
  s390x/pv: Add support for asynchronous teardown for reboot

 hw/s390x/pv.c  | 28 
 hw/s390x/s390-virtio-ccw.c |  5 -
 include/hw/s390x/pv.h  |  2 ++
 linux-headers/linux/kvm.h  |  3 +++
 4 files changed, 37 insertions(+), 1 deletion(-)

-- 
2.39.1

[PATCH v3 1/2] Linux header update

2023-02-14 Thread Claudio Imbrenda

Update kvm.h Linux header with these commits:

8c516b25d6e9 ("KVM: s390: pv: add KVM_CAP_S390_PROTECTED_ASYNC_DISABLE")
fb491d5500a7 ("KVM: s390: pv: asynchronous destroy for reboot")

Signed-off-by: Claudio Imbrenda 
---
 linux-headers/linux/kvm.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index ebdafa576d..122b273433 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -1175,6 +1175,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220
 #define KVM_CAP_S390_ZPCI_OP 221
 #define KVM_CAP_S390_CPU_TOPOLOGY 222
+#define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1737,6 +1738,8 @@ enum pv_cmd_id {
KVM_PV_UNSHARE_ALL,
KVM_PV_INFO,
KVM_PV_DUMP,
+   KVM_PV_ASYNC_CLEANUP_PREPARE,
+   KVM_PV_ASYNC_CLEANUP_PERFORM,
 };
 
 struct kvm_pv_cmd {
-- 
2.39.1

[PATCH v3 2/2] s390x/pv: Add support for asynchronous teardown for reboot

2023-02-14 Thread Claudio Imbrenda

This patch adds support for the asynchronous teardown for reboot for
protected VMs.

When attempting to tear down a protected VM, try to use the new
asynchronous interface first. If that fails, fall back to the classic
synchronous one.

The asynchronous interface involves invoking the new
KVM_PV_ASYNC_CLEANUP_PREPARE command for the KVM_S390_PV_COMMAND ioctl.

This will prepare the current protected VM for asynchronous teardown.
Once the protected VM is prepared for teardown, execution can continue
immediately.

Once the protected VM has been prepared, a new thread is started to
actually perform the teardown. The new thread uses the new
KVM_PV_ASYNC_CLEANUP_PERFORM command for the KVM_S390_PV_COMMAND ioctl. The
previously prepared protected VM is torn down in the new thread.

Once KVM_PV_ASYNC_CLEANUP_PERFORM is invoked, it is possible to use
KVM_PV_ASYNC_CLEANUP_PREPARE again. If a protected VM has already been
prepared and its cleanup has not started, it will not be possible to
prepare a new VM. In that case the classic synchronous teardown has to
be performed.

The synchronous teardown will now also clean up any prepared VMs whose
asynchronous teardown has not been initiated yet.

This considerably speeds up the reboot of a protected VM; for large VMs
especially, it could take a long time to perform a reboot with the
traditional synchronous teardown, while with this patch it is almost
immediate.

Signed-off-by: Claudio Imbrenda 
---
 hw/s390x/pv.c  | 28 
 hw/s390x/s390-virtio-ccw.c |  5 -
 include/hw/s390x/pv.h  |  2 ++
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c
index 8a1c71436b..49ea38236c 100644
--- a/hw/s390x/pv.c
+++ b/hw/s390x/pv.c
@@ -16,6 +16,7 @@
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "sysemu/kvm.h"
+#include "sysemu/cpus.h"
 #include "qom/object_interfaces.h"
 #include "exec/confidential-guest-support.h"
 #include "hw/s390x/ipl.h"
@@ -108,6 +109,33 @@ void s390_pv_vm_disable(void)
  s390_pv_cmd_exit(KVM_PV_DISABLE, NULL);
 }
 
+static void *s390_pv_do_unprot_async_fn(void *p)
+{
+ s390_pv_cmd_exit(KVM_PV_ASYNC_CLEANUP_PERFORM, NULL);
+ return NULL;
+}
+
+bool s390_pv_vm_try_disable_async(void)
+{
+/*
+ * t is only needed to create the thread; once qemu_thread_create
+ * returns, it can safely be discarded.
+ */
+QemuThread t;
+
+if (!kvm_check_extension(kvm_state, KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) 
{
+return false;
+}
+if (s390_pv_cmd(KVM_PV_ASYNC_CLEANUP_PREPARE, NULL) != 0) {
+return false;
+}
+
+qemu_thread_create(&t, "async_cleanup", s390_pv_do_unprot_async_fn, NULL,
+   QEMU_THREAD_DETACHED);
+
+return true;
+}
+
 int s390_pv_set_sec_parms(uint64_t origin, uint64_t length)
 {
 struct kvm_s390_pv_sec_parm args = {
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index f22f61b8b6..503f212a31 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -41,6 +41,7 @@
 #include "hw/qdev-properties.h"
 #include "hw/s390x/tod.h"
 #include "sysemu/sysemu.h"
+#include "sysemu/cpus.h"
 #include "hw/s390x/pv.h"
 #include "migration/blocker.h"
 #include "qapi/visitor.h"
@@ -329,7 +330,9 @@ static inline void s390_do_cpu_ipl(CPUState *cs, 
run_on_cpu_data arg)
 
 static void s390_machine_unprotect(S390CcwMachineState *ms)
 {
-s390_pv_vm_disable();
+if (!s390_pv_vm_try_disable_async()) {
+s390_pv_vm_disable();
+}
 ms->pv = false;
 migrate_del_blocker(pv_mig_blocker);
 error_free_or_abort(&pv_mig_blocker);
diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h
index 9360aa1091..966306a9db 100644
--- a/include/hw/s390x/pv.h
+++ b/include/hw/s390x/pv.h
@@ -41,6 +41,7 @@ static inline bool s390_is_pv(void)
 int s390_pv_query_info(void);
 int s390_pv_vm_enable(void);
 void s390_pv_vm_disable(void);
+bool s390_pv_vm_try_disable_async(void);
 int s390_pv_set_sec_parms(uint64_t origin, uint64_t length);
 int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak);
 void s390_pv_prep_reset(void);
@@ -60,6 +61,7 @@ static inline bool s390_is_pv(void) { return false; }
 static inline int s390_pv_query_info(void) { return 0; }
 static inline int s390_pv_vm_enable(void) { return 0; }
 static inline void s390_pv_vm_disable(void) {}
+static inline bool s390_pv_vm_try_disable_async(void) { return false; }
 static inline int s390_pv_set_sec_parms(uint64_t origin, uint64_t length) { 
return 0; }
 static inline int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak) 
{ return 0; }
 static inline void s390_pv_prep_reset(void) {}
-- 
2.39.1

Re: [PATCH v2 2/2] s390x/pv: Add support for asynchronous teardown for reboot

2023-01-17 Thread Claudio Imbrenda

On Tue, 17 Jan 2023 09:53:46 +0100
Thomas Huth  wrote:

[...]

> > +static void *s390_pv_do_unprot_async_fn(void *p)
> > +{
> > + s390_pv_cmd_exit(KVM_PV_ASYNC_CLEANUP_PERFORM, NULL);
> > + return NULL;
> > +}
> > +
> > +bool s390_pv_vm_try_disable_async(void)
> > +{
> > +QemuThread *t;
> > +
> > +if (!kvm_check_extension(kvm_state, 
> > KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) {
> > +return false;
> > +}
> > +if (s390_pv_cmd(KVM_PV_ASYNC_CLEANUP_PREPARE, NULL) != 0) {
> > +return false;
> > +}
> > +
> > +t = g_malloc0(sizeof(QemuThread));  
> 
> Sorry for not noticing it in v1 already ... but isn't this leaking memory? 
> Who's supposed to free "t" again?

I assumed that QEMU_THREAD_DETACHED took care of that; it seems like I
was mistaken (oops).

I'll find a way to fix this

> 
>   Thomas
> 
> 
> > +qemu_thread_create(t, "async_cleanup", s390_pv_do_unprot_async_fn, 
> > NULL,
> > +   QEMU_THREAD_DETACHED);
> > +
> > +return true;
> > +}  
>

[PATCH v2 0/2] s390x/pv: Add support for asynchronous teardown for reboot

2023-01-05 Thread Claudio Imbrenda

The first patch is just a minimal header update to compile the second
patch; it can be safely discarded once the Linux headers are updated to
6.2.

The second patch adds support for asynchronous teardown of protected
guests when rebooting. First the existing guest is prepared for
asynchronous teardown, the rebooted guest will be able to continue
immediately, while a background thread actually performs the necessary
cleanup.

v1->v2:
* remove useless snprintf and pass the name of the thread directly
* make the name of the thread more understandable

Claudio Imbrenda (2):
  Linux header update
  s390x/pv: Add support for asynchronous teardown for reboot

 hw/s390x/pv.c  | 25 +
 hw/s390x/s390-virtio-ccw.c |  5 -
 include/hw/s390x/pv.h  |  2 ++
 linux-headers/linux/kvm.h  |  3 +++
 4 files changed, 34 insertions(+), 1 deletion(-)

-- 
2.39.0

[PATCH v2 2/2] s390x/pv: Add support for asynchronous teardown for reboot

2023-01-05 Thread Claudio Imbrenda

This patch adds support for the asynchronous teardown for reboot for
protected VMs.

When attempting to tear down a protected VM, try to use the new
asynchronous interface first. If that fails, fall back to the classic
synchronous one.

The asynchronous interface involves invoking the new
KVM_PV_ASYNC_DISABLE_PREPARE command for the KVM_S390_PV_COMMAND ioctl.

This will prepare the current protected VM for asynchronous teardown.
Once the protected VM is prepared for teardown, execution can continue
immediately.

Once the protected VM has been prepared, a new thread is started to
actually perform the teardown. The new thread uses the new
KVM_PV_ASYNC_DISABLE command for the KVM_S390_PV_COMMAND ioctl. The
previously prepared protected VM is torn down in the new thread.

Once KVM_PV_ASYNC_DISABLE is invoked, it is possible to use
KVM_PV_ASYNC_DISABLE_PREPARE again. If a protected VM has already been
prepared and its cleanup has not started, it will not be possible to
prepare a new VM. In that case the classic synchronous teardown has to
be performed.

The synchronous teardown will now also clean up any prepared VMs whose
asynchronous teardown has not been initiated yet.

This considerably speeds up the reboot of a protected VM; for large VMs
especially, it could take a long time to perform a reboot with the
traditional synchronous teardown, while with this patch it is almost
immediate.

Signed-off-by: Claudio Imbrenda 
---
 hw/s390x/pv.c  | 25 +
 hw/s390x/s390-virtio-ccw.c |  5 -
 include/hw/s390x/pv.h  |  2 ++
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c
index 8dfe92d8df..3d1c529eb5 100644
--- a/hw/s390x/pv.c
+++ b/hw/s390x/pv.c
@@ -16,6 +16,7 @@
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "sysemu/kvm.h"
+#include "sysemu/cpus.h"
 #include "qom/object_interfaces.h"
 #include "exec/confidential-guest-support.h"
 #include "hw/s390x/ipl.h"
@@ -107,6 +108,30 @@ void s390_pv_vm_disable(void)
  s390_pv_cmd_exit(KVM_PV_DISABLE, NULL);
 }
 
+static void *s390_pv_do_unprot_async_fn(void *p)
+{
+ s390_pv_cmd_exit(KVM_PV_ASYNC_CLEANUP_PERFORM, NULL);
+ return NULL;
+}
+
+bool s390_pv_vm_try_disable_async(void)
+{
+QemuThread *t;
+
+if (!kvm_check_extension(kvm_state, KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) 
{
+return false;
+}
+if (s390_pv_cmd(KVM_PV_ASYNC_CLEANUP_PREPARE, NULL) != 0) {
+return false;
+}
+
+t = g_malloc0(sizeof(QemuThread));
+qemu_thread_create(t, "async_cleanup", s390_pv_do_unprot_async_fn, NULL,
+   QEMU_THREAD_DETACHED);
+
+return true;
+}
+
 int s390_pv_set_sec_parms(uint64_t origin, uint64_t length)
 {
 struct kvm_s390_pv_sec_parm args = {
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index f22f61b8b6..503f212a31 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -41,6 +41,7 @@
 #include "hw/qdev-properties.h"
 #include "hw/s390x/tod.h"
 #include "sysemu/sysemu.h"
+#include "sysemu/cpus.h"
 #include "hw/s390x/pv.h"
 #include "migration/blocker.h"
 #include "qapi/visitor.h"
@@ -329,7 +330,9 @@ static inline void s390_do_cpu_ipl(CPUState *cs, 
run_on_cpu_data arg)
 
 static void s390_machine_unprotect(S390CcwMachineState *ms)
 {
-s390_pv_vm_disable();
+if (!s390_pv_vm_try_disable_async()) {
+s390_pv_vm_disable();
+}
 ms->pv = false;
 migrate_del_blocker(pv_mig_blocker);
 error_free_or_abort(&pv_mig_blocker);
diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h
index 9360aa1091..966306a9db 100644
--- a/include/hw/s390x/pv.h
+++ b/include/hw/s390x/pv.h
@@ -41,6 +41,7 @@ static inline bool s390_is_pv(void)
 int s390_pv_query_info(void);
 int s390_pv_vm_enable(void);
 void s390_pv_vm_disable(void);
+bool s390_pv_vm_try_disable_async(void);
 int s390_pv_set_sec_parms(uint64_t origin, uint64_t length);
 int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak);
 void s390_pv_prep_reset(void);
@@ -60,6 +61,7 @@ static inline bool s390_is_pv(void) { return false; }
 static inline int s390_pv_query_info(void) { return 0; }
 static inline int s390_pv_vm_enable(void) { return 0; }
 static inline void s390_pv_vm_disable(void) {}
+static inline bool s390_pv_vm_try_disable_async(void) { return false; }
 static inline int s390_pv_set_sec_parms(uint64_t origin, uint64_t length) { 
return 0; }
 static inline int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak) 
{ return 0; }
 static inline void s390_pv_prep_reset(void) {}
-- 
2.39.0

[PATCH v2 1/2] Linux header update

2023-01-05 Thread Claudio Imbrenda

Signed-off-by: Claudio Imbrenda 
---
 linux-headers/linux/kvm.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index ebdafa576d..122b273433 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -1175,6 +1175,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220
 #define KVM_CAP_S390_ZPCI_OP 221
 #define KVM_CAP_S390_CPU_TOPOLOGY 222
+#define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1737,6 +1738,8 @@ enum pv_cmd_id {
KVM_PV_UNSHARE_ALL,
KVM_PV_INFO,
KVM_PV_DUMP,
+   KVM_PV_ASYNC_CLEANUP_PREPARE,
+   KVM_PV_ASYNC_CLEANUP_PERFORM,
 };
 
 struct kvm_pv_cmd {
-- 
2.39.0

Re: [PATCH 2/5] s390x/pv: Implement CGS check handler

2023-01-05 Thread Claudio Imbrenda

On Thu, 5 Jan 2023 12:42:54 +0100
Thomas Huth  wrote:

> On 04/01/2023 12.51, Cédric Le Goater wrote:
> > From: Cédric Le Goater 
> > 
> > When a protected VM is started with the maximum number of CPUs (248),
> > the service call providing information on the CPUs requires more
> > buffer space than allocated and QEMU disgracefully aborts :
> > 
> >  LOADPARM=[]
> >  Using virtio-blk.
> >  Using SCSI scheme.
> >  
> > ...
> >  qemu-system-s390x: KVM_S390_MEM_OP failed: Argument list too long
> > 
> > Implement a test for this limitation in the ConfidentialGuestSupportClass
> > check handler and provide some valid information to the user before the
> > machine starts.
> > 
> > Signed-off-by: Cédric Le Goater 
> > ---
> >   hw/s390x/pv.c | 23 +++
> >   1 file changed, 23 insertions(+)
> > 
> > diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c
> > index 8dfe92d8df..3a7ec70634 100644
> > --- a/hw/s390x/pv.c
> > +++ b/hw/s390x/pv.c
> > @@ -266,6 +266,26 @@ int s390_pv_kvm_init(ConfidentialGuestSupport *cgs, 
> > Error **errp)
> >   return 0;
> >   }
> >   
> > +static bool s390_pv_check_cpus(Error **errp)
> > +{
> > +MachineState *ms = MACHINE(qdev_get_machine());
> > +MachineClass *mc = MACHINE_GET_CLASS(ms);
> > +uint32_t pv_max_cpus = mc->max_cpus - 1;  
> 
> Not sure whether "mc->max_cpus - 1" is the right approach here. I think it 
> would be better to calculate the amount of CPUs that we can support.
> 
> So AFAIK the problem is that SCLP information that is gathered during 
> read_SCP_info() in hw/s390x/sclp.c. If protected virtualization is enabled, 
> everything has to fit in one page (i.e. 4096 bytes) there.
> 
> So we have space for
> 
>   (TARGET_PAGE_SIZE - offset_cpu) / sizeof(CPUEntry)
> 
> CPUs.
> 
> With S390_FEAT_EXTENDED_LENGTH_SCCB enabled, offset_cpu is 144 (see struct 
> ReadInfo in sclp.h), otherwise it is 128.
> 
> That means, with S390_FEAT_EXTENDED_LENGTH_SCCB we can have a maximum of:
> 
>   (4096 - 144) / 16 = 247 CPUs
> 
> which is what you were trying to check with the mc->max_cpus - 1 here.
> 
> But with "-cpu els=off", it sounds like we could fit all 248 also with 
> protected VMs? Could you please give it a try?
> 
> Anyway, instead of using "pv_max_cpus = mc->max_cpus - 1" I'd suggest to use 
> something like this instead:
> 
>   int offset_cpu = s390_has_feat(S390_FEAT_EXTENDED_LENGTH_SCCB) ?
>   offsetof(ReadInfo, entries) :
>   SCLP_READ_SCP_INFO_FIXED_CPU_OFFSET;
>   pv_max_cpus = (TARGET_PAGE_SIZE - offset_cpu) /sizeof(CPUEntry);

I agree with Thomas here

> 
>Thomas
> 
>

Re: [PATCH 4/5] s390x/pv: Introduce a s390_pv_check() helper for runtime

2023-01-05 Thread Claudio Imbrenda

On Wed,  4 Jan 2023 12:51:10 +0100
Cédric Le Goater  wrote:

> From: Cédric Le Goater 
> 
> If a secure kernel is started in a non-protected VM, the OS will hang
> during boot without giving a proper error message to the user.
> 
> Perform the checks on Confidential Guest support at runtime with an
> helper called from the service call switching the guest to protected
> mode.
> 
> Signed-off-by: Cédric Le Goater 
> ---
>  include/hw/s390x/pv.h |  2 ++
>  hw/s390x/pv.c | 14 ++
>  target/s390x/diag.c   |  7 +++
>  3 files changed, 23 insertions(+)
> 
> diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h
> index 9360aa1091..ca7dac2e20 100644
> --- a/include/hw/s390x/pv.h
> +++ b/include/hw/s390x/pv.h
> @@ -55,6 +55,7 @@ int kvm_s390_dump_init(void);
>  int kvm_s390_dump_cpu(S390CPU *cpu, void *buff);
>  int kvm_s390_dump_mem_state(uint64_t addr, size_t len, void *dest);
>  int kvm_s390_dump_completion_data(void *buff);
> +bool s390_pv_check(Error **errp);
>  #else /* CONFIG_KVM */
>  static inline bool s390_is_pv(void) { return false; }
>  static inline int s390_pv_query_info(void) { return 0; }
> @@ -75,6 +76,7 @@ static inline int kvm_s390_dump_cpu(S390CPU *cpu, void 
> *buff) { return 0; }
>  static inline int kvm_s390_dump_mem_state(uint64_t addr, size_t len,
>void *dest) { return 0; }
>  static inline int kvm_s390_dump_completion_data(void *buff) { return 0; }
> +static inline bool s390_pv_check(Error **errp) { return false; }
>  #endif /* CONFIG_KVM */
>  
>  int s390_pv_kvm_init(ConfidentialGuestSupport *cgs, Error **errp);
> diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c
> index 8d0d3f4adc..96c0728ec9 100644
> --- a/hw/s390x/pv.c
> +++ b/hw/s390x/pv.c
> @@ -307,6 +307,20 @@ static bool s390_pv_guest_check(const Object *obj, Error 
> **errp)
>  return s390_pv_check_cpus(errp) && s390_pv_check_host(errp);
>  }
>  
> +bool s390_pv_check(Error **errp)
> +{
> +MachineState *ms = MACHINE(qdev_get_machine());
> +Object *obj = OBJECT(ms->cgs);
> +
> +if (!obj) {
> +error_setg(errp, "Protected VM started without a Confidential"
> +   " Guest support interface");
> +return false;
> +}
> +
> +return s390_pv_guest_check(obj, errp);
> +}
> +
>  OBJECT_DEFINE_TYPE_WITH_INTERFACES(S390PVGuest,
> s390_pv_guest,
> S390_PV_GUEST,
> diff --git a/target/s390x/diag.c b/target/s390x/diag.c
> index 76b01dcd68..9b16e25930 100644
> --- a/target/s390x/diag.c
> +++ b/target/s390x/diag.c
> @@ -79,6 +79,7 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, 
> uint64_t r3, uintptr_t ra)
>  uint64_t addr =  env->regs[r1];
>  uint64_t subcode = env->regs[r3];
>  IplParameterBlock *iplb;
> +Error *local_err = NULL;
>  
>  if (env->psw.mask & PSW_MASK_PSTATE) {
>  s390_program_interrupt(env, PGM_PRIVILEGED, ra);
> @@ -176,6 +177,12 @@ out:
>  return;
>  }
>  
> +if (!s390_pv_check(_err)) {
> +error_report_err(local_err);
> +env->regs[r1 + 1] = DIAG_308_RC_INVAL_FOR_PV;
> +return;
> +}
> +

in general yes

however I have noticed that we don't always return a PGM_SPECIFICATION
when PV is not available (we currently do it only for DIAG308_PV_SET). I
think we should return the PGM_SPECIFICATION for all PV subcodes when
the feature is not present (but this is a separate issue)

let me add Janosch in CC since he wrote that code

>  s390_ipl_reset_request(cs, S390_RESET_PV);
>  break;
>  default:

Re: [PATCH v1 2/2] s390x/pv: Add support for asynchronous teardown for reboot

2023-01-03 Thread Claudio Imbrenda

On Tue, 3 Jan 2023 12:27:32 +0100
Thomas Huth  wrote:

> On 22/12/2022 16.04, Claudio Imbrenda wrote:
> > This patch adds support for the asynchronous teardown for reboot for
> > protected VMs.  
> [...]
> > +bool s390_pv_vm_try_disable_async(void)
> > +{
> > +char tname[VCPU_THREAD_NAME_SIZE];
> > +QemuThread *t;
> > +
> > +if (!kvm_check_extension(kvm_state, 
> > KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) {
> > +return false;
> > +}
> > +if (s390_pv_cmd(KVM_PV_ASYNC_CLEANUP_PREPARE, NULL) != 0) {
> > +return false;
> > +}
> > +
> > +t = g_malloc0(sizeof(QemuThread));
> > +snprintf(tname, VCPU_THREAD_NAME_SIZE, "async_unpr/KVM");
> > +
> > +qemu_thread_create(t, tname, s390_pv_do_unprot_async_fn, NULL,
> > +   QEMU_THREAD_DETACHED);  
> 
> I think you could get along without the tname[] array here by simply passing 
> the string directly to qemu_thread_create() ?

ahh, you're right

maybe I should also pick a better name? in retrospect it looks quite
cryptic

> 
> Apart from that, patch looks fine to me.
> 
>   Thomas
>

[PATCH v1 1/2] Linux header update

2022-12-22 Thread Claudio Imbrenda

Signed-off-by: Claudio Imbrenda 
---
 linux-headers/linux/kvm.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index ebdafa576d..122b273433 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -1175,6 +1175,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220
 #define KVM_CAP_S390_ZPCI_OP 221
 #define KVM_CAP_S390_CPU_TOPOLOGY 222
+#define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1737,6 +1738,8 @@ enum pv_cmd_id {
KVM_PV_UNSHARE_ALL,
KVM_PV_INFO,
KVM_PV_DUMP,
+   KVM_PV_ASYNC_CLEANUP_PREPARE,
+   KVM_PV_ASYNC_CLEANUP_PERFORM,
 };
 
 struct kvm_pv_cmd {
-- 
2.38.1

[PATCH v1 0/2] s390x/pv: Add support for asynchronous teardown for reboot

2022-12-22 Thread Claudio Imbrenda

The first patch is just a minimal header update to compile the second
patch; it can be safely discarded once the Linux headers are updated to
6.2.

The second patch adds support for asynchronous teardown of protected
guests when rebooting. First the existing guest is prepared for
asynchronous teardown, the rebooted guest will be able to continue
immediately, while a background thread actually performs the necessary
cleanup.

Claudio Imbrenda (2):
  Linux header update
  s390x/pv: Add support for asynchronous teardown for reboot

 hw/s390x/pv.c  | 28 
 hw/s390x/s390-virtio-ccw.c |  5 -
 include/hw/s390x/pv.h  |  2 ++
 linux-headers/linux/kvm.h  |  3 +++
 4 files changed, 37 insertions(+), 1 deletion(-)

-- 
2.38.1

[PATCH v1 2/2] s390x/pv: Add support for asynchronous teardown for reboot

2022-12-22 Thread Claudio Imbrenda

This patch adds support for the asynchronous teardown for reboot for
protected VMs.

When attempting to tear down a protected VM, try to use the new
asynchronous interface first. If that fails, fall back to the classic
synchronous one.

The asynchronous interface involves invoking the new
KVM_PV_ASYNC_CLEANUP_PREPARE command for the KVM_S390_PV_COMMAND ioctl.

This will prepare the current protected VM for asynchronous teardown.
Once the protected VM is prepared for teardown, execution can continue
immediately.

Once the protected VM has been prepared, a new thread is started to
actually perform the teardown. The new thread uses the new
KVM_PV_ASYNC_CLEANUP_PERFORM command for the KVM_S390_PV_COMMAND ioctl.
The previously prepared protected VM is torn down in the new thread.

Once KVM_PV_ASYNC_CLEANUP_PERFORM is invoked, it is possible to use
KVM_PV_ASYNC_CLEANUP_PREPARE again. If a protected VM has already been
prepared and its cleanup has not started, it will not be possible to
prepare a new VM. In that case the classic synchronous teardown has to
be performed.

The synchronous teardown will now also clean up any prepared VMs whose
asynchronous teardown has not been initiated yet.

This considerably speeds up the reboot of a protected VM; for large VMs
especially, it could take a long time to perform a reboot with the
traditional synchronous teardown, while with this patch it is almost
immediate.

Signed-off-by: Claudio Imbrenda 
---
 hw/s390x/pv.c  | 28 
 hw/s390x/s390-virtio-ccw.c |  5 -
 include/hw/s390x/pv.h  |  2 ++
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c
index 8dfe92d8df..0b8345092b 100644
--- a/hw/s390x/pv.c
+++ b/hw/s390x/pv.c
@@ -16,6 +16,7 @@
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "sysemu/kvm.h"
+#include "sysemu/cpus.h"
 #include "qom/object_interfaces.h"
 #include "exec/confidential-guest-support.h"
 #include "hw/s390x/ipl.h"
@@ -107,6 +108,33 @@ void s390_pv_vm_disable(void)
  s390_pv_cmd_exit(KVM_PV_DISABLE, NULL);
 }
 
+static void *s390_pv_do_unprot_async_fn(void *p)
+{
+ s390_pv_cmd_exit(KVM_PV_ASYNC_CLEANUP_PERFORM, NULL);
+ return NULL;
+}
+
+bool s390_pv_vm_try_disable_async(void)
+{
+char tname[VCPU_THREAD_NAME_SIZE];
+QemuThread *t;
+
+if (!kvm_check_extension(kvm_state, KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) {
+return false;
+}
+if (s390_pv_cmd(KVM_PV_ASYNC_CLEANUP_PREPARE, NULL) != 0) {
+return false;
+}
+
+t = g_malloc0(sizeof(QemuThread));
+snprintf(tname, VCPU_THREAD_NAME_SIZE, "async_unpr/KVM");
+
+qemu_thread_create(t, tname, s390_pv_do_unprot_async_fn, NULL,
+   QEMU_THREAD_DETACHED);
+
+return true;
+}
+
 int s390_pv_set_sec_parms(uint64_t origin, uint64_t length)
 {
 struct kvm_s390_pv_sec_parm args = {
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index f22f61b8b6..503f212a31 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -41,6 +41,7 @@
 #include "hw/qdev-properties.h"
 #include "hw/s390x/tod.h"
 #include "sysemu/sysemu.h"
+#include "sysemu/cpus.h"
 #include "hw/s390x/pv.h"
 #include "migration/blocker.h"
 #include "qapi/visitor.h"
@@ -329,7 +330,9 @@ static inline void s390_do_cpu_ipl(CPUState *cs, run_on_cpu_data arg)
 
 static void s390_machine_unprotect(S390CcwMachineState *ms)
 {
-s390_pv_vm_disable();
+if (!s390_pv_vm_try_disable_async()) {
+s390_pv_vm_disable();
+}
 ms->pv = false;
 migrate_del_blocker(pv_mig_blocker);
 error_free_or_abort(&pv_mig_blocker);
diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h
index 9360aa1091..966306a9db 100644
--- a/include/hw/s390x/pv.h
+++ b/include/hw/s390x/pv.h
@@ -41,6 +41,7 @@ static inline bool s390_is_pv(void)
 int s390_pv_query_info(void);
 int s390_pv_vm_enable(void);
 void s390_pv_vm_disable(void);
+bool s390_pv_vm_try_disable_async(void);
 int s390_pv_set_sec_parms(uint64_t origin, uint64_t length);
 int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak);
 void s390_pv_prep_reset(void);
@@ -60,6 +61,7 @@ static inline bool s390_is_pv(void) { return false; }
 static inline int s390_pv_query_info(void) { return 0; }
 static inline int s390_pv_vm_enable(void) { return 0; }
 static inline void s390_pv_vm_disable(void) {}
+static inline bool s390_pv_vm_try_disable_async(void) { return false; }
 static inline int s390_pv_set_sec_parms(uint64_t origin, uint64_t length) { return 0; }
 static inline int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak) { return 0; }
 static inline void s390_pv_prep_reset(void) {}
-- 
2.38.1

Re: [PATCH] docs/system/s390x: Document the "loadparm" machine property

2022-11-14 Thread Claudio Imbrenda

On Mon, 14 Nov 2022 14:25:02 +0100
Thomas Huth  wrote:

> The "loadparm" machine property is useful for selecting alternative
> kernels on the disk of the guest, but so far we do not tell the users
> yet how to use it. Add some documentation to fill this gap.
> 
> Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2128235
> Signed-off-by: Thomas Huth 

Reviewed-by: Claudio Imbrenda 

> ---
>  docs/system/s390x/bootdevices.rst | 26 ++
>  1 file changed, 26 insertions(+)
> 
> diff --git a/docs/system/s390x/bootdevices.rst 
> b/docs/system/s390x/bootdevices.rst
> index b5950133e8..40089c35a9 100644
> --- a/docs/system/s390x/bootdevices.rst
> +++ b/docs/system/s390x/bootdevices.rst
> @@ -53,6 +53,32 @@ recommended to specify a CD-ROM device via ``-device 
> scsi-cd`` (as mentioned
>  above) instead.
>  
>  
> +Selecting kernels with the ``loadparm`` property
> +
> +
> +The ``s390-ccw-virtio`` machine supports the so-called ``loadparm`` parameter
> +which can be used to select the kernel on the disk of the guest that the
> +s390-ccw bios should boot. When starting QEMU, it can be specified like 
> this::
> +
> + qemu-system-s390x -machine s390-ccw-virtio,loadparm=<string>
> +
> +The first way to use this parameter is to use the word ``PROMPT`` as the
> +<string> here. In that case the s390-ccw bios will show a list of
> +installed kernels on the disk of the guest and ask the user to enter a number
> +to choose which kernel should be booted -- similar to what can be achieved by
> +specifying the ``-boot menu=on`` option when starting QEMU. Note that the 
> menu
> +list will only show the names of the installed kernels when using a DASD-like
> +disk image with 4k byte sectors, on normal SCSI-style disks with 512-byte
> +sectors, there is not enough space for the zipl loader on the disk to store
> +the kernel names, so you only get a list without names here.
> +
> +The second way to use this parameter is to use a number in the range from 0
> +to 31. The numbers that can be used here correspond to the numbers that are
> +shown when using the ``PROMPT`` option, and the s390-ccw bios will then try
> +to automatically boot the kernel that is associated with the given number.
> +Note that ``0`` can be used to boot the default entry.
> +
> +
>  Booting from a network device
>  -
>

[PATCH] s390x/pv: remove semicolon from macro definition

2022-10-10 Thread Claudio Imbrenda

Remove spurious semicolon at the end of the macro s390_pv_cmd

Signed-off-by: Claudio Imbrenda 
---
 hw/s390x/pv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c
index 401b63d6cb..9bcd5d829f 100644
--- a/hw/s390x/pv.c
+++ b/hw/s390x/pv.c
@@ -45,7 +45,7 @@ static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, 
void *data)
  * This macro lets us pass the command as a string to the function so
  * we can print it on an error.
  */
-#define s390_pv_cmd(cmd, data) __s390_pv_cmd(cmd, #cmd, data);
+#define s390_pv_cmd(cmd, data) __s390_pv_cmd(cmd, #cmd, data)
 #define s390_pv_cmd_exit(cmd, data)\
 {  \
 int rc;\
-- 
2.37.3

[PATCH v4 1/1] os-posix: asynchronous teardown for shutdown on Linux

2022-08-12 Thread Claudio Imbrenda

This patch adds support for asynchronously tearing down a VM on Linux.

When qemu terminates, either naturally or because of a fatal signal,
the VM is torn down. If the VM is huge, it can take a considerable
amount of time for it to be cleaned up. In case of a protected VM, it
might take even longer than a non-protected VM (this is the case on
s390x, for example).

Some users might want to shut down a VM and restart it immediately,
without having to wait. This is especially true if management
infrastructure like libvirt is used.

This patch implements a simple trick on Linux to allow qemu to return
immediately, with the teardown of the VM being performed
asynchronously.

If the new commandline option -async-teardown is used, a new process is
spawned from qemu at startup, using the clone syscall, in such a way that
it will share its address space with qemu. The new process will have the
name "cleanup/<QEMU_PID>". It will wait until qemu terminates
completely, and then it will exit itself.

This allows qemu to terminate quickly, without having to wait for the
whole address space to be torn down. The cleanup process will exit
after qemu, so it will be the last user of the address space, and
therefore it will take care of the actual teardown. The cleanup
process will share the same cgroups as qemu, so both memory usage and
cpu time will be accounted properly.

If possible, close_range will be used in the cleanup process to close
all open file descriptors. If it is not available or if it fails, /proc
will be used to determine which file descriptors to close.

If the cleanup process is forcefully killed with SIGKILL before the
main qemu process has terminated completely, the mechanism is defeated
and the teardown will not be asynchronous.

This feature can already be used with libvirt by adding the following
to the XML domain definition to pass the parameter to qemu directly:

  <commandline xmlns="http://libvirt.org/schemas/domain/qemu/1.0">
  <arg value='-async-teardown'/>
  </commandline>

Signed-off-by: Claudio Imbrenda 
Reviewed-by: Murilo Opsfelder Araujo 
Tested-by: Murilo Opsfelder Araujo 
---
 include/qemu/async-teardown.h |  22 +
 meson.build   |   1 +
 os-posix.c|   6 ++
 qemu-options.hx   |  19 +
 util/async-teardown.c | 155 ++
 util/meson.build  |   1 +
 6 files changed, 204 insertions(+)
 create mode 100644 include/qemu/async-teardown.h
 create mode 100644 util/async-teardown.c

diff --git a/include/qemu/async-teardown.h b/include/qemu/async-teardown.h
new file mode 100644
index 00..092e7a37e7
--- /dev/null
+++ b/include/qemu/async-teardown.h
@@ -0,0 +1,22 @@
+/*
+ * Asynchronous teardown
+ *
+ * Copyright IBM, Corp. 2022
+ *
+ * Authors:
+ *  Claudio Imbrenda 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or (at your
+ * option) any later version.  See the COPYING file in the top-level directory.
+ *
+ */
+#ifndef QEMU_ASYNC_TEARDOWN_H
+#define QEMU_ASYNC_TEARDOWN_H
+
+#include "config-host.h"
+
+#ifdef CONFIG_LINUX
+void init_async_teardown(void);
+#endif
+
+#endif
diff --git a/meson.build b/meson.build
index 294e9a8f32..7bccad93d0 100644
--- a/meson.build
+++ b/meson.build
@@ -1892,6 +1892,7 @@ config_host_data.set('HAVE_SYS_IOCCOM_H', 
cc.has_header('sys/ioccom.h'))
 config_host_data.set('HAVE_SYS_KCOV_H', cc.has_header('sys/kcov.h'))
 
 # has_function
+config_host_data.set('CONFIG_CLOSE_RANGE', cc.has_function('close_range'))
 config_host_data.set('CONFIG_ACCEPT4', cc.has_function('accept4'))
 config_host_data.set('CONFIG_CLOCK_ADJTIME', cc.has_function('clock_adjtime'))
 config_host_data.set('CONFIG_DUP3', cc.has_function('dup3'))
diff --git a/os-posix.c b/os-posix.c
index 321fc4bd13..4858650c3e 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -39,6 +39,7 @@
 
 #ifdef CONFIG_LINUX
 #include <sys/prctl.h>
+#include "qemu/async-teardown.h"
 #endif
 
 /*
@@ -150,6 +151,11 @@ int os_parse_cmd_args(int index, const char *optarg)
 case QEMU_OPTION_daemonize:
 daemonize = 1;
 break;
+#if defined(CONFIG_LINUX)
+case QEMU_OPTION_asyncteardown:
+init_async_teardown();
+break;
+#endif
 default:
 return -1;
 }
diff --git a/qemu-options.hx b/qemu-options.hx
index 3f23a42fa8..f913fc307f 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4743,6 +4743,25 @@ HXCOMM Internal use
 DEF("qtest", HAS_ARG, QEMU_OPTION_qtest, "", QEMU_ARCH_ALL)
 DEF("qtest-log", HAS_ARG, QEMU_OPTION_qtest_log, "", QEMU_ARCH_ALL)
 
+#ifdef __linux__
+DEF("async-teardown", 0, QEMU_OPTION_asyncteardown,
+"-async-teardown enable asynchronous teardown\n",
+QEMU_ARCH_ALL)
+#endif
+SRST
+``-async-teardown``
+Enable asynchronous teardown. A new process called "cleanup/<QEMU_PID>"
+will be created at startup sharing the address space with the main qemu
+process, using clone. It will wait for the main qemu process to
+

Re: [PATCH v3 1/1] os-posix: asynchronous teardown for shutdown on Linux

2022-08-12 Thread Claudio Imbrenda

On Fri, 12 Aug 2022 08:38:59 -0300
Murilo Opsfelder Araújo  wrote:

> On 8/12/22 04:26, Claudio Imbrenda wrote:
> > On Thu, 11 Aug 2022 23:05:52 -0300
> > Murilo Opsfelder Araújo  wrote:
> >  
> >> On 8/11/22 11:02, Daniel P. Berrangé wrote:
> >> [...]  
> >>>>> Hmm, I was hoping you could just use SIGKILL to guarantee that this
> >>>>> gets killed off.  Is SIGKILL delivered too soon to allow for the
> >>>>> main QEMU process to have exited quickly ?  
> >>>>
> >>>> yes, I tried. qemu has not finished exiting when the signal is
> >>>> delivered, the cleanup process dies before qemu, which defeats the
> >>>> purpose  
> >>>
> >>> Ok, too bad.
> >>>  
> >>>>> If so I wonder what happens when systemd just delivers SIGKILL to
> >>>>> all processes in the cgroup - I'm not sure there's a guarantee it
> >>>>> will SIGKILL the main qemu before it SIGKILLs this helper  
> >>>>
> >>>> I'm afraid in that case there is no guarantee.
> >>>>
> >>>> for what it's worth, both virsh shutdown and destroy seem to do things
> >>>> properly.  
> >>>
> >>> Hmm, probably because libvirt tells QEMU to exit before systemd comes
> >>> along and tells everything in the cgroup to die with SIGKILL.  
> >>
> >> It seems Libvirt sends SIGKILL if qemu process doesn't terminate within 10
> >> seconds after Libvirt sent SIGTERM:
> >>
> >> https://gitlab.com/libvirt/libvirt/-/blob/0615df084ec9996b5df88d6a1b59c557e22f3a12/src/util/virprocess.c#L375
> >>   
> >
> > but this is fine.
> >
> > with asynchronous teardown, qemu will exit almost immediately when
> > receiving SIGTERM, and the cleanup process will start cleaning up.  
> 
> Under normal and orderly conditions, yes.
> 
> >> So I guess this patch happened to work with Libvirt because the main qemu
> >> process terminated before the timeout and before SIGKILL was delivered.  
> >
> > it seems so
> >  
> >>
> >> The cleanup process is trying to solve the problem where the main qemu 
> >> process
> >> takes too long to terminate. However, if the cleanup process itself takes 
> >> too
> >> long, SIGKILL will be sent by Libvirt anyway.  
> >
> > but that is not a problem, the sole purpose of the cleanup process is
> > to terminate _after_ qemu. it doesn't matter what happens after qemu
> > has terminated. if you look at the patch, after going to great lengths
> > to assure that qemu has terminated, all the child process does is
> > _exit(0).
> >  
> >>
> >> Perhaps we can describe this situation in the parameter help, e.g.: If
> >> the management layer decides to send SIGKILL (e.g.: due to timeout or
> >> deliberate decision), the cleanup process can exit before the main
> >> process, defeating its purpose.
> >
> > if the management layer (or the user) decides to send SIGKILL
> > immediately to the whole cgroup without sending SIGTERM first, then
> > this whole asynchronous teardown mechanism is defeated, yes.  
> 
> This situation is what we likely want to describe in the parameter help. I 
> don't
> want to give users the false impression that this option will *always* behave
> the manner we expect it to work *most* of the time.

fair enough, I'll improve the documentation

> 
> --
> Murilo

Re: [PATCH v3 1/1] os-posix: asynchronous teardown for shutdown on Linux

2022-08-12 Thread Claudio Imbrenda

On Thu, 11 Aug 2022 23:05:52 -0300
Murilo Opsfelder Araújo  wrote:

> On 8/11/22 11:02, Daniel P. Berrangé wrote:
> [...]
> >>> Hmm, I was hoping you could just use SIGKILL to guarantee that this
> >>> gets killed off.  Is SIGKILL delivered too soon to allow for the
> >>> main QEMU process to have exited quickly ?  
> >>
> >> yes, I tried. qemu has not finished exiting when the signal is
> >> delivered, the cleanup process dies before qemu, which defeats the
> >> purpose  
> >
> > Ok, too bad.
> >  
> >>> If so I wonder what happens when systemd just delivers SIGKILL to
> >>> all processes in the cgroup - I'm not sure there's a guarantee it
> >>> will SIGKILL the main qemu before it SIGKILLs this helper  
> >>
> >> I'm afraid in that case there is no guarantee.
> >>
> >> for what it's worth, both virsh shutdown and destroy seem to do things
> >> properly.  
> >
> > Hmm, probably because libvirt tells QEMU to exit before systemd comes
> > along and tells everything in the cgroup to die with SIGKILL.  
> 
> It seems Libvirt sends SIGKILL if qemu process doesn't terminate within 10
> seconds after Libvirt sent SIGTERM:
> 
> https://gitlab.com/libvirt/libvirt/-/blob/0615df084ec9996b5df88d6a1b59c557e22f3a12/src/util/virprocess.c#L375

but this is fine.

with asynchronous teardown, qemu will exit almost immediately when
receiving SIGTERM, and the cleanup process will start cleaning up.

> 
> So I guess this patch happened to work with Libvirt because the main qemu
> process terminated before the timeout and before SIGKILL was delivered.

it seems so

> 
> The cleanup process is trying to solve the problem where the main qemu process
> takes too long to terminate. However, if the cleanup process itself takes too
> long, SIGKILL will be sent by Libvirt anyway.

but that is not a problem, the sole purpose of the cleanup process is
to terminate _after_ qemu. it doesn't matter what happens after qemu
has terminated. if you look at the patch, after going to great lengths
to assure that qemu has terminated, all the child process does is
_exit(0). 

> 
> Perhaps we can describe this situation in the parameter help, e.g.: If
> the management layer decides to send SIGKILL (e.g.: due to timeout or deliberate
> decision), the cleanup process can exit before the main process, defeating its
> purpose.

if the management layer (or the user) decides to send SIGKILL
immediately to the whole cgroup without sending SIGTERM first, then
this whole asynchronous teardown mechanism is defeated, yes.

Re: [PATCH v3 1/1] os-posix: asynchronous teardown for shutdown on Linux

2022-08-11 Thread Claudio Imbrenda

On Thu, 11 Aug 2022 13:27:44 +0100
Daniel P. Berrangé  wrote:

> On Tue, Aug 09, 2022 at 08:40:24AM +0200, Claudio Imbrenda wrote:
> > This patch adds support for asynchronously tearing down a VM on Linux.
> > 
> > When qemu terminates, either naturally or because of a fatal signal,
> > the VM is torn down. If the VM is huge, it can take a considerable
> > amount of time for it to be cleaned up. In case of a protected VM, it
> > might take even longer than a non-protected VM (this is the case on
> > s390x, for example).
> > 
> > Some users might want to shut down a VM and restart it immediately,
> > without having to wait. This is especially true if management
> > infrastructure like libvirt is used.
> > 
> > This patch implements a simple trick on Linux to allow qemu to return
> > immediately, with the teardown of the VM being performed
> > asynchronously.
> > 
> > If the new commandline option -async-teardown is used, a new process is
> > spawned from qemu at startup, using the clone syscall, in such way that
> > it will share its address space with qemu.
> > 
> > The new process will have the name "cleanup/<QEMU_PID>". It will wait
> > until qemu terminates, and then it will exit itself.
> > 
> > This allows qemu to terminate quickly, without having to wait for the
> > whole address space to be torn down. The teardown process will exit
> > after qemu, so it will be the last user of the address space, and
> > therefore it will take care of the actual teardown.
> > 
> > The teardown process will share the same cgroups as qemu, so both
> > memory usage and cpu time will be accounted properly.
> > 
> > This feature can already be used with libvirt by adding the following
> > to the XML domain definition to pass the parameter to qemu directly:
> > 
> >   <commandline xmlns="http://libvirt.org/schemas/domain/qemu/1.0">
> >   <arg value='-async-teardown'/>
> >   </commandline>
> > 
> > More advanced interfaces like pidfd or close_range have intentionally
> > been avoided in order to be more compatible with older kernels.
> > 
> > Signed-off-by: Claudio Imbrenda 
> > ---
> >  include/qemu/async-teardown.h |  22 ++
> >  os-posix.c|   6 ++
> >  qemu-options.hx   |  17 +
> >  util/async-teardown.c | 123 ++
> >  util/meson.build  |   1 +
> >  5 files changed, 169 insertions(+)
> >  create mode 100644 include/qemu/async-teardown.h
> >  create mode 100644 util/async-teardown.c
> >   
> 
> > diff --git a/include/qemu/async-teardown.h b/include/qemu/async-teardown.h
> > new file mode 100644
> > index 00..092e7a37e7
> > --- /dev/null
> > +++ b/include/qemu/async-teardown.h
> > @@ -0,0 +1,22 @@
> > +/*
> > + * Asynchronous teardown
> > + *
> > + * Copyright IBM, Corp. 2022
> > + *
> > + * Authors:
> > + *  Claudio Imbrenda 
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or (at 
> > your
> > + * option) any later version.  See the COPYING file in the top-level 
> > directory.
> > + *
> > + */
> > +#ifndef QEMU_ASYNC_TEARDOWN_H
> > +#define QEMU_ASYNC_TEARDOWN_H
> > +
> > +#include "config-host.h"
> > +
> > +#ifdef CONFIG_LINUX
> > +void init_async_teardown(void);
> > +#endif
> > +
> > +#endif
> > diff --git a/os-posix.c b/os-posix.c
> > index 321fc4bd13..4858650c3e 100644
> > --- a/os-posix.c
> > +++ b/os-posix.c
> > @@ -39,6 +39,7 @@
> >  
> >  #ifdef CONFIG_LINUX
> >  #include <sys/prctl.h>
> > +#include "qemu/async-teardown.h"
> >  #endif
> >  
> >  /*
> > @@ -150,6 +151,11 @@ int os_parse_cmd_args(int index, const char *optarg)
> >  case QEMU_OPTION_daemonize:
> >  daemonize = 1;
> >  break;
> > +#if defined(CONFIG_LINUX)
> > +case QEMU_OPTION_asyncteardown:
> > +init_async_teardown();
> > +break;
> > +#endif
> >  default:
> >  return -1;
> >  }
> > diff --git a/qemu-options.hx b/qemu-options.hx
> > index 3f23a42fa8..d434353159 100644
> > --- a/qemu-options.hx
> > +++ b/qemu-options.hx
> > @@ -4743,6 +4743,23 @@ HXCOMM Internal use
> >  DEF("qtest", HAS_ARG, QEMU_OPTION_qtest, "", QEMU_ARCH_ALL)
> >  DEF("qtest-log", HAS_ARG, QEMU_OPTION_qtest_log, "", QEMU_ARCH_ALL)
> >  
> > +#ifdef __linux__
> > +DEF("async-teardown", 0, QEMU_OPTION_asyncteard

Re: [PATCH v3 1/1] os-posix: asynchronous teardown for shutdown on Linux

2022-08-11 Thread Claudio Imbrenda

On Wed, 10 Aug 2022 17:30:41 -0300
Murilo Opsfelder Araújo  wrote:

> Hi, Claudio.

Hi Murilo,

[...]
 
> I've smoke-tested this on ppc and everything looks fine.
> For what's worth:
> 
> Reviewed-by: Murilo Opsfelder Araujo 
> Tested-by: Murilo Opsfelder Araujo 

thanks a lot for testing this!

> 
> 
> Have you measured the benefits of using -async-teardown vs. not using it?
> If so, can you please share the details so I can give it a try on ppc, too?
> 
> The wall-clock perception is that nothing has changed, for better or worse.
> My tests used mid-sized VMs, like 128 vCPUs, 64GB RAM.

The number of CPUs doesn't really have any impact. 64G of RAM is quite
small to notice a sizeable difference, although you should be able to
see a few seconds of speedup in the shutdown. Also, starting a guest
with a lot of RAM is not enough, you have to make sure that the guest
ram is actually allocated (completely fill the ram in the guest before
shutting it down)

I just tested a 64G and a 256G guest on s390x. I measured the time
between the last line in the console ("Reached target Power-Off.") and
the moment when control comes back to the shell. The measurement was
not exactly super accurate (I manually ran "date +%s" in another shell
when I saw the last line in the console, and then again when I got the
shell back from qemu). 

The 64G guest needs a few seconds, the 256G needs almost exactly 4
times as much. With the asynchronous teardown it's almost instant in
both cases (less than 1s, too fast to measure manually).

Try a multi-TB guest if you can (at the moment I can't) to
see a more marked effect.

Also consider that this is for _normal_ guests. Protected guests on
s390x have an even slower teardown due to the way protected
virtualization is implemented in the hardware.

I hope this was helpful

> 
> Cheers!
> 
> > ---
> >   include/qemu/async-teardown.h |  22 ++
> >   os-posix.c|   6 ++
> >   qemu-options.hx   |  17 +
> >   util/async-teardown.c | 123 ++
> >   util/meson.build  |   1 +
> >   5 files changed, 169 insertions(+)
> >   create mode 100644 include/qemu/async-teardown.h
> >   create mode 100644 util/async-teardown.c
> > 
> > diff --git a/include/qemu/async-teardown.h b/include/qemu/async-teardown.h
> > new file mode 100644
> > index 00..092e7a37e7
> > --- /dev/null
> > +++ b/include/qemu/async-teardown.h
> > @@ -0,0 +1,22 @@
> > +/*
> > + * Asynchronous teardown
> > + *
> > + * Copyright IBM, Corp. 2022
> > + *
> > + * Authors:
> > + *  Claudio Imbrenda 
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or (at 
> > your
> > + * option) any later version.  See the COPYING file in the top-level 
> > directory.
> > + *
> > + */
> > +#ifndef QEMU_ASYNC_TEARDOWN_H
> > +#define QEMU_ASYNC_TEARDOWN_H
> > +
> > +#include "config-host.h"
> > +
> > +#ifdef CONFIG_LINUX
> > +void init_async_teardown(void);
> > +#endif
> > +
> > +#endif
> > diff --git a/os-posix.c b/os-posix.c
> > index 321fc4bd13..4858650c3e 100644
> > --- a/os-posix.c
> > +++ b/os-posix.c
> > @@ -39,6 +39,7 @@
> >   
> >   #ifdef CONFIG_LINUX
> >   #include <sys/prctl.h>
> > +#include "qemu/async-teardown.h"
> >   #endif
> >   
> >   /*
> > @@ -150,6 +151,11 @@ int os_parse_cmd_args(int index, const char *optarg)
> >   case QEMU_OPTION_daemonize:
> >   daemonize = 1;
> >   break;
> > +#if defined(CONFIG_LINUX)
> > +case QEMU_OPTION_asyncteardown:
> > +init_async_teardown();
> > +break;
> > +#endif
> >   default:
> >   return -1;
> >   }
> > diff --git a/qemu-options.hx b/qemu-options.hx
> > index 3f23a42fa8..d434353159 100644
> > --- a/qemu-options.hx
> > +++ b/qemu-options.hx
> > @@ -4743,6 +4743,23 @@ HXCOMM Internal use
> >   DEF("qtest", HAS_ARG, QEMU_OPTION_qtest, "", QEMU_ARCH_ALL)
> >   DEF("qtest-log", HAS_ARG, QEMU_OPTION_qtest_log, "", QEMU_ARCH_ALL)
> >   
> > +#ifdef __linux__
> > +DEF("async-teardown", 0, QEMU_OPTION_asyncteardown,
> > +"-async-teardown enable asynchronous teardown\n",
> > +QEMU_ARCH_ALL)
> > +#endif
> > +SRST
> > +``-async-teardown``
> > +Enable asynchronous teardown. A new teardown process will be
> > +created at startup, using clone. The teardown process will share
> > +the address spac

Re: [PATCH v2 2/3] softmmu/physmem: Remove the ifdef linux around the pagesize functions

2022-08-10 Thread Claudio Imbrenda

On Wed, 10 Aug 2022 14:57:19 +0200
Thomas Huth  wrote:

> Now that host_memory_backend_pagesize() is not depending on the hugetlb
> memory path handling anymore, we can also remove the #ifdef and the
> TOCTTOU comment from the calling functions - the code should now work
> equally well on all host architectures.
> 
> Signed-off-by: Thomas Huth 

Reviewed-by: Claudio Imbrenda 

> ---
>  softmmu/physmem.c | 17 -
>  1 file changed, 17 deletions(-)
> 
> diff --git a/softmmu/physmem.c b/softmmu/physmem.c
> index dc3c3e5f2e..50231bab30 100644
> --- a/softmmu/physmem.c
> +++ b/softmmu/physmem.c
> @@ -1331,13 +1331,6 @@ GString *ram_block_format(void)
>  return buf;
>  }
>  
> -#ifdef __linux__
> -/*
> - * FIXME TOCTTOU: this iterates over memory backends' mem-path, which
> - * may or may not name the same files / on the same filesystem now as
> - * when we actually open and map them.  Iterate over the file
> - * descriptors instead, and use qemu_fd_getpagesize().
> - */
>  static int find_min_backend_pagesize(Object *obj, void *opaque)
>  {
>  long *hpsize_min = opaque;
> @@ -1391,16 +1384,6 @@ long qemu_maxrampagesize(void)
>  object_child_foreach(memdev_root, find_max_backend_pagesize, &pagesize);
>  return pagesize;
>  }
> -#else
> -long qemu_minrampagesize(void)
> -{
> -return qemu_real_host_page_size();
> -}
> -long qemu_maxrampagesize(void)
> -{
> -return qemu_real_host_page_size();
> -}
> -#endif
>  
>  #ifdef CONFIG_POSIX
>  static int64_t get_file_size(int fd)

Re: [PATCH v2 1/3] backends/hostmem: Fix support of memory-backend-memfd in qemu_maxrampagesize()

2022-08-10 Thread Claudio Imbrenda

On Wed, 10 Aug 2022 14:57:18 +0200
Thomas Huth  wrote:

> It is currently not possible yet to use "memory-backend-memfd" on s390x
> with hugepages enabled. This problem is caused by qemu_maxrampagesize()
> not taking memory-backend-memfd objects into account yet, so the code
> in s390_memory_init() fails to enable the huge page support there via
> s390_set_max_pagesize(). Fix it by generalizing the code, so that it
> looks at qemu_ram_pagesize(memdev->mr.ram_block) instead of re-trying
> to get the information from the filesystem.
> 
> Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2116496
> Suggested-by: David Hildenbrand 
> Signed-off-by: Thomas Huth 

more removed lines than added, I like it

Reviewed-by: Claudio Imbrenda 

> ---
>  backends/hostmem.c | 14 ++
>  1 file changed, 2 insertions(+), 12 deletions(-)
> 
> diff --git a/backends/hostmem.c b/backends/hostmem.c
> index 624bb7ecd3..4428e06738 100644
> --- a/backends/hostmem.c
> +++ b/backends/hostmem.c
> @@ -306,22 +306,12 @@ bool host_memory_backend_is_mapped(HostMemoryBackend 
> *backend)
>  return backend->is_mapped;
>  }
>  
> -#ifdef __linux__
>  size_t host_memory_backend_pagesize(HostMemoryBackend *memdev)
>  {
> -Object *obj = OBJECT(memdev);
> -char *path = object_property_get_str(obj, "mem-path", NULL);
> -size_t pagesize = qemu_mempath_getpagesize(path);
> -
> -g_free(path);
> +size_t pagesize = qemu_ram_pagesize(memdev->mr.ram_block);
> +g_assert(pagesize >= qemu_real_host_page_size());
>  return pagesize;
>  }
> -#else
> -size_t host_memory_backend_pagesize(HostMemoryBackend *memdev)
> -{
> -return qemu_real_host_page_size();
> -}
> -#endif
>  
>  static void
>  host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)

Re: [PATCH v2 0/3] Fix hugepages with memfd on s390x and clean up related code

2022-08-10 Thread Claudio Imbrenda

On Wed, 10 Aug 2022 14:57:17 +0200
Thomas Huth  wrote:

> The first patch fixes the problem that hugepages cannot be used via
> the "memory-backend-memfd" object on s390x. The second and third patch
> are some clean-ups that can be done after generalizing the code in the
> first patch.

thanks for fixing this

> 
> v2:
>  - Use qemu_ram_pagesize(memdev->mr.ram_block) instead of adding
>additional code for the memfd object
>  - Added the two clean-up patches on top to simplify the code
> 
> Thomas Huth (3):
>   backends/hostmem: Fix support of memory-backend-memfd in
> qemu_maxrampagesize()
>   softmmu/physmem: Remove the ifdef __linux__  around the pagesize
> functions
>   util/mmap-alloc: Remove qemu_mempath_getpagesize()
> 
>  include/qemu/mmap-alloc.h |  2 --
>  backends/hostmem.c| 14 ++
>  softmmu/physmem.c | 17 -
>  util/mmap-alloc.c | 31 ---
>  4 files changed, 2 insertions(+), 62 deletions(-)
>

Re: [PATCH v2 3/3] util/mmap-alloc: Remove qemu_mempath_getpagesize()

2022-08-10 Thread Claudio Imbrenda

On Wed, 10 Aug 2022 14:57:20 +0200
Thomas Huth  wrote:

> The last user of this function has just been removed, so we can
> drop this function now, too.
> 
> Signed-off-by: Thomas Huth 

Reviewed-by: Claudio Imbrenda 

> ---
>  include/qemu/mmap-alloc.h |  2 --
>  util/mmap-alloc.c | 31 ---
>  2 files changed, 33 deletions(-)
> 
> diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
> index 5076695cc8..2825e231a7 100644
> --- a/include/qemu/mmap-alloc.h
> +++ b/include/qemu/mmap-alloc.h
> @@ -4,8 +4,6 @@
>  
>  size_t qemu_fd_getpagesize(int fd);
>  
> -size_t qemu_mempath_getpagesize(const char *mem_path);
> -
>  /**
>   * qemu_ram_mmap: mmap anonymous memory, the specified file or device.
>   *
> diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
> index 5b90cb68ea..5ed7d29183 100644
> --- a/util/mmap-alloc.c
> +++ b/util/mmap-alloc.c
> @@ -53,37 +53,6 @@ size_t qemu_fd_getpagesize(int fd)
>  return qemu_real_host_page_size();
>  }
>  
> -size_t qemu_mempath_getpagesize(const char *mem_path)
> -{
> -#ifdef CONFIG_LINUX
> -struct statfs fs;
> -int ret;
> -
> -if (mem_path) {
> -do {
> -ret = statfs(mem_path, &fs);
> -} while (ret != 0 && errno == EINTR);
> -
> -if (ret != 0) {
> -fprintf(stderr, "Couldn't statfs() memory path: %s\n",
> -strerror(errno));
> -exit(1);
> -}
> -
> -if (fs.f_type == HUGETLBFS_MAGIC) {
> -/* It's hugepage, return the huge page size */
> -return fs.f_bsize;
> -}
> -}
> -#ifdef __sparc__
> -/* SPARC Linux needs greater alignment than the pagesize */
> -return QEMU_VMALLOC_ALIGN;
> -#endif
> -#endif
> -
> -return qemu_real_host_page_size();
> -}
> -
>  #define OVERCOMMIT_MEMORY_PATH "/proc/sys/vm/overcommit_memory"
>  static bool map_noreserve_effective(int fd, uint32_t qemu_map_flags)
>  {

[PATCH v3 1/1] os-posix: asynchronous teardown for shutdown on Linux

2022-08-09 Thread Claudio Imbrenda

This patch adds support for asynchronously tearing down a VM on Linux.

When qemu terminates, either naturally or because of a fatal signal,
the VM is torn down. If the VM is huge, it can take a considerable
amount of time for it to be cleaned up. In case of a protected VM, it
might take even longer than a non-protected VM (this is the case on
s390x, for example).

Some users might want to shut down a VM and restart it immediately,
without having to wait. This is especially true if management
infrastructure like libvirt is used.

This patch implements a simple trick on Linux to allow qemu to return
immediately, with the teardown of the VM being performed
asynchronously.

If the new commandline option -async-teardown is used, a new process is
spawned from qemu at startup, using the clone syscall, in such way that
it will share its address space with qemu.

The new process will have the name "cleanup/<QEMU_PID>". It will wait
until qemu terminates, and then it will exit itself.

This allows qemu to terminate quickly, without having to wait for the
whole address space to be torn down. The teardown process will exit
after qemu, so it will be the last user of the address space, and
therefore it will take care of the actual teardown.

The teardown process will share the same cgroups as qemu, so both
memory usage and cpu time will be accounted properly.

This feature can already be used with libvirt by adding the following
to the XML domain definition to pass the parameter to qemu directly:

  <commandline xmlns="http://libvirt.org/schemas/domain/qemu/1.0">
  <arg value='-async-teardown'/>
  </commandline>

More advanced interfaces like pidfd or close_range have intentionally
been avoided in order to be more compatible with older kernels.

Signed-off-by: Claudio Imbrenda 
---
 include/qemu/async-teardown.h |  22 ++
 os-posix.c|   6 ++
 qemu-options.hx   |  17 +
 util/async-teardown.c | 123 ++
 util/meson.build  |   1 +
 5 files changed, 169 insertions(+)
 create mode 100644 include/qemu/async-teardown.h
 create mode 100644 util/async-teardown.c

diff --git a/include/qemu/async-teardown.h b/include/qemu/async-teardown.h
new file mode 100644
index 00..092e7a37e7
--- /dev/null
+++ b/include/qemu/async-teardown.h
@@ -0,0 +1,22 @@
+/*
+ * Asynchronous teardown
+ *
+ * Copyright IBM, Corp. 2022
+ *
+ * Authors:
+ *  Claudio Imbrenda 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or (at your
+ * option) any later version.  See the COPYING file in the top-level directory.
+ *
+ */
+#ifndef QEMU_ASYNC_TEARDOWN_H
+#define QEMU_ASYNC_TEARDOWN_H
+
+#include "config-host.h"
+
+#ifdef CONFIG_LINUX
+void init_async_teardown(void);
+#endif
+
+#endif
diff --git a/os-posix.c b/os-posix.c
index 321fc4bd13..4858650c3e 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -39,6 +39,7 @@
 
 #ifdef CONFIG_LINUX
 #include <sys/prctl.h>
+#include "qemu/async-teardown.h"
 #endif
 
 /*
@@ -150,6 +151,11 @@ int os_parse_cmd_args(int index, const char *optarg)
 case QEMU_OPTION_daemonize:
 daemonize = 1;
 break;
+#if defined(CONFIG_LINUX)
+case QEMU_OPTION_asyncteardown:
+init_async_teardown();
+break;
+#endif
 default:
 return -1;
 }
diff --git a/qemu-options.hx b/qemu-options.hx
index 3f23a42fa8..d434353159 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4743,6 +4743,23 @@ HXCOMM Internal use
 DEF("qtest", HAS_ARG, QEMU_OPTION_qtest, "", QEMU_ARCH_ALL)
 DEF("qtest-log", HAS_ARG, QEMU_OPTION_qtest_log, "", QEMU_ARCH_ALL)
 
+#ifdef __linux__
+DEF("async-teardown", 0, QEMU_OPTION_asyncteardown,
+"-async-teardown enable asynchronous teardown\n",
+QEMU_ARCH_ALL)
+#endif
+SRST
+``-async-teardown``
+Enable asynchronous teardown. A new teardown process will be
+created at startup, using clone. The teardown process will share
+the address space of the main qemu process, and wait for the main
+process to terminate. At that point, the teardown process will
+also exit. This allows qemu to terminate quickly if the guest was
+huge, leaving the teardown of the address space to the teardown
+process. Since the teardown process shares the same cgroups as the
+main qemu process, accounting is performed correctly.
+ERST
+
 DEF("msg", HAS_ARG, QEMU_OPTION_msg,
 "-msg [timestamp[=on|off]][,guest-name=[on|off]]\n"
 "control error message format\n"
diff --git a/util/async-teardown.c b/util/async-teardown.c
new file mode 100644
index 00..07fe549891
--- /dev/null
+++ b/util/async-teardown.c
@@ -0,0 +1,123 @@
+/*
+ * Asynchronous teardown
+ *
+ * Copyright IBM, Corp. 2022
+ *
+ * Authors:
+ *  Claudio Imbrenda 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or (at your
+ * option) any later version.  See the COPYING file in the top-level directory.
+ *
+ */

Re: [PATCH v2 1/1] osdep: asynchronous teardown for shutdown on Linux

2022-08-05 Thread Claudio Imbrenda

On Thu, 4 Aug 2022 17:58:34 +0100
Daniel P. Berrangé  wrote:

> On Thu, Aug 04, 2022 at 09:20:59AM +0100, Daniel P. Berrangé wrote:
> > On Thu, Aug 04, 2022 at 07:56:49AM +0200, Claudio Imbrenda wrote:  
> > > On Wed, 3 Aug 2022 18:34:45 +0100
> > > Daniel P. Berrangé  wrote:
> > >   
> > > > On Wed, Aug 03, 2022 at 07:31:41PM +0200, Claudio Imbrenda wrote:  
> > > > > This patch adds support for asynchronously tearing down a VM on Linux.
> > > > > 
> > > > > When qemu terminates, either naturally or because of a fatal signal,
> > > > > the VM is torn down. If the VM is huge, it can take a considerable
> > > > > amount of time for it to be cleaned up. In case of a protected VM, it
> > > > > might take even longer than a non-protected VM (this is the case on
> > > > > s390x, for example).
> > > > > 
> > > > > Some users might want to shut down a VM and restart it immediately,
> > > > > without having to wait. This is especially true if management
> > > > > infrastructure like libvirt is used.
> > > > > 
> > > > > This patch implements a simple trick on Linux to allow qemu to return
> > > > > immediately, with the teardown of the VM being performed
> > > > > asynchronously.
> > > > > 
> > > > > If the new commandline option -async-teardown is used, a new process 
> > > > > is
> > > > > spawned from qemu at startup, using the clone syscall, in such way 
> > > > > that
> > > > > it will share its address space with qemu.
> > > > > 
> > > > > > The new process will then simply wait until qemu terminates, and then 
> > > > > it
> > > > > will exit itself.
> > > > > 
> > > > > This allows qemu to terminate quickly, without having to wait for the
> > > > > whole address space to be torn down. The teardown process will exit
> > > > > after qemu, so it will be the last user of the address space, and
> > > > > therefore it will take care of the actual teardown.
> > > > > 
> > > > > The teardown process will share the same cgroups as qemu, so both
> > > > > memory usage and cpu time will be accounted properly.
> > > > > 
> > > > > This feature can already be used with libvirt by adding the following
> > > > > to the XML domain definition:
> > > > > 
> > > > > >   <commandline xmlns="http://libvirt.org/schemas/domain/qemu/1.0">
> > > > > >   <arg value='-async-teardown'/>
> > > > > >   </commandline>
> > > > 
> > > > How does this work in practice ?  Libvirt should be blocking until  
> > > 
> > > I don't know the inner details of how libvirt works..
> > >   
> > > > all processes in the cgroup have exited, including this cloned
> > > > child process.  
> > > 
> > > ..but I tested it and it works
> > > 
> > > my impression is that libvirt by default is only waiting for the
> > > main qemu process.  
> > 
> > If true, that would be a bug that needs fixing and should not be
> > relied on.  
> 
> Libvirt is invoking 'TerminateMachine' DBus call on systemd-machined.
> That in turn iterates over every process in the cgroup and kills
> them off.
> 
> Docs are a little vague and I've not followed the code perfectly, but
> that should mean TerminateMachine doesn't return until every process in
> the cgroup has exited.
> 
> That said, since this is a dbus API call, libvirt will probably
> timeout waiting for the DBus reply after something like 30-60
> seconds IIRC.

I have not observed any delays.

could it be that DBus doesn't wait for the process to be completely
dead, but only that the signal is delivered?

and which signal will DBus use?

> 
> >   
> > > the only issue I have found is the log file, which stays open as long
> > > as some file descriptors (which the cloned process inherits from the
> > > main qemu process) stay open. A new VM cannot be started if its log file
> > > is still open by the logger process. The close_range() call solves the
> > > issue.  
> 
> With regards,
> Daniel

Re: [PATCH v2 1/1] osdep: asynchronous teardown for shutdown on Linux

2022-08-05 Thread Claudio Imbrenda

On Thu, 4 Aug 2022 17:56:34 +0100
Daniel P. Berrangé  wrote:

> On Wed, Aug 03, 2022 at 07:31:41PM +0200, Claudio Imbrenda wrote:
> > This patch adds support for asynchronously tearing down a VM on Linux.
> > 
> > When qemu terminates, either naturally or because of a fatal signal,
> > the VM is torn down. If the VM is huge, it can take a considerable
> > amount of time for it to be cleaned up. In case of a protected VM, it
> > might take even longer than a non-protected VM (this is the case on
> > s390x, for example).
> > 
> > Some users might want to shut down a VM and restart it immediately,
> > without having to wait. This is especially true if management
> > infrastructure like libvirt is used.
> > 
> > This patch implements a simple trick on Linux to allow qemu to return
> > immediately, with the teardown of the VM being performed
> > asynchronously.
> > 
> > If the new commandline option -async-teardown is used, a new process is
> > spawned from qemu at startup, using the clone syscall, in such way that
> > it will share its address space with qemu.
> > 
> > The new process will then simply wait until qemu terminates, and then it
> > will exit itself.
> > 
> > This allows qemu to terminate quickly, without having to wait for the
> > whole address space to be torn down. The teardown process will exit
> > after qemu, so it will be the last user of the address space, and
> > therefore it will take care of the actual teardown.
> > 
> > The teardown process will share the same cgroups as qemu, so both
> > memory usage and cpu time will be accounted properly.
> > 
> > This feature can already be used with libvirt by adding the following
> > to the XML domain definition:
> > 
> >   <commandline xmlns="http://libvirt.org/schemas/domain/qemu/1.0">
> >   <arg value='-async-teardown'/>
> >   </commandline>
> > 
> > Signed-off-by: Claudio Imbrenda 
> > ---
> >  include/qemu/osdep.h |  2 ++
> >  os-posix.c   |  5 
> >  qemu-options.hx  | 17 ++
> >  util/osdep.c | 55 
> >  4 files changed, 79 insertions(+)  
> 
> 
> > diff --git a/util/osdep.c b/util/osdep.c
> > index 60fcbbaebe..bb0baf97a0 100644
> > --- a/util/osdep.c
> > +++ b/util/osdep.c
> > @@ -23,6 +23,15 @@
> >   */
> >  #include "qemu/osdep.h"
> >  #include "qapi/error.h"
> > +
> > +#ifdef CONFIG_LINUX
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#endif
> > +
> >  #include "qemu/cutils.h"
> >  #include "qemu/sockets.h"
> >  #include "qemu/error-report.h"
> > @@ -512,6 +521,52 @@ const char *qemu_hw_version(void)
> >  return hw_version;
> >  }
> >  
> > +#ifdef __linux__
> > +static int async_teardown_fn(void *arg)
> > +{
> > +sigset_t all_signals;
> > +fd_set r, w, e;
> > +int fd;
> > +
> > +/* open a pidfd descriptor for the parent qemu process */
> > +fd = syscall(__NR_pidfd_open, getppid(), 0);  
> 
> We ought to open this FD in the parent process to avoid a race
> where the parent crashes immediately after clone() and gets
> reparented to 'init' before this child process calls pidfd_open,
> otherwise it'll sit around waiting for init to exit.

sounds good

> 
> > +/* if something went wrong, or if the file descriptor is too big */
> > +if ((fd < 0) || (fd >= FD_SETSIZE)) {
> > +_exit(1);
> > +}
> > +/* zero all fd sets */
> > +FD_ZERO(&r);
> > +FD_ZERO(&w);
> > +FD_ZERO(&e);
> > +/* set the fd for the pidfd in the "read" set */
> > +FD_SET(fd, &r);
> > +/* block all signals */
> > +sigfillset(&all_signals);
> > +sigprocmask(SIG_BLOCK, &all_signals, NULL);  
> 
> Technically this is racy as there's still a window in which the
> child is running when signals are not blocked.

true, I will apply the workaround you suggest below

> 
> > +/* wait for the pid to disappear -> fd will appear as ready for read */
> > +(void) select(fd + 1, &r, &w, &e, NULL);  
> 
> While using pidfd can work, a stronger protection would be to use
> 
>prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
> 
> this guarantees that the kernel will deliver SIGKILL to this
> process immediately when the parent QEMU exits.
> 
> We should probably do both in fact.

makes sense

> 
> > +
> > +/*
> > + * Close all file descriptors that might have been inherited

Re: [PATCH v2 1/1] osdep: asynchronous teardown for shutdown on Linux

2022-08-05 Thread Claudio Imbrenda

On Thu, 4 Aug 2022 17:41:01 +0100
Daniel P. Berrangé  wrote:

> On Thu, Aug 04, 2022 at 04:49:29PM +0200, Claudio Imbrenda wrote:
> > On Thu, 4 Aug 2022 09:29:39 +0100
> > Daniel P. Berrangé  wrote:
> >   
> > > On Wed, Aug 03, 2022 at 06:34:45PM +0100, Daniel P. Berrangé wrote:  
> > > > On Wed, Aug 03, 2022 at 07:31:41PM +0200, Claudio Imbrenda wrote:
> > > > > This patch adds support for asynchronously tearing down a VM on Linux.
> > > > > 
> > > > > When qemu terminates, either naturally or because of a fatal signal,
> > > > > the VM is torn down. If the VM is huge, it can take a considerable
> > > > > amount of time for it to be cleaned up. In case of a protected VM, it
> > > > > might take even longer than a non-protected VM (this is the case on
> > > > > s390x, for example).
> > > > > 
> > > > > Some users might want to shut down a VM and restart it immediately,
> > > > > without having to wait. This is especially true if management
> > > > > infrastructure like libvirt is used.
> > > > > 
> > > > > This patch implements a simple trick on Linux to allow qemu to return
> > > > > immediately, with the teardown of the VM being performed
> > > > > asynchronously.
> > > > > 
> > > > > If the new commandline option -async-teardown is used, a new process 
> > > > > is
> > > > > spawned from qemu at startup, using the clone syscall, in such way 
> > > > > that
> > > > > it will share its address space with qemu.
> > > > > 
> > > > > > The new process will then simply wait until qemu terminates, and then 
> > > > > it
> > > > > will exit itself.
> > > > > 
> > > > > This allows qemu to terminate quickly, without having to wait for the
> > > > > whole address space to be torn down. The teardown process will exit
> > > > > after qemu, so it will be the last user of the address space, and
> > > > > therefore it will take care of the actual teardown.
> > > > > 
> > > > > The teardown process will share the same cgroups as qemu, so both
> > > > > memory usage and cpu time will be accounted properly.
> > > > > 
> > > > > This feature can already be used with libvirt by adding the following
> > > > > to the XML domain definition:
> > > > > 
> > > > > >   <commandline xmlns="http://libvirt.org/schemas/domain/qemu/1.0">
> > > > > >   <arg value='-async-teardown'/>
> > > > > >   </commandline>
> > > > 
> > > > How does this work in practice ?  Libvirt should be blocking until
> > > > all processes in the cgroup have exited, including this cloned
> > > > child process.
> > > 
> > > Also, have you disabled use of seccomp with QEMU when testing this,
> > > as the seccomp filter that libvirt enables is supposed to block
> > > any use of clone() except for the creation of threads.  
> > 
> > it was just a vanilla libvirt 8.0.0 as found on ubuntu 22.04; I have no
> > idea how it is configured by default  
> 
> Ok, so the reason it is working is because the extra process is
> cloned() right in middle of processing argv. This is before the
> seccomp filter is applied to the process, so clone() is not blocked.
> 
> One thing I note about this in practice is that (unsurprisingly)
> if you do a process listing, users now see 2 QEMU processes instead
> of one.
> 
> I wonder if we should consider overwriting argv in the child
> process with "[qemu async teardown]" to give users a hint as to
> why this duplicate process exists.

sounds like a good idea

> 
> With regards,
> Daniel

Re: [PATCH v2 1/1] osdep: asynchronous teardown for shutdown on Linux

2022-08-04 Thread Claudio Imbrenda

On Thu, 4 Aug 2022 09:29:39 +0100
Daniel P. Berrangé  wrote:

> On Wed, Aug 03, 2022 at 06:34:45PM +0100, Daniel P. Berrangé wrote:
> > On Wed, Aug 03, 2022 at 07:31:41PM +0200, Claudio Imbrenda wrote:  
> > > This patch adds support for asynchronously tearing down a VM on Linux.
> > > 
> > > When qemu terminates, either naturally or because of a fatal signal,
> > > the VM is torn down. If the VM is huge, it can take a considerable
> > > amount of time for it to be cleaned up. In case of a protected VM, it
> > > might take even longer than a non-protected VM (this is the case on
> > > s390x, for example).
> > > 
> > > Some users might want to shut down a VM and restart it immediately,
> > > without having to wait. This is especially true if management
> > > infrastructure like libvirt is used.
> > > 
> > > This patch implements a simple trick on Linux to allow qemu to return
> > > immediately, with the teardown of the VM being performed
> > > asynchronously.
> > > 
> > > If the new commandline option -async-teardown is used, a new process is
> > > spawned from qemu at startup, using the clone syscall, in such way that
> > > it will share its address space with qemu.
> > > 
> > > The new process will then simply wait until qemu terminates, and then it
> > > will exit itself.
> > > 
> > > This allows qemu to terminate quickly, without having to wait for the
> > > whole address space to be torn down. The teardown process will exit
> > > after qemu, so it will be the last user of the address space, and
> > > therefore it will take care of the actual teardown.
> > > 
> > > The teardown process will share the same cgroups as qemu, so both
> > > memory usage and cpu time will be accounted properly.
> > > 
> > > This feature can already be used with libvirt by adding the following
> > > to the XML domain definition:
> > > 
> > >   <commandline xmlns="http://libvirt.org/schemas/domain/qemu/1.0">
> > >   <arg value='-async-teardown'/>
> > >   </commandline>
> > 
> > How does this work in practice ?  Libvirt should be blocking until
> > all processes in the cgroup have exited, including this cloned
> > child process.  
> 
> Also, have you disabled use of seccomp with QEMU when testing this,
> as the seccomp filter that libvirt enables is supposed to block
> any use of clone() except for the creation of threads.

it was just a vanilla libvirt 8.0.0 as found on ubuntu 22.04; I have no
idea how it is configured by default

> 
> With regards,
> Daniel

Re: [PATCH v2 1/1] osdep: asynchronous teardown for shutdown on Linux

2022-08-03 Thread Claudio Imbrenda

On Wed, 3 Aug 2022 18:34:45 +0100
Daniel P. Berrangé  wrote:

> On Wed, Aug 03, 2022 at 07:31:41PM +0200, Claudio Imbrenda wrote:
> > This patch adds support for asynchronously tearing down a VM on Linux.
> > 
> > When qemu terminates, either naturally or because of a fatal signal,
> > the VM is torn down. If the VM is huge, it can take a considerable
> > amount of time for it to be cleaned up. In case of a protected VM, it
> > might take even longer than a non-protected VM (this is the case on
> > s390x, for example).
> > 
> > Some users might want to shut down a VM and restart it immediately,
> > without having to wait. This is especially true if management
> > infrastructure like libvirt is used.
> > 
> > This patch implements a simple trick on Linux to allow qemu to return
> > immediately, with the teardown of the VM being performed
> > asynchronously.
> > 
> > If the new commandline option -async-teardown is used, a new process is
> > spawned from qemu at startup, using the clone syscall, in such way that
> > it will share its address space with qemu.
> > 
> > The new process will then simply wait until qemu terminates, and then it
> > will exit itself.
> > 
> > This allows qemu to terminate quickly, without having to wait for the
> > whole address space to be torn down. The teardown process will exit
> > after qemu, so it will be the last user of the address space, and
> > therefore it will take care of the actual teardown.
> > 
> > The teardown process will share the same cgroups as qemu, so both
> > memory usage and cpu time will be accounted properly.
> > 
> > This feature can already be used with libvirt by adding the following
> > to the XML domain definition:
> > 
> >   <commandline xmlns="http://libvirt.org/schemas/domain/qemu/1.0">
> >   <arg value='-async-teardown'/>
> >   </commandline>
> 
> How does this work in practice ?  Libvirt should be blocking until

I don't know the inner details of how libvirt works..

> all processes in the cgroup have exited, including this cloned
> child process.

..but I tested it and it works

my impression is that libvirt by default is only waiting for the
main qemu process.

the only issue I have found is the log file, which stays open as long
as some file descriptors (which the cloned process inherits from the
main qemu process) stay open. A new VM cannot be started if its log file
is still open by the logger process. The close_range() call solves the
issue.

> 
> With regards,
> Daniel

[PATCH v2 1/1] osdep: asynchronous teardown for shutdown on Linux

2022-08-03 Thread Claudio Imbrenda

This patch adds support for asynchronously tearing down a VM on Linux.

When qemu terminates, either naturally or because of a fatal signal,
the VM is torn down. If the VM is huge, it can take a considerable
amount of time for it to be cleaned up. In case of a protected VM, it
might take even longer than a non-protected VM (this is the case on
s390x, for example).

Some users might want to shut down a VM and restart it immediately,
without having to wait. This is especially true if management
infrastructure like libvirt is used.

This patch implements a simple trick on Linux to allow qemu to return
immediately, with the teardown of the VM being performed
asynchronously.

If the new commandline option -async-teardown is used, a new process is
spawned from qemu at startup, using the clone syscall, in such way that
it will share its address space with qemu.

The new process will then simply wait until qemu terminates, and then it
will exit itself.

This allows qemu to terminate quickly, without having to wait for the
whole address space to be torn down. The teardown process will exit
after qemu, so it will be the last user of the address space, and
therefore it will take care of the actual teardown.

The teardown process will share the same cgroups as qemu, so both
memory usage and cpu time will be accounted properly.

This feature can already be used with libvirt by adding the following
to the XML domain definition:

  <commandline xmlns="http://libvirt.org/schemas/domain/qemu/1.0">
  <arg value='-async-teardown'/>
  </commandline>

Signed-off-by: Claudio Imbrenda 
---
 include/qemu/osdep.h |  2 ++
 os-posix.c   |  5 
 qemu-options.hx  | 17 ++
 util/osdep.c | 55 
 4 files changed, 79 insertions(+)

diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index b1c161c035..3154759d79 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -549,6 +549,8 @@ ssize_t qemu_write_full(int fd, const void *buf, size_t 
count)
 
 void qemu_set_cloexec(int fd);
 
+void init_async_teardown(void);
+
 /* Return a dynamically allocated directory path that is appropriate for 
storing
  * local state.
  *
diff --git a/os-posix.c b/os-posix.c
index 321fc4bd13..dd3e42b4c4 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -150,6 +150,11 @@ int os_parse_cmd_args(int index, const char *optarg)
 case QEMU_OPTION_daemonize:
 daemonize = 1;
 break;
+#if defined(CONFIG_LINUX)
+case QEMU_OPTION_asyncteardown:
+init_async_teardown();
+break;
+#endif
 default:
 return -1;
 }
diff --git a/qemu-options.hx b/qemu-options.hx
index 3f23a42fa8..d434353159 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4743,6 +4743,23 @@ HXCOMM Internal use
 DEF("qtest", HAS_ARG, QEMU_OPTION_qtest, "", QEMU_ARCH_ALL)
 DEF("qtest-log", HAS_ARG, QEMU_OPTION_qtest_log, "", QEMU_ARCH_ALL)
 
+#ifdef __linux__
+DEF("async-teardown", 0, QEMU_OPTION_asyncteardown,
+"-async-teardown enable asynchronous teardown\n",
+QEMU_ARCH_ALL)
+#endif
+SRST
+``-async-teardown``
+Enable asynchronous teardown. A new teardown process will be
+created at startup, using clone. The teardown process will share
+the address space of the main qemu process, and wait for the main
+process to terminate. At that point, the teardown process will
+also exit. This allows qemu to terminate quickly if the guest was
+huge, leaving the teardown of the address space to the teardown
+process. Since the teardown process shares the same cgroups as the
+main qemu process, accounting is performed correctly.
+ERST
+
 DEF("msg", HAS_ARG, QEMU_OPTION_msg,
 "-msg [timestamp[=on|off]][,guest-name=[on|off]]\n"
 "control error message format\n"
diff --git a/util/osdep.c b/util/osdep.c
index 60fcbbaebe..bb0baf97a0 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -23,6 +23,15 @@
  */
 #include "qemu/osdep.h"
 #include "qapi/error.h"
+
+#ifdef CONFIG_LINUX
+#include 
+#include 
+#include 
+#include 
+#include 
+#endif
+
 #include "qemu/cutils.h"
 #include "qemu/sockets.h"
 #include "qemu/error-report.h"
@@ -512,6 +521,52 @@ const char *qemu_hw_version(void)
 return hw_version;
 }
 
+#ifdef __linux__
+static int async_teardown_fn(void *arg)
+{
+sigset_t all_signals;
+fd_set r, w, e;
+int fd;
+
+/* open a pidfd descriptor for the parent qemu process */
+fd = syscall(__NR_pidfd_open, getppid(), 0);
+/* if something went wrong, or if the file descriptor is too big */
+if ((fd < 0) || (fd >= FD_SETSIZE)) {
+_exit(1);
+}
+/* zero all fd sets */
+FD_ZERO(&r);
+FD_ZERO(&w);
+FD_ZERO(&e);
+/* set the fd for the pidfd in the "read" set */
+FD_SET(fd, &r);
+/* block all signals */
+sigfillset(&all_signals);
+sigprocmask(SIG_BLOCK, &all_signals, NULL);
+/* wait

Re: [PATCH v1 1/1] osdep: asynchronous teardown for shutdown on Linux

2021-12-06 Thread Claudio Imbrenda

On Mon, 6 Dec 2021 11:47:55 +
Daniel P. Berrangé  wrote:

> On Mon, Dec 06, 2021 at 12:43:12PM +0100, Claudio Imbrenda wrote:
> > On Mon, 6 Dec 2021 11:21:10 +
> > Daniel P. Berrangé  wrote:
> >   
> > > On Mon, Dec 06, 2021 at 12:06:11PM +0100, Claudio Imbrenda wrote:  
> > > > This patch adds support for asynchronously tearing down a VM on Linux.
> > > > 
> > > > When qemu terminates, either naturally or because of a fatal signal,
> > > > the VM is torn down. If the VM is huge, it can take a considerable
> > > > amount of time for it to be cleaned up. In case of a protected VM, it
> > > > might take even longer than a non-protected VM (this is the case on
> > > > s390x, for example).
> > > > 
> > > > Some users might want to shut down a VM and restart it immediately,
> > > > without having to wait. This is especially true if management
> > > > infrastructure like libvirt is used.
> > > > 
> > > > This patch implements a simple trick on Linux to allow qemu to return
> > > > immediately, with the teardown of the VM being performed
> > > > asynchronously.
> > > > 
> > > > If the new commandline option -async-teardown is used, a new process is
> > > > spawned from qemu using the clone syscall, so that it will share its
> > > > address space with qemu.
> > > > 
> > > > The new process will then wait until qemu terminates, and then it will
> > > > exit itself.
> > > > 
> > > > This allows qemu to terminate quickly, without having to wait for the
> > > > whole address space to be torn down. The teardown process will exit
> > > > after qemu, so it will be the last user of the address space, and
> > > > therefore it will take care of the actual teardown.
> > > > 
> > > > The teardown process will share the same cgroups as qemu, so both
> > > > memory usage and cpu time will be accounted properly.
> > > 
> > > If this suggested workaround has any benefit to the shutdown of a VM
> > > with libvirt, then it is a bug in libvirt IMHO.
> > > 
> > > When libvirt tears down a QEMU VM, it should be waiting for *every*
> > > process in the VM's cgroup to be terminated before it reports that
> > > the VM is shutoff. IOW, the fact that this workaround lets the main
> > > QEMU process exit quickly should not matter. libvirt should still
> > > be blocked in exactly the same place in its code, waiting for the
> > > "async" cleanup process to exit. IOW, this should not be async at
> > > all from libvirt's POV.  
> > 
> > interesting, I did not know that about libvirt.
> > 
> > maybe libvirt could be fixed/improved to allow this patch to work?  
> 
> That would not be desirable. When libvirt reports a VM as shutoff,
> it is expected that all resources associated with the VM have been
> fully released, such that they are available for launching a new
> VM.  We can't allow resources to be asynchronously released as that
> violates app's expectation that the resources are released once the
> VM is shutoff.

what about people who do not use libvirt? should those also be
prevented from taking advantage of this feature only because libvirt
can't use it?

> 
> > surely without this patch an asynchronous teardown will not be possible
> > at all  
> 
> I appreciate that the current slow teardown is a pain, but async
> teardown does not sound like an appealing alternative given that
> the app can't use the resources again until the teardown is
> complete.

when a VM starts, it will not use all of the memory at once. it will
start using it a little at a time. time during which the asynchronous
process can complete the teardown.

consider what would happen if you need to shut down and restart a big
VM (>10TB)

> 
> Regards,
> Daniel

Re: [PATCH v1 1/1] osdep: asynchronous teardown for shutdown on Linux

2021-12-06 Thread Claudio Imbrenda

On Mon, 6 Dec 2021 11:21:10 +
Daniel P. Berrangé  wrote:

> On Mon, Dec 06, 2021 at 12:06:11PM +0100, Claudio Imbrenda wrote:
> > This patch adds support for asynchronously tearing down a VM on Linux.
> > 
> > When qemu terminates, either naturally or because of a fatal signal,
> > the VM is torn down. If the VM is huge, it can take a considerable
> > amount of time for it to be cleaned up. In case of a protected VM, it
> > might take even longer than a non-protected VM (this is the case on
> > s390x, for example).
> > 
> > Some users might want to shut down a VM and restart it immediately,
> > without having to wait. This is especially true if management
> > infrastructure like libvirt is used.
> > 
> > This patch implements a simple trick on Linux to allow qemu to return
> > immediately, with the teardown of the VM being performed
> > asynchronously.
> > 
> > If the new commandline option -async-teardown is used, a new process is
> > spawned from qemu using the clone syscall, so that it will share its
> > address space with qemu.
> > 
> > The new process will then wait until qemu terminates, and then it will
> > exit itself.
> > 
> > This allows qemu to terminate quickly, without having to wait for the
> > whole address space to be torn down. The teardown process will exit
> > after qemu, so it will be the last user of the address space, and
> > therefore it will take care of the actual teardown.
> > 
> > The teardown process will share the same cgroups as qemu, so both
> > memory usage and cpu time will be accounted properly.  
> 
> If this suggested workaround has any benefit to the shutdown of a VM
> with libvirt, then it is a bug in libvirt IMHO.
> 
> When libvirt tears down a QEMU VM, it should be waiting for *every*
> process in the VM's cgroup to be terminated before it reports that
> the VM is shutoff. IOW, the fact that this workaround lets the main
> QEMU process exit quickly should not matter. libvirt should still
> be blocked in exactly the same place in its code, waiting for the
> "async" cleanup process to exit. IOW, this should not be async at
> all from libvirt's POV.

interesting, I did not know that about libvirt.

maybe libvirt could be fixed/improved to allow this patch to work?

surely without this patch an asynchronous teardown will not be possible
at all

> 
> 
> Regards,
> Daniel

[PATCH v1 1/1] osdep: asynchronous teardown for shutdown on Linux

2021-12-06 Thread Claudio Imbrenda

This patch adds support for asynchronously tearing down a VM on Linux.

When qemu terminates, either naturally or because of a fatal signal,
the VM is torn down. If the VM is huge, it can take a considerable
amount of time for it to be cleaned up. In case of a protected VM, it
might take even longer than a non-protected VM (this is the case on
s390x, for example).

Some users might want to shut down a VM and restart it immediately,
without having to wait. This is especially true if management
infrastructure like libvirt is used.

This patch implements a simple trick on Linux to allow qemu to return
immediately, with the teardown of the VM being performed
asynchronously.

If the new commandline option -async-teardown is used, a new process is
spawned from qemu using the clone syscall, so that it will share its
address space with qemu.

The new process will then wait until qemu terminates, and then it will
exit itself.

This allows qemu to terminate quickly, without having to wait for the
whole address space to be torn down. The teardown process will exit
after qemu, so it will be the last user of the address space, and
therefore it will take care of the actual teardown.

The teardown process will share the same cgroups as qemu, so both
memory usage and cpu time will be accounted properly.

Signed-off-by: Claudio Imbrenda 
---
 include/qemu/osdep.h |  2 ++
 os-posix.c   |  3 +++
 qemu-options.hx  | 17 
 util/osdep.c | 47 
 4 files changed, 69 insertions(+)

diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 60718fc342..f5493c9489 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -657,6 +657,8 @@ const char *qemu_hw_version(void);
 void fips_set_state(bool requested);
 bool fips_get_state(void);
 
+void init_async_teardown(void);
+
 /* Return a dynamically allocated pathname denoting a file or directory that is
  * appropriate for storing local state.
  *
diff --git a/os-posix.c b/os-posix.c
index ae6c9f2a5e..d37a654e2c 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -158,6 +158,9 @@ int os_parse_cmd_args(int index, const char *optarg)
 "to enable FIPS compliance");
 fips_set_state(true);
 break;
+case QEMU_OPTION_asyncteardown:
+init_async_teardown();
+break;
 #endif
 default:
 return -1;
diff --git a/qemu-options.hx b/qemu-options.hx
index ae2c6dbbfc..b4ce828726 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4600,6 +4600,23 @@ SRST
 Enable FIPS 140-2 compliance mode.
 ERST
 
+#ifdef __linux__
+DEF("async-teardown", 0, QEMU_OPTION_asyncteardown,
+"-async-teardown enable asynchronous teardown\n",
+QEMU_ARCH_ALL)
+#endif
+SRST
+``-async-teardown``
+Enable asynchronous teardown. A new teardown process will be
+created at startup, using clone. The teardown process will share
+the address space of the main qemu process, and wait for the main
+process to terminate. At that point, the teardown process will
+also exit. This allows qemu to terminate quickly if the guest was
+huge, leaving the teardown of the address space to the teardown
+process. Since the teardown process shares the same cgroups as the
+main qemu process, accounting is performed correctly.
+ERST
+
 DEF("msg", HAS_ARG, QEMU_OPTION_msg,
 "-msg [timestamp[=on|off]][,guest-name=[on|off]]\n"
 "control error message format\n"
diff --git a/util/osdep.c b/util/osdep.c
index 42a0a4986a..f36be51262 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -33,6 +33,14 @@
 extern int madvise(char *, size_t, int);
 #endif
 
+#ifdef CONFIG_LINUX
+#include 
+#include 
+#include 
+#include 
+#include 
+#endif
+
 #include "qemu-common.h"
 #include "qemu/cutils.h"
 #include "qemu/sockets.h"
@@ -548,6 +556,45 @@ bool fips_get_state(void)
 return fips_enabled;
 }
 
+#ifdef __linux__
+static int async_teardown_fn(void *arg)
+{
+sigset_t all_signals;
+fd_set r, w, e;
+int fd;
+
+/* open a pidfd descriptor for the parent qemu process */
+fd = syscall(__NR_pidfd_open, getppid(), 0);
+/* if something went wrong, or if the file descriptor is too big */
+if ((fd < 0) || (fd >= FD_SETSIZE)) {
+_exit(1);
+}
+/* zero all fd sets */
+FD_ZERO(&r);
+FD_ZERO(&w);
+FD_ZERO(&e);
+/* set the fd for the pidfd in the "read" set */
+FD_SET(fd, &r);
+/* block all signals */
+sigfillset(&all_signals);
+sigprocmask(SIG_BLOCK, &all_signals, NULL);
+/* wait for the pid to disappear -> fd will appear as ready for read */
+(void) select(fd + 1, &r, &w, &e, NULL);
+_exit(0);
+}
+
+void init_async_teardown(void)
+{
+const int size = 8192; /* should be more than enough */
+char *stack = malloc(size);
+
+/* start a new process sharing the address space with qemu */
+

Re: [PATCH v1] s390x/tcg: fix and optimize SPX (SET PREFIX)

2021-08-05 Thread Claudio Imbrenda

On Thu,  5 Aug 2021 14:59:38 +0200
David Hildenbrand  wrote:

> We not only invalidate the translation of the range 0x0-0x2000, we
> also invalidate the translation of the new prefix range and the
> translation of the old prefix range -- because real2abs would return
> different results for all of these ranges when changing the prefix
> location.

looks like a good idea

> This fixes the kvm-unit-tests "edat" test that just hangs before this
> patch because we end up clearing the new prefix area instead of the
> old prefix area.
> 
> While at it, let's not do anything in case the prefix doesn't change.

also looks like a good idea

Reviewed-by: Claudio Imbrenda 
 
> Cc: Richard Henderson 
> Cc: David Hildenbrand 
> Cc: Cornelia Huck 
> Cc: Thomas Huth 
> Cc: Claudio Imbrenda 
> Cc: qemu-s3...@nongnu.org
> Signed-off-by: David Hildenbrand 
> ---
>  target/s390x/tcg/misc_helper.c | 15 ++-
>  1 file changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/target/s390x/tcg/misc_helper.c
> b/target/s390x/tcg/misc_helper.c index 33e6999e15..aab9c47747 100644
> --- a/target/s390x/tcg/misc_helper.c
> +++ b/target/s390x/tcg/misc_helper.c
> @@ -151,13 +151,26 @@ void HELPER(diag)(CPUS390XState *env, uint32_t
> r1, uint32_t r3, uint32_t num) /* Set Prefix */
>  void HELPER(spx)(CPUS390XState *env, uint64_t a1)
>  {
> +const uint32_t prefix = a1 & 0x7fffe000;
> +const uint32_t old_prefix = env->psa;
>  CPUState *cs = env_cpu(env);
> -uint32_t prefix = a1 & 0x7fffe000;
> +
> +if (prefix == old_prefix) {
> +return;
> +}
>  
>  env->psa = prefix;
>  HELPER_LOG("prefix: %#x\n", prefix);
>  tlb_flush_page(cs, 0);
>  tlb_flush_page(cs, TARGET_PAGE_SIZE);
> +if (prefix != 0) {
> +tlb_flush_page(cs, prefix);
> +tlb_flush_page(cs, prefix + TARGET_PAGE_SIZE);
> +}
> +if (old_prefix != 0) {
> +tlb_flush_page(cs, old_prefix);
> +tlb_flush_page(cs, old_prefix + TARGET_PAGE_SIZE);
> +}
>  }
>  
>  static void update_ckc_timer(CPUS390XState *env)

Re: [PATCH v6 0/8] s390: Extended-Length SCCB & DIAGNOSE 0x318

2020-09-25 Thread Claudio Imbrenda

On Fri, 25 Sep 2020 17:18:55 +0200
Cornelia Huck  wrote:

> On Fri, 25 Sep 2020 11:13:49 -0400
> Collin Walling  wrote:
> 
> > On 9/16/20 1:15 PM, Collin Walling wrote:  
> > > On 9/16/20 11:53 AM, Cornelia Huck wrote:
> > > 
> > > [...]
> > > 
> > >>>
> > >>
> > >> Thanks, applied.
> > >>
> > >>
> > > 
> > > Thanks Conny.
> > > 
> > > Much appreciated for everyone's patience and review. The only
> > > thing I'd like to hold out on for now is for someone to take a
> > > peek at patch #3 with respect to the protected virtualization
> > > stuff. I don't know too much about it, honestly, and I want to
> > > ensure that dynamically allocating memory for the SCCB makes
> > > sense there. The alternative would be to allocate a static 4K for
> > > the work_sccb. 
> > 
> > I had someone take a look at the patch for PV and was told
> > everything looks sane. Since the patches have already been applied,
> > it seems like it's too late to add a reviewed-by from someone?  
> 
> Have the reviewer reply with their R-b, and I'll happily add it, as I
> rebase s390-next before doing a pull req anyway :)

well it was me :)

you can add a 

Reviewed-by: Claudio Imbrenda 

for the first 6 patches, and an

Acked-by: Claudio Imbrenda 

for the last one


thanks!

Re: [PATCH for-5.2 3/6] pc-bios/s390-ccw: Move the inner logic of find_subch() to a separate function

2020-08-04 Thread Claudio Imbrenda

On Tue, 4 Aug 2020 15:24:09 +0200
Thomas Huth  wrote:

> On 03/08/2020 10.46, Claudio Imbrenda wrote:
> > On Tue, 28 Jul 2020 20:37:31 +0200
> > Thomas Huth  wrote:
> >   
> >> Move the code to a separate function to be able to re-use it from a
> >> different spot later.
> >>
> >> Signed-off-by: Thomas Huth 
> >> ---
> >>  pc-bios/s390-ccw/main.c | 99
> >> - 1 file changed, 57
> >> insertions(+), 42 deletions(-)
> >>
> >> diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c
> >> index 9b64eb0c24..9477313188 100644
> >> --- a/pc-bios/s390-ccw/main.c
> >> +++ b/pc-bios/s390-ccw/main.c
> >> @@ -51,6 +51,60 @@ unsigned int get_loadparm_index(void)
> >>  return atoui(loadparm_str);
> >>  }
> >>  
> >> +static int check_sch_no(int dev_no, int sch_no)
> >> +{
> >> +bool is_virtio;
> >> +Schib schib;
> >> +int r;
> >> +
> >> +blk_schid.sch_no = sch_no;
> >> +r = stsch_err(blk_schid, &schib);
> >> +if (r == 3 || r == -EIO) {
> >> +return -EIO;
> >> +}
> >> +if (!schib.pmcw.dnv) {
> >> +return false;
> >> +}
> >> +
> >> +enable_subchannel(blk_schid);
> >> +cutype = cu_type(blk_schid);
> >> +
> >> +/*
> >> + * Note: we always have to run virtio_is_supported() here to
> >> make
> >> + * sure that the vdev.senseid data gets pre-initialized
> >> correctly
> >> + */
> >> +is_virtio = virtio_is_supported(blk_schid);
> >> +
> >> +/* No specific devno given, just return 1st possibly bootable
> >> device */
> >> +if (dev_no < 0) {
> >> +switch (cutype) {
> >> +case CU_TYPE_VIRTIO:
> >> +if (is_virtio) {
> >> +/*
> >> + * Skip net devices since no IPLB is created and
> >> therefore
> >> + * no network bootloader has been loaded
> >> +     */
> >> +if (virtio_get_device_type() != VIRTIO_ID_NET) {
> >> +return true;
> >> +}  
> > 
> > here it seems you are returning true for any non-network virtio
> > device, is this the intended behaviour? (I know it was like this in
> > the old code) like, non-block devices?  
> 
> Yes. Other devices are already ignored by the virtio_is_supported()
> call some lines earlier in this function.

ah, that makes sense


Reviewed-by: Claudio Imbrenda

Re: [PATCH for-5.2 3/6] pc-bios/s390-ccw: Move the inner logic of find_subch() to a separate function

2020-08-04 Thread Claudio Imbrenda

On Tue, 28 Jul 2020 20:37:31 +0200
Thomas Huth  wrote:

> Move the code to a separate function to be able to re-use it from a
> different spot later.
> 
> Signed-off-by: Thomas Huth 
> ---
>  pc-bios/s390-ccw/main.c | 99
> - 1 file changed, 57
> insertions(+), 42 deletions(-)
> 
> diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c
> index 9b64eb0c24..9477313188 100644
> --- a/pc-bios/s390-ccw/main.c
> +++ b/pc-bios/s390-ccw/main.c
> @@ -51,6 +51,60 @@ unsigned int get_loadparm_index(void)
>  return atoui(loadparm_str);
>  }
>  
> +static int check_sch_no(int dev_no, int sch_no)
> +{
> +bool is_virtio;
> +Schib schib;
> +int r;
> +
> +blk_schid.sch_no = sch_no;
> +r = stsch_err(blk_schid, &schib);
> +if (r == 3 || r == -EIO) {
> +return -EIO;
> +}
> +if (!schib.pmcw.dnv) {
> +return false;
> +}
> +
> +enable_subchannel(blk_schid);
> +cutype = cu_type(blk_schid);
> +
> +/*
> + * Note: we always have to run virtio_is_supported() here to make
> + * sure that the vdev.senseid data gets pre-initialized correctly
> + */
> +is_virtio = virtio_is_supported(blk_schid);
> +
> +/* No specific devno given, just return 1st possibly bootable
> device */
> +if (dev_no < 0) {
> +switch (cutype) {
> +case CU_TYPE_VIRTIO:
> +if (is_virtio) {
> +/*
> + * Skip net devices since no IPLB is created and
> therefore
> + * no network bootloader has been loaded
> + */
> +if (virtio_get_device_type() != VIRTIO_ID_NET) {
> +return true;
> +}

here it seems you are returning true for any non-network virtio device,
is this the intended behaviour? (I know it was like this in the old
code)

like, non-block devices?

> +}
> +return false;
> +case CU_TYPE_DASD_3990:
> +case CU_TYPE_DASD_2107:
> +return true;
> +default:
> +return false;
> +}
> +}
> +
> +/* Caller asked for a specific devno */
> +if (schib.pmcw.dev == dev_no) {
> +return true;
> +}
> +
> +return false;
> +}
> +
>  /*
>   * Find the subchannel connected to the given device (dev_no) and
> fill in the
>   * subchannel information block (schib) with the connected
> subchannel's info. @@ -62,53 +116,14 @@ unsigned int
> get_loadparm_index(void) */
>  static bool find_subch(int dev_no)
>  {
> -Schib schib;
>  int i, r;
> -bool is_virtio;
>  
>  for (i = 0; i < 0x10000; i++) {
> -blk_schid.sch_no = i;
> -r = stsch_err(blk_schid, &schib);
> -if ((r == 3) || (r == -EIO)) {
> +r = check_sch_no(dev_no, i);
> +if (r < 0) {
>  break;
>  }
> -if (!schib.pmcw.dnv) {
> -continue;
> -}
> -
> -enable_subchannel(blk_schid);
> -cutype = cu_type(blk_schid);
> -
> -/*
> - * Note: we always have to run virtio_is_supported() here to
> make
> - * sure that the vdev.senseid data gets pre-initialized
> correctly
> - */
> -is_virtio = virtio_is_supported(blk_schid);
> -
> -/* No specific devno given, just return 1st possibly
> bootable device */
> -if (dev_no < 0) {
> -switch (cutype) {
> -case CU_TYPE_VIRTIO:
> -if (is_virtio) {
> -/*
> - * Skip net devices since no IPLB is created and
> therefore
> - * no network bootloader has been loaded
> - */
> -if (virtio_get_device_type() != VIRTIO_ID_NET) {
> -return true;
> -}
> -}
> -continue;
> -case CU_TYPE_DASD_3990:
> -case CU_TYPE_DASD_2107:
> -return true;
> -default:
> -continue;
> -}
> -}
> -
> -/* Caller asked for a specific devno */
> -if (schib.pmcw.dev == dev_no) {
> +if (r == true) {
>  return true;
>  }
>  }

Re: [PATCH for-5.2 5/6] pc-bios/s390-ccw: Scan through all boot devices if none has been specified

2020-08-04 Thread Claudio Imbrenda

On Tue, 28 Jul 2020 20:37:33 +0200
Thomas Huth  wrote:

> If no boot device has been specified (via "bootindex=..."), the
> s390-ccw bios scans through all devices to find a bootable device.

maybe a better title for the patch is "scan through all devices if no
boot device specified" then, since it seems we will scan all
devices, not just "boot" devices?

> But so far, it stops at the very first block device (including
> virtio-scsi controllers without attached devices) that it finds, no
> matter whether it is bootable or not. That leads to some weird
> situation where it is e.g. possible to boot via:
> 
>  qemu-system-s390x -hda /path/to/disk.qcow2
> 
> but not if there is e.g. a virtio-scsi controller specified before:
> 
>  qemu-system-s390x -device virtio-scsi -hda /path/to/disk.qcow2
> 
> While using "bootindex=..." is clearly the preferred way of booting
> on s390x, we still can make the life for the users at least a little
> bit easier if we look at all available devices to find a bootable one.
> 
> Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1846975
> Signed-off-by: Thomas Huth 
> ---
>  pc-bios/s390-ccw/main.c | 46
> +++-- 1 file changed, 31
> insertions(+), 15 deletions(-)
> 
> diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c
> index 3cd01cd80f..0af872f9e3 100644
> --- a/pc-bios/s390-ccw/main.c
> +++ b/pc-bios/s390-ccw/main.c
> @@ -182,20 +182,8 @@ static void boot_setup(void)
>  static void find_boot_device(void)
>  {
>  VDev *vdev = virtio_get_device();
> -int ssid;
>  bool found;
>  
> -if (!have_iplb) {
> -for (ssid = 0; ssid < 0x3; ssid++) {
> -blk_schid.ssid = ssid;
> -found = find_subch(-1);
> -if (found) {
> -return;
> -}
> -}
> -panic("Could not find a suitable boot device (none
> specified)\n");
> -}
> -
>  switch (iplb.pbt) {
>  case S390_IPL_TYPE_CCW:
>  debug_print_int("device no. ", iplb.ccw.devno);
> @@ -260,14 +248,42 @@ static void ipl_boot_device(void)
>  }
>  }
>  
> +/*
> + * No boot device has been specified, so we have to scan through the
> + * channels to find one.
> + */
> +static void probe_boot_device(void)
> +{
> +int ssid, sch_no, ret;
> +
> +for (ssid = 0; ssid < 0x3; ssid++) {
> +blk_schid.ssid = ssid;
> +for (sch_no = 0; sch_no < 0x10000; sch_no++) {
> +ret = check_sch_no(-1, sch_no);
> +if (ret < 0) {
> +break;
> +}
> +if (ret == true) {
> +ipl_boot_device();  /* Only returns if
> unsuccessful */
> +}
> +}
> +}
> +
> +sclp_print("Could not find a suitable boot device (none
> specified)\n"); +}
> +
>  int main(void)
>  {
>  sclp_setup();
>  css_setup();
>  boot_setup();
> -find_boot_device();
> -enable_subchannel(blk_schid);
> -ipl_boot_device();
> +if (have_iplb) {
> +find_boot_device();
> +enable_subchannel(blk_schid);
> +ipl_boot_device();
> +} else {
> +probe_boot_device();
> +}
>  
>  panic("Failed to load OS from hard disk\n");
>  return 0; /* make compiler happy */

Re: [PATCH for-5.2 2/6] pc-bios/s390-ccw: Move ipl-related code from main() into a separate function

2020-07-29 Thread Claudio Imbrenda

On Tue, 28 Jul 2020 20:37:30 +0200
Thomas Huth  wrote:

> Let's move this part of the code into a separate function to be able
> to use it from multiple spots later.
> 
> Signed-off-by: Thomas Huth 
> ---
>  pc-bios/s390-ccw/main.c | 20 
>  1 file changed, 12 insertions(+), 8 deletions(-)
> 
> diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c
> index 146a50760b..9b64eb0c24 100644
> --- a/pc-bios/s390-ccw/main.c
> +++ b/pc-bios/s390-ccw/main.c
> @@ -223,14 +223,8 @@ static void virtio_setup(void)
>  }
>  }
>  
> -int main(void)
> +static void ipl_boot_device(void)
>  {
> -sclp_setup();
> -css_setup();
> -boot_setup();
> -find_boot_device();
> -enable_subchannel(blk_schid);
> -
>  switch (cutype) {
>  case CU_TYPE_DASD_3990:
>  case CU_TYPE_DASD_2107:
> @@ -242,8 +236,18 @@ int main(void)
>  break;
>  default:
>  print_int("Attempting to boot from unexpected device type",
> cutype);
> -panic("");
> +panic("\nBoot failed.\n");
>  }
> +}
> +
> +int main(void)
> +{
> +sclp_setup();
> +css_setup();
> +boot_setup();
> +find_boot_device();
> +enable_subchannel(blk_schid);
> +ipl_boot_device();
>  
>  panic("Failed to load OS from hard disk\n");
>  return 0; /* make compiler happy */

Reviewed-by: Claudio Imbrenda

Re: [PATCH for-5.2 1/6] pc-bios/s390-ccw/Makefile: Compile with -std=gnu99, -fwrapv and -fno-common

2020-07-29 Thread Claudio Imbrenda

On Tue, 28 Jul 2020 20:37:29 +0200
Thomas Huth  wrote:

> The main QEMU code is compiled with -std=gnu99, -fwrapv and
> -fno-common. We should use the same flags for the s390-ccw bios, too,
> to avoid that we get different behavior with different compiler
> versions that changed their default settings in the course of time
> (it happened at least with -std=... and -fno-common in the past
> already).
> 
> While we're at it, also group the other flags here in a little bit
> nicer fashion: Move the two "-m" flags out of the "-f" area and
> specify them on a separate line.
> 
> Signed-off-by: Thomas Huth 
> ---
>  pc-bios/s390-ccw/Makefile | 7 ---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/pc-bios/s390-ccw/Makefile b/pc-bios/s390-ccw/Makefile
> index 50bc880272..9abb0ea4c0 100644
> --- a/pc-bios/s390-ccw/Makefile
> +++ b/pc-bios/s390-ccw/Makefile
> @@ -13,10 +13,11 @@ OBJECTS = start.o main.o bootmap.o jump2ipl.o
> sclp.o menu.o \ virtio.o virtio-scsi.o virtio-blkdev.o libc.o cio.o
> dasd-ipl.o 
>  QEMU_CFLAGS := $(filter -W%, $(QEMU_CFLAGS))
> -QEMU_CFLAGS += -ffreestanding -fno-delete-null-pointer-checks
> -msoft-float -QEMU_CFLAGS += -march=z900 -fPIE -fno-strict-aliasing
> -QEMU_CFLAGS += -fno-asynchronous-unwind-tables
> +QEMU_CFLAGS += -ffreestanding -fno-delete-null-pointer-checks
> -fno-common -fPIE +QEMU_CFLAGS += -fwrapv -fno-strict-aliasing
> -fno-asynchronous-unwind-tables QEMU_CFLAGS += $(call cc-option,
> $(QEMU_CFLAGS), -fno-stack-protector) +QEMU_CFLAGS += -msoft-float
> -march=z900 +QEMU_CFLAGS += -std=gnu99
>  LDFLAGS += -Wl,-pie -nostdlib
>  
>  build-all: s390-ccw.img s390-netboot.img

Reviewed-by: Claudio Imbrenda

Re: [PATCH RFCv2 3/6] s390x/diag: implement diag260

2020-07-14 Thread Claudio Imbrenda

On Fri, 10 Jul 2020 17:12:36 +0200
David Hildenbrand  wrote:

> Let's implement diag260 - "Access Certain Virtual Machine
> Information", used under z/VM to expose the storage configuration
> (especially, layout of storage extends and thereby holes). For now,
> the returned information is completely redundant to the information
> exposed via SCLP.
> 
> We want to reuse diag260 in QEMU to implement memory devices - to
> have a mechanism to indicate to the guest OS that the initial ram
> size and the maximum possible physical address differ.
> 
> The Linux kernel supports diag260 (0x10) to query the available memory
> since v4.20. Ancient Linux versions used diag 260 (0xc), but stopped
> doing so a while ago.
> 
> Let's unconditionally implement the new diag, without any migration
> checks (e.g., compatibility machine, CPU model). Although a guest OS
> could observe this when migrating between QEMU versions, it's somewhat
> unlikely to ever trigger due to the way diag260 is used within a guest
> OS - called only once or twice during boot.
> 
> Signed-off-by: David Hildenbrand 
> ---
>  target/s390x/diag.c| 51
> ++ target/s390x/internal.h|
> 2 ++ target/s390x/kvm.c | 11 
>  target/s390x/misc_helper.c |  6 +
>  target/s390x/translate.c   |  7 ++
>  5 files changed, 77 insertions(+)
> 
> diff --git a/target/s390x/diag.c b/target/s390x/diag.c
> index be70aecd72..5378fcf582 100644
> --- a/target/s390x/diag.c
> +++ b/target/s390x/diag.c
> @@ -23,6 +23,57 @@
>  #include "hw/s390x/pv.h"
>  #include "kvm_s390x.h"
>  
> +void handle_diag_260(CPUS390XState *env, uint64_t r1, uint64_t r3,
> uintptr_t ra) +{
> +MachineState *ms = MACHINE(qdev_get_machine());
> +const ram_addr_t initial_ram_size = ms->ram_size;
> +const uint64_t subcode = env->regs[r3];
> +
> +switch (subcode) {
> +case 0xc:
> +/* The first storage extent maps to our initial ram. */
> +env->regs[r1] = initial_ram_size - 1;
> +/* The highest addressable byte maps to the initial ram size
> for now. */
> +env->regs[r3] = initial_ram_size - 1;
> +break;
> +case 0x10: {
> +ram_addr_t addr, length;
> +uint64_t tmp;
> +
> +if (r1 & 1) {
> +s390_program_interrupt(env, PGM_SPECIFICATION, ra);
> +return;
> +}
> +
> +addr = env->regs[r1];
> +length = env->regs[r1 + 1];
> +if (!QEMU_IS_ALIGNED(addr, 16) || !QEMU_IS_ALIGNED(length,
> 16) ||
> +!length) {
> +s390_program_interrupt(env, PGM_SPECIFICATION, ra);
> +return;
> +}
> +if (!address_space_access_valid(&address_space_memory, addr,
> length,
> +true,
> MEMTXATTRS_UNSPECIFIED)) {
> +s390_program_interrupt(env, PGM_ADDRESSING, ra);
> +return;
> +}
> +
> +/* Indicate our initial memory ([0 .. ram_size - 1]) */
> +tmp = cpu_to_be64(0);
> +cpu_physical_memory_write(addr, &tmp, sizeof(tmp));
> +tmp = cpu_to_be64(initial_ram_size - 1);
> +cpu_physical_memory_write(addr + sizeof(tmp), &tmp,
> sizeof(tmp)); +
> +/* Exactly one entry was stored, it always fits into the
> area. */

maybe I missed something, but I have the impression that your
implementation of DIAG 260 always only returns the first extent?

shouldn't it return all the hotplugged areas once hotplugging is
enabled?

> +env->regs[r3] = 1;
> +setcc(env_archcpu(env), 0);
> +break;
> +}
> +default:
> +s390_program_interrupt(env, PGM_SPECIFICATION, ra);
> +}
> +}
> +
>  int handle_diag_288(CPUS390XState *env, uint64_t r1, uint64_t r3)
>  {
>  uint64_t func = env->regs[r1];
> diff --git a/target/s390x/internal.h b/target/s390x/internal.h
> index b1e0ebf67f..a7a3df9a3b 100644
> --- a/target/s390x/internal.h
> +++ b/target/s390x/internal.h
> @@ -372,6 +372,8 @@ int mmu_translate_real(CPUS390XState *env,
> target_ulong raddr, int rw, 
>  
>  /* misc_helper.c */
> +void handle_diag_260(CPUS390XState *env, uint64_t r1, uint64_t r3,
> + uintptr_t ra);
>  int handle_diag_288(CPUS390XState *env, uint64_t r1, uint64_t r3);
>  void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3,
>   uintptr_t ra);
> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
> index f2f75d2a57..d6de3ad86c 100644
> --- a/target/s390x/kvm.c
> +++ b/target/s390x/kvm.c
> @@ -1565,6 +1565,14 @@ static int handle_hypercall(S390CPU *cpu,
> struct kvm_run *run) return ret;
>  }
>  
> +static void kvm_handle_diag_260(S390CPU *cpu, struct kvm_run *run)
> +{
> +const uint64_t r1 = (run->s390_sieic.ipa & 0x00f0) >> 4;
> +const uint64_t r3 = run->s390_sieic.ipa & 0x000f;
> +
> +handle_diag_260(&cpu->env, r1, r3, 0);
> +}
> +
>  static void kvm_handle_diag_288(S390CPU *cpu, struct kvm_run *run)
>  {
>  uint64_t r1, r3;
> @@ -1614,6

Re: [PATCH v2 1/1] virtio-ccw: auto-manage VIRTIO_F_IOMMU_PLATFORM if PV

2020-06-09 Thread Claudio Imbrenda

On Tue, 9 Jun 2020 11:41:30 +0200
Halil Pasic  wrote:

[...]

> I don't know. Janosch could answer that, but he is on vacation. Adding
> Claudio maybe he can answer. My understanding is, that while it might
> be possible, it is ugly at best. The ability to do a transition is
> indicated by a CPU model feature. Indicating the feature to the guest
> and then failing the transition sounds wrong to me.

I agree. If the feature is advertised, then it has to work. I don't
think we even have an architected way to fail the transition for that
reason.

What __could__ be done is to prevent qemu from even starting if an
incompatible device is specified together with PV.

Another option is to disable PV at the qemu level if an incompatible
device is present. This will have the effect that trying to boot a
secure guest will fail mysteriously, which is IMHO also not too great.

Do we really have that many incompatible devices?

Re: [PATCH v10 01/16] s390x: Move diagnose 308 subcodes and rcs into ipl.h

2020-03-19 Thread Claudio Imbrenda

On Wed, 18 Mar 2020 10:30:32 -0400
Janosch Frank  wrote:

> They are part of the IPL process, so let's put them into the ipl
> header.
> 
> Signed-off-by: Janosch Frank 


Reviewed-by: Claudio Imbrenda 


> ---
>  hw/s390x/ipl.h  | 11 +++
>  target/s390x/diag.c | 11 ---
>  2 files changed, 11 insertions(+), 11 deletions(-)
> 
> diff --git a/hw/s390x/ipl.h b/hw/s390x/ipl.h
> index 3e44abe1c651d8a0..a5665e6bfde2e8cf 100644
> --- a/hw/s390x/ipl.h
> +++ b/hw/s390x/ipl.h
> @@ -159,6 +159,17 @@ struct S390IPLState {
>  typedef struct S390IPLState S390IPLState;
>  QEMU_BUILD_BUG_MSG(offsetof(S390IPLState, iplb) & 3, "alignment of
> iplb wrong"); 
> +#define DIAG_308_RC_OK  0x0001
> +#define DIAG_308_RC_NO_CONF 0x0102
> +#define DIAG_308_RC_INVALID 0x0402
> +
> +#define DIAG308_RESET_MOD_CLR   0
> +#define DIAG308_RESET_LOAD_NORM 1
> +#define DIAG308_LOAD_CLEAR  3
> +#define DIAG308_LOAD_NORMAL_DUMP4
> +#define DIAG308_SET 5
> +#define DIAG308_STORE   6
> +
>  #define S390_IPL_TYPE_FCP 0x00
>  #define S390_IPL_TYPE_CCW 0x02
>  #define S390_IPL_TYPE_QEMU_SCSI 0xff
> diff --git a/target/s390x/diag.c b/target/s390x/diag.c
> index 54e5670b3fd6d960..8aba6341f94848e1 100644
> --- a/target/s390x/diag.c
> +++ b/target/s390x/diag.c
> @@ -49,17 +49,6 @@ int handle_diag_288(CPUS390XState *env, uint64_t
> r1, uint64_t r3) return diag288_class->handle_timer(diag288, func,
> timeout); }
>  
> -#define DIAG_308_RC_OK  0x0001
> -#define DIAG_308_RC_NO_CONF 0x0102
> -#define DIAG_308_RC_INVALID 0x0402
> -
> -#define DIAG308_RESET_MOD_CLR   0
> -#define DIAG308_RESET_LOAD_NORM 1
> -#define DIAG308_LOAD_CLEAR  3
> -#define DIAG308_LOAD_NORMAL_DUMP4
> -#define DIAG308_SET 5
> -#define DIAG308_STORE   6
> -
>  static int diag308_parm_check(CPUS390XState *env, uint64_t r1,
> uint64_t addr, uintptr_t ra, bool write)
>  {

Re: [PATCH v9 09/15] s390x: protvirt: Set guest IPL PSW

2020-03-13 Thread Claudio Imbrenda

On Fri, 13 Mar 2020 15:21:07 +0100
Janosch Frank  wrote:

> On 3/13/20 1:57 PM, Claudio Imbrenda wrote:
> > On Wed, 11 Mar 2020 09:21:45 -0400
> > Janosch Frank  wrote:
> >   
> >> Handling of CPU reset and setting of the IPL psw from guest
> >> storage at offset 0 is done by an Ultravisor call. Let's only fetch
> >> it if necessary.
> >>
> >> Signed-off-by: Janosch Frank 
> >> Reviewed-by: Thomas Huth 
> >> Reviewed-by: David Hildenbrand 
> >> ---
> >>  target/s390x/cpu.c | 22 +-
> >>  1 file changed, 13 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
> >> index 84029f14814b4980..a48d39f139cdc1c4 100644
> >> --- a/target/s390x/cpu.c
> >> +++ b/target/s390x/cpu.c
> >> @@ -78,16 +78,20 @@ static bool s390_cpu_has_work(CPUState *cs)
> >>  static void s390_cpu_load_normal(CPUState *s)
> >>  {
> >>  S390CPU *cpu = S390_CPU(s);
> >> -uint64_t spsw = ldq_phys(s->as, 0);
> >> -
> >> -cpu->env.psw.mask = spsw & PSW_MASK_SHORT_CTRL;
> >> -/*
> >> - * Invert short psw indication, so SIE will report a
> >> specification
> >> - * exception if it was not set.
> >> - */
> >> -cpu->env.psw.mask ^= PSW_MASK_SHORTPSW;
> >> -cpu->env.psw.addr = spsw & PSW_MASK_SHORT_ADDR;
> >> +uint64_t spsw;
> >>  
> >> +if (!s390_is_pv()) {
> >> +spsw = ldq_phys(s->as, 0);
> >> +cpu->env.psw.mask = spsw & PSW_MASK_SHORT_CTRL;
> >> +/*
> >> + * Invert short psw indication, so SIE will report a
> >> specification
> >> + * exception if it was not set.
> >> + */
> >> +cpu->env.psw.mask ^= PSW_MASK_SHORTPSW;
> >> +cpu->env.psw.addr = spsw & PSW_MASK_SHORT_ADDR;
> >> +} else {
> >> +s390_cpu_set_state(S390_CPU_STATE_LOAD, cpu);
> >> +}
> >>  s390_cpu_set_state(S390_CPU_STATE_OPERATING, cpu);
> >>  }
> >>  #endif  
> > 
> > I don't understand why you set the state to S390_CPU_STATE_LOAD and
> > then immediately afterwards to S390_CPU_STATE_OPERATING, especially
> > considering that both do the same
> >   
> 
> Have a look at the specs, we need to set the load state before
> setting the cpu to operating.
> 
> I can add a comment to make it clearer if you want.

Once you have added the comment, you can also add:

Reviewed-by: Claudio Imbrenda

Re: [PATCH v9 09/15] s390x: protvirt: Set guest IPL PSW

2020-03-13 Thread Claudio Imbrenda

On Fri, 13 Mar 2020 15:21:07 +0100
Janosch Frank  wrote:

> On 3/13/20 1:57 PM, Claudio Imbrenda wrote:
> > On Wed, 11 Mar 2020 09:21:45 -0400
> > Janosch Frank  wrote:
> >   
> >> Handling of CPU reset and setting of the IPL psw from guest
> >> storage at offset 0 is done by an Ultravisor call. Let's only fetch
> >> it if necessary.
> >>
> >> Signed-off-by: Janosch Frank 
> >> Reviewed-by: Thomas Huth 
> >> Reviewed-by: David Hildenbrand 
> >> ---
> >>  target/s390x/cpu.c | 22 +-
> >>  1 file changed, 13 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
> >> index 84029f14814b4980..a48d39f139cdc1c4 100644
> >> --- a/target/s390x/cpu.c
> >> +++ b/target/s390x/cpu.c
> >> @@ -78,16 +78,20 @@ static bool s390_cpu_has_work(CPUState *cs)
> >>  static void s390_cpu_load_normal(CPUState *s)
> >>  {
> >>  S390CPU *cpu = S390_CPU(s);
> >> -uint64_t spsw = ldq_phys(s->as, 0);
> >> -
> >> -cpu->env.psw.mask = spsw & PSW_MASK_SHORT_CTRL;
> >> -/*
> >> - * Invert short psw indication, so SIE will report a
> >> specification
> >> - * exception if it was not set.
> >> - */
> >> -cpu->env.psw.mask ^= PSW_MASK_SHORTPSW;
> >> -cpu->env.psw.addr = spsw & PSW_MASK_SHORT_ADDR;
> >> +uint64_t spsw;
> >>  
> >> +if (!s390_is_pv()) {
> >> +spsw = ldq_phys(s->as, 0);
> >> +cpu->env.psw.mask = spsw & PSW_MASK_SHORT_CTRL;
> >> +/*
> >> + * Invert short psw indication, so SIE will report a
> >> specification
> >> + * exception if it was not set.
> >> + */
> >> +cpu->env.psw.mask ^= PSW_MASK_SHORTPSW;
> >> +cpu->env.psw.addr = spsw & PSW_MASK_SHORT_ADDR;
> >> +} else {
> >> +s390_cpu_set_state(S390_CPU_STATE_LOAD, cpu);
> >> +}
> >>  s390_cpu_set_state(S390_CPU_STATE_OPERATING, cpu);
> >>  }
> >>  #endif  
> > 
> > I don't understand why you set the state to S390_CPU_STATE_LOAD and
> > then immediately afterwards to S390_CPU_STATE_OPERATING, especially
> > considering that both do the same
> >   
> 
> Have a look at the specs, we need to set the load state before
> setting the cpu to operating.
> 
> I can add a comment to make it clearer if you want.

yes please.

Re: [PATCH v9 15/15] s390x: Add unpack facility feature to GA1

2020-03-13 Thread Claudio Imbrenda

On Wed, 11 Mar 2020 09:21:51 -0400
Janosch Frank  wrote:

> From: Christian Borntraeger 
> 
> The unpack facility is an indication that diagnose 308 subcodes 8-10
> are available to the guest. That means, that the guest can put itself
> into protected mode.
> 
> Once it is in protected mode, the hardware stops any attempt of VM
> introspection by the hypervisor.
> 
> Some features are currently not supported in protected mode:
>  * Passthrough devices
>  * Migration
>  * Huge page backings
> 
> Signed-off-by: Christian Borntraeger 
> Reviewed-by: David Hildenbrand 
> ---
>  target/s390x/gen-features.c | 1 +
>  target/s390x/kvm.c  | 5 +
>  2 files changed, 6 insertions(+)
> 
> diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
> index 6278845b12b8dee8..8ddeebc54419a3e2 100644
> --- a/target/s390x/gen-features.c
> +++ b/target/s390x/gen-features.c
> @@ -562,6 +562,7 @@ static uint16_t full_GEN15_GA1[] = {
>  S390_FEAT_GROUP_MSA_EXT_9,
>  S390_FEAT_GROUP_MSA_EXT_9_PCKMO,
>  S390_FEAT_ETOKEN,
> +S390_FEAT_UNPACK,
>  };
>  
>  /* Default features (in order of release)
> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
> index ff6027036ec2f14a..e11e895a3d9038bb 100644
> --- a/target/s390x/kvm.c
> +++ b/target/s390x/kvm.c
> @@ -2403,6 +2403,11 @@ void kvm_s390_get_host_cpu_model(S390CPUModel
> *model, Error **errp) clear_bit(S390_FEAT_BPB, model->features);
>  }
>  
> +/* we do have the IPL enhancements */
> +if (cap_protected) {
> +set_bit(S390_FEAT_UNPACK, model->features);
> +}
> +
>  /* We emulate a zPCI bus and AEN, therefore we don't need HW
> support */ set_bit(S390_FEAT_ZPCI, model->features);
>  set_bit(S390_FEAT_ADAPTER_EVENT_NOTIFICATION, model->features);

Reviewed-by: Claudio Imbrenda

Re: [PATCH v9 04/15] s390x: protvirt: Inhibit balloon when switching to protected mode

2020-03-13 Thread Claudio Imbrenda

On Wed, 11 Mar 2020 09:21:40 -0400
Janosch Frank  wrote:

> Ballooning in protected VMs can only be done when the guest shares the
> pages it gives to the host. If pages are not shared, the integrity
> checks will fail once those pages have been altered and are given back
> to the guest.
> 
> As we currently do not yet have a solution for this we will continue
> like this:
> 
> 1. We block ballooning now in QEMU (with this patch)
> 
> 2. Later we will provide a change to virtio that removes the blocker
> and adds VIRTIO_F_IOMMU_PLATFORM automatically by QEMU when doing the
> protvirt switch. This is ok as the guest balloon driver will refuse to
> work with the IOMMU change
> 
> 3. Later we can fix the guest balloon driver to accept the IOMMU
> feature bit and correctly exercise sharing and unsharing of balloon
> pages
> 
> Signed-off-by: Janosch Frank 
> Reviewed-by: David Hildenbrand 
> Reviewed-by: Christian Borntraeger 
> ---
>  hw/s390x/s390-virtio-ccw.c | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
> index deb31e060052d279..066e01f303c35671 100644
> --- a/hw/s390x/s390-virtio-ccw.c
> +++ b/hw/s390x/s390-virtio-ccw.c
> @@ -41,6 +41,7 @@
>  #include "hw/qdev-properties.h"
>  #include "hw/s390x/tod.h"
>  #include "sysemu/sysemu.h"
> +#include "sysemu/balloon.h"
>  #include "hw/s390x/pv.h"
>  #include "migration/blocker.h"
>  
> @@ -326,6 +327,7 @@ static void
> s390_machine_unprotect(S390CcwMachineState *ms) ms->pv = false;
>  migrate_del_blocker(pv_mig_blocker);
>  error_free_or_abort(&pv_mig_blocker);
> +qemu_balloon_inhibit(false);
>  }
>  
>  static int s390_machine_protect(S390CcwMachineState *ms)
> @@ -333,10 +335,12 @@ static int
> s390_machine_protect(S390CcwMachineState *ms) Error *local_err = NULL;
>  int rc;
>  
> +qemu_balloon_inhibit(true);
>  error_setg(&pv_mig_blocker,
> "protected VMs are currently not migrateable.");
>  rc = migrate_add_blocker(pv_mig_blocker, &local_err);
>  if (local_err) {
> +qemu_balloon_inhibit(false);
>  error_report_err(local_err);
>  error_free_or_abort(&pv_mig_blocker);
>  return rc;
> @@ -345,6 +349,7 @@ static int
> s390_machine_protect(S390CcwMachineState *ms) /* Create SE VM */
>  rc = s390_pv_vm_enable();
>  if (rc) {
> +qemu_balloon_inhibit(false);
>  error_report_err(local_err);
>  migrate_del_blocker(pv_mig_blocker);
>  error_free_or_abort(&pv_mig_blocker);

looks straightforward

Reviewed-by: Claudio Imbrenda

Re: [PATCH v9 09/15] s390x: protvirt: Set guest IPL PSW

2020-03-13 Thread Claudio Imbrenda

On Wed, 11 Mar 2020 09:21:45 -0400
Janosch Frank  wrote:

> Handling of CPU reset and setting of the IPL psw from guest storage at
> offset 0 is done by an Ultravisor call. Let's only fetch it if
> necessary.
> 
> Signed-off-by: Janosch Frank 
> Reviewed-by: Thomas Huth 
> Reviewed-by: David Hildenbrand 
> ---
>  target/s390x/cpu.c | 22 +-
>  1 file changed, 13 insertions(+), 9 deletions(-)
> 
> diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
> index 84029f14814b4980..a48d39f139cdc1c4 100644
> --- a/target/s390x/cpu.c
> +++ b/target/s390x/cpu.c
> @@ -78,16 +78,20 @@ static bool s390_cpu_has_work(CPUState *cs)
>  static void s390_cpu_load_normal(CPUState *s)
>  {
>  S390CPU *cpu = S390_CPU(s);
> -uint64_t spsw = ldq_phys(s->as, 0);
> -
> -cpu->env.psw.mask = spsw & PSW_MASK_SHORT_CTRL;
> -/*
> - * Invert short psw indication, so SIE will report a
> specification
> - * exception if it was not set.
> - */
> -cpu->env.psw.mask ^= PSW_MASK_SHORTPSW;
> -cpu->env.psw.addr = spsw & PSW_MASK_SHORT_ADDR;
> +uint64_t spsw;
>  
> +if (!s390_is_pv()) {
> +spsw = ldq_phys(s->as, 0);
> +cpu->env.psw.mask = spsw & PSW_MASK_SHORT_CTRL;
> +/*
> + * Invert short psw indication, so SIE will report a
> specification
> + * exception if it was not set.
> + */
> +cpu->env.psw.mask ^= PSW_MASK_SHORTPSW;
> +cpu->env.psw.addr = spsw & PSW_MASK_SHORT_ADDR;
> +} else {
> +s390_cpu_set_state(S390_CPU_STATE_LOAD, cpu);
> +}
>  s390_cpu_set_state(S390_CPU_STATE_OPERATING, cpu);
>  }
>  #endif

I don't understand why you set the state to S390_CPU_STATE_LOAD and
then immediately afterwards to S390_CPU_STATE_OPERATING, especially
considering that both do the same thing.

Re: [PATCH v9 03/15] s390x: protvirt: Add migration blocker

2020-03-13 Thread Claudio Imbrenda

On Wed, 11 Mar 2020 09:21:39 -0400
Janosch Frank  wrote:

> Migration is not yet supported.
> 
> Signed-off-by: Janosch Frank 
> Reviewed-by: David Hildenbrand 
> ---
>  hw/s390x/s390-virtio-ccw.c | 18 ++
>  1 file changed, 18 insertions(+)
> 
> diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
> index 9569b777a0e1abd6..deb31e060052d279 100644
> --- a/hw/s390x/s390-virtio-ccw.c
> +++ b/hw/s390x/s390-virtio-ccw.c
> @@ -42,6 +42,9 @@
>  #include "hw/s390x/tod.h"
>  #include "sysemu/sysemu.h"
>  #include "hw/s390x/pv.h"
> +#include "migration/blocker.h"
> +
> +static Error *pv_mig_blocker;
>  
>  S390CPU *s390_cpu_addr2state(uint16_t cpu_addr)
>  {
> @@ -321,15 +324,30 @@ static void
> s390_machine_unprotect(S390CcwMachineState *ms) {
>  s390_pv_vm_disable();
>  ms->pv = false;
> +migrate_del_blocker(pv_mig_blocker);
> +error_free_or_abort(&pv_mig_blocker);
>  }
>  
>  static int s390_machine_protect(S390CcwMachineState *ms)
>  {
> +Error *local_err = NULL;
>  int rc;
>  
> +error_setg(&pv_mig_blocker,
> +   "protected VMs are currently not migrateable.");
> +rc = migrate_add_blocker(pv_mig_blocker, &local_err);
> +if (local_err) {
> +error_report_err(local_err);
> +error_free_or_abort(&pv_mig_blocker);
> +return rc;
> +}
> +
>  /* Create SE VM */
>  rc = s390_pv_vm_enable();
>  if (rc) {
> +error_report_err(local_err);
> +migrate_del_blocker(pv_mig_blocker);
> +error_free_or_abort(&pv_mig_blocker);
>  return rc;
>  }
>  

looks rather straightforward

Reviewed-by: Claudio Imbrenda

Re: [PATCH v9 05/15] s390x: protvirt: KVM intercept changes

2020-03-13 Thread Claudio Imbrenda

On Wed, 11 Mar 2020 09:21:41 -0400
Janosch Frank  wrote:

> Protected VMs no longer intercept with code 4 for an instruction
> interception. Instead they have codes 104 and 108 for protected
> instruction interception and protected instruction notification
> respectively.
> 
> The 104 mirrors the 4 interception.
> 
> The 108 is a notification interception to let KVM and QEMU know that
> something changed and we need to update tracking information or
> perform specific tasks. It's currently taken for the following
> instructions:
> 
> * spx (To inform about the changed prefix location)
> * sclp (On incorrect SCCB values, so we can inject an IRQ)
> * sigp (All but "stop and store status")
> * diag308 (Subcodes 0/1)
> 
> Of these exits only sclp errors, state changing sigps and diag308 will
> reach QEMU. QEMU will do its parts of the job, while the ultravisor
> has done the instruction part of the job.
> 
> Signed-off-by: Janosch Frank 
> Reviewed-by: David Hildenbrand 
> Reviewed-by: Christian Borntraeger 
> ---
>  target/s390x/kvm.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
> index 1d6fd6a27b48e35f..eec0b92479465b9c 100644
> --- a/target/s390x/kvm.c
> +++ b/target/s390x/kvm.c
> @@ -115,6 +115,8 @@
>  #define ICPT_CPU_STOP   0x28
>  #define ICPT_OPEREXC0x2c
>  #define ICPT_IO 0x40
> +#define ICPT_PV_INSTR   0x68
> +#define ICPT_PV_INSTR_NOTIFICATION  0x6c
>  
>  #define NR_LOCAL_IRQS 32
>  /*
> @@ -1693,6 +1695,8 @@ static int handle_intercept(S390CPU *cpu)
>  (long)cs->kvm_run->psw_addr);
>  switch (icpt_code) {
>  case ICPT_INSTRUCTION:
> +case ICPT_PV_INSTR:
> +case ICPT_PV_INSTR_NOTIFICATION:
>  r = handle_instruction(cpu, run);
>  break;
>  case ICPT_PROGRAM:

very straightforward

Reviewed-by: Claudio Imbrenda

Re: [PATCH v9 12/15] s390x: protvirt: Move IO control structures over SIDA

2020-03-13 Thread Claudio Imbrenda

On Wed, 11 Mar 2020 09:21:48 -0400
Janosch Frank  wrote:

> For protected guests, we need to put the IO emulation results into the
> SIDA, so SIE will write them into the guest at the next entry.
> 
> Signed-off-by: Janosch Frank 
> ---
>  target/s390x/ioinst.c | 87
> ++- 1 file changed, 61
> insertions(+), 26 deletions(-)
> 
> diff --git a/target/s390x/ioinst.c b/target/s390x/ioinst.c
> index 481d789de9e09a04..61095bdc9ffef436 100644
> --- a/target/s390x/ioinst.c
> +++ b/target/s390x/ioinst.c
> @@ -136,9 +136,13 @@ void ioinst_handle_msch(S390CPU *cpu, uint64_t
> reg1, uint32_t ipb, uintptr_t ra) s390_program_interrupt(env,
> PGM_SPECIFICATION, ra); return;
>  }
> -if (s390_cpu_virt_mem_read(cpu, addr, ar, ,
> sizeof(schib))) {
> -s390_cpu_virt_mem_handle_exc(cpu, ra);
> -return;
> +if (s390_is_pv()) {
> +s390_cpu_pv_mem_read(cpu, addr, , sizeof(schib));
> +} else {
> +if (s390_cpu_virt_mem_read(cpu, addr, ar, ,
> sizeof(schib))) {
> +s390_cpu_virt_mem_handle_exc(cpu, ra);
> +return;
> +}
>  }
>  if (ioinst_disassemble_sch_ident(reg1, , , ,
> ) || !ioinst_schib_valid()) {
> @@ -193,9 +197,13 @@ void ioinst_handle_ssch(S390CPU *cpu, uint64_t
> reg1, uint32_t ipb, uintptr_t ra) s390_program_interrupt(env,
> PGM_SPECIFICATION, ra); return;
>  }
> -if (s390_cpu_virt_mem_read(cpu, addr, ar, _orb,
> sizeof(orb))) {
> -s390_cpu_virt_mem_handle_exc(cpu, ra);
> -return;
> +if (s390_is_pv()) {
> +s390_cpu_pv_mem_read(cpu, addr, _orb, sizeof(orb));
> +} else {
> +if (s390_cpu_virt_mem_read(cpu, addr, ar, _orb,
> sizeof(orb))) {
> +s390_cpu_virt_mem_handle_exc(cpu, ra);
> +return;
> +}
>  }
>  copy_orb_from_guest(, _orb);
>  if (ioinst_disassemble_sch_ident(reg1, , , ,
> ) || @@ -229,14 +237,19 @@ void ioinst_handle_stcrw(S390CPU
> *cpu, uint32_t ipb, uintptr_t ra) cc = css_do_stcrw();
>  /* 0 - crw stored, 1 - zeroes stored */
>  
> -if (s390_cpu_virt_mem_write(cpu, addr, ar, , sizeof(crw)) ==
> 0) {
> +if (s390_is_pv()) {
> +s390_cpu_pv_mem_write(cpu, addr, , sizeof(crw));
>  setcc(cpu, cc);
>  } else {
> -if (cc == 0) {
> -/* Write failed: requeue CRW since STCRW is suppressing
> */
> -css_undo_stcrw();
> +if (s390_cpu_virt_mem_write(cpu, addr, ar, ,
> sizeof(crw)) == 0) {
> +setcc(cpu, cc);
> +} else {
> +if (cc == 0) {
> +/* Write failed: requeue CRW since STCRW is
> suppressing */
> +css_undo_stcrw();
> +}
> +s390_cpu_virt_mem_handle_exc(cpu, ra);
>  }
> -s390_cpu_virt_mem_handle_exc(cpu, ra);
>  }
>  }
>  
> @@ -258,6 +271,9 @@ void ioinst_handle_stsch(S390CPU *cpu, uint64_t
> reg1, uint32_t ipb, }
>  
>  if (ioinst_disassemble_sch_ident(reg1, , , ,
> )) {
> +if (s390_is_pv()) {
> +return;
> +}

If the operand is invalid, should we not inject an operand exception?

If, instead, the Ultravisor or KVM makes sure that we never end up here,
then why handle this at all?

>  /*
>   * As operand exceptions have a lower priority than access
> exceptions,
>   * we check whether the memory area is writeable (injecting
> the @@ -290,14 +306,19 @@ void ioinst_handle_stsch(S390CPU *cpu,
> uint64_t reg1, uint32_t ipb, }
>  }
>  if (cc != 3) {
> -if (s390_cpu_virt_mem_write(cpu, addr, ar, ,
> -sizeof(schib)) != 0) {
> -s390_cpu_virt_mem_handle_exc(cpu, ra);
> -return;
> +if (s390_is_pv()) {
> +s390_cpu_pv_mem_write(cpu, addr, , sizeof(schib));
> +} else {
> +if (s390_cpu_virt_mem_write(cpu, addr, ar, ,
> +sizeof(schib)) != 0) {
> +s390_cpu_virt_mem_handle_exc(cpu, ra);
> +return;
> +}
>  }
>  } else {
>  /* Access exceptions have a higher priority than cc3 */
> -if (s390_cpu_virt_mem_check_write(cpu, addr, ar,
> sizeof(schib)) != 0) {
> +if (!s390_is_pv() &&
> +s390_cpu_virt_mem_check_write(cpu, addr, ar,
> sizeof(schib)) != 0) { s390_cpu_virt_mem_handle_exc(cpu, ra);
>  return;
>  }
> @@ -334,15 +355,20 @@ int ioinst_handle_tsch(S390CPU *cpu, uint64_t
> reg1, uint32_t ipb, uintptr_t ra) }
>  /* 0 - status pending, 1 - not status pending, 3 - not
> operational */ if (cc != 3) {
> -if (s390_cpu_virt_mem_write(cpu, addr, ar, , irb_len) !=
> 0) {
> -s390_cpu_virt_mem_handle_exc(cpu, ra);
> -return -EFAULT;
> +if (s390_is_pv()) {
> +s390_cpu_pv_mem_write(cpu, addr, , irb_len);
> +} else {
> +if (s390_cpu_virt_mem_write(cpu, addr, ar, ,
>

Re: [PATCH v9 13/15] s390x: protvirt: Handle SIGP store status correctly

2020-03-13 Thread Claudio Imbrenda

On Wed, 11 Mar 2020 09:21:49 -0400
Janosch Frank  wrote:

> For protected VMs status storing is not done by QEMU anymore.
> 
> Signed-off-by: Janosch Frank 
> Reviewed-by: Thomas Huth 
> Reviewed-by: David Hildenbrand 
> ---
>  target/s390x/helper.c | 6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/target/s390x/helper.c b/target/s390x/helper.c
> index ed726849114f2f35..5022df8812d406c9 100644
> --- a/target/s390x/helper.c
> +++ b/target/s390x/helper.c
> @@ -25,6 +25,7 @@
>  #include "qemu/timer.h"
>  #include "qemu/qemu-print.h"
>  #include "hw/s390x/ioinst.h"
> +#include "hw/s390x/pv.h"
>  #include "sysemu/hw_accel.h"
>  #include "sysemu/runstate.h"
>  #ifndef CONFIG_USER_ONLY
> @@ -246,6 +247,11 @@ int s390_store_status(S390CPU *cpu, hwaddr addr,
> bool store_arch) hwaddr len = sizeof(*sa);
>  int i;
>  
> +/* Storing will occur on next SIE entry for protected VMs */
> +if (s390_is_pv()) {
> +    return 0;
> +}
> +
>  sa = cpu_physical_memory_map(addr, &len, true);
>  if (!sa) {
>  return -EFAULT;

Reviewed-by: Claudio Imbrenda

Re: [PATCH v9 06/15] s390x: Add SIDA memory ops

2020-03-13 Thread Claudio Imbrenda

On Wed, 11 Mar 2020 09:21:42 -0400
Janosch Frank  wrote:

> Protected guests save the instruction control blocks in the SIDA
> instead of QEMU/KVM directly accessing the guest's memory.
> 
> Let's introduce new functions to access the SIDA.
> 
> Also the new memops are available with KVM_CAP_S390_PROTECTED, so
> let's check for that.
> 
> Signed-off-by: Janosch Frank 
> Reviewed-by: David Hildenbrand 
> ---
>  target/s390x/cpu.h|  7 ++-
>  target/s390x/kvm.c| 25 +
>  target/s390x/kvm_s390x.h  |  2 ++
>  target/s390x/mmu_helper.c | 14 ++
>  4 files changed, 47 insertions(+), 1 deletion(-)
> 
> diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
> index 1d17709d6e10b5e0..035427521cec2528 100644
> --- a/target/s390x/cpu.h
> +++ b/target/s390x/cpu.h
> @@ -823,7 +823,12 @@ int s390_cpu_virt_mem_rw(S390CPU *cpu, vaddr
> laddr, uint8_t ar, void *hostbuf, #define
> s390_cpu_virt_mem_check_write(cpu, laddr, ar, len)   \
> s390_cpu_virt_mem_rw(cpu, laddr, ar, NULL, len, true) void
> s390_cpu_virt_mem_handle_exc(S390CPU *cpu, uintptr_t ra); -
> +int s390_cpu_pv_mem_rw(S390CPU *cpu, unsigned int offset, void
> *hostbuf,
> +   int len, bool is_write);
> +#define s390_cpu_pv_mem_read(cpu, offset, dest, len)\
> +s390_cpu_pv_mem_rw(cpu, offset, dest, len, false)
> +#define s390_cpu_pv_mem_write(cpu, offset, dest, len)   \
> +s390_cpu_pv_mem_rw(cpu, offset, dest, len, true)
>  
>  /* sigp.c */
>  int s390_cpu_restart(S390CPU *cpu);
> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
> index eec0b92479465b9c..cdcd538b4f7fb318 100644
> --- a/target/s390x/kvm.c
> +++ b/target/s390x/kvm.c
> @@ -154,6 +154,7 @@ static int cap_ri;
>  static int cap_gs;
>  static int cap_hpage_1m;
>  static int cap_vcpu_resets;
> +static int cap_protected;
>  
>  static int active_cmma;
>  
> @@ -346,6 +347,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>  cap_mem_op = kvm_check_extension(s, KVM_CAP_S390_MEM_OP);
>  cap_s390_irq = kvm_check_extension(s, KVM_CAP_S390_INJECT_IRQ);
>  cap_vcpu_resets = kvm_check_extension(s,
> KVM_CAP_S390_VCPU_RESETS);
> +cap_protected = kvm_check_extension(s, KVM_CAP_S390_PROTECTED);
>  
>  if (!kvm_check_extension(s, KVM_CAP_S390_GMAP)
>  || !kvm_check_extension(s, KVM_CAP_S390_COW)) {
> @@ -846,6 +848,29 @@ int kvm_s390_mem_op(S390CPU *cpu, vaddr addr,
> uint8_t ar, void *hostbuf, return ret;
>  }
>  
> +int kvm_s390_mem_op_pv(S390CPU *cpu, uint64_t offset, void *hostbuf,
> +   int len, bool is_write)
> +{
> +struct kvm_s390_mem_op mem_op = {
> +.sida_offset = offset,
> +.size = len,
> +.op = is_write ? KVM_S390_MEMOP_SIDA_WRITE
> +   : KVM_S390_MEMOP_SIDA_READ,
> +.buf = (uint64_t)hostbuf,
> +};
> +int ret;
> +
> +if (!cap_mem_op || !cap_protected) {
> +return -ENOSYS;
> +}
> +
> +ret = kvm_vcpu_ioctl(CPU(cpu), KVM_S390_MEM_OP, &mem_op);
> +if (ret < 0) {
> +error_report("KVM_S390_MEM_OP failed: %s", strerror(-ret));
> +}
> +return ret;
> +}
> +
>  /*
>   * Legacy layout for s390:
>   * Older S390 KVM requires the topmost vma of the RAM to be
> diff --git a/target/s390x/kvm_s390x.h b/target/s390x/kvm_s390x.h
> index 0b21789796d7c462..9c38f6ccce83e39e 100644
> --- a/target/s390x/kvm_s390x.h
> +++ b/target/s390x/kvm_s390x.h
> @@ -19,6 +19,8 @@ void kvm_s390_vcpu_interrupt(S390CPU *cpu, struct
> kvm_s390_irq *irq); void kvm_s390_access_exception(S390CPU *cpu,
> uint16_t code, uint64_t te_code); int kvm_s390_mem_op(S390CPU *cpu,
> vaddr addr, uint8_t ar, void *hostbuf, int len, bool is_write);
> +int kvm_s390_mem_op_pv(S390CPU *cpu, vaddr addr, void *hostbuf, int
> len,
> +   bool is_write);
>  void kvm_s390_program_interrupt(S390CPU *cpu, uint16_t code);
>  int kvm_s390_set_cpu_state(S390CPU *cpu, uint8_t cpu_state);
>  void kvm_s390_vcpu_interrupt_pre_save(S390CPU *cpu);
> diff --git a/target/s390x/mmu_helper.c b/target/s390x/mmu_helper.c
> index 0be2f300bbe4ac8b..7d9f3059cd502c49 100644
> --- a/target/s390x/mmu_helper.c
> +++ b/target/s390x/mmu_helper.c
> @@ -474,6 +474,20 @@ static int translate_pages(S390CPU *cpu, vaddr
> addr, int nr_pages, return 0;
>  }
>  
> +int s390_cpu_pv_mem_rw(S390CPU *cpu, unsigned int offset, void
> *hostbuf,
> +   int len, bool is_write)
> +{
> +int ret;
> +
> +    if (kvm_enabled()) {
> +ret = kvm_s390_mem_op_pv(cpu, offset, hostbuf, len,
> is_write);
> +} else {
> +/* Protected Virtualization is a KVM/Hardware only feature */
> +g_assert_not_reached();
> +}
> +return ret;
> +}
> +
>  /**
>   * s390_cpu_virt_mem_rw:
>   * @laddr: the logical start address

Reviewed-by: Claudio Imbrenda

Re: [PATCH v9 14/15] docs: Add protvirt docs

2020-03-13 Thread Claudio Imbrenda

On Wed, 11 Mar 2020 09:21:50 -0400
Janosch Frank  wrote:

> Let's add some documentation for the Protected VM functionality.
> 
> Signed-off-by: Janosch Frank 
> ---
>  docs/system/index.rst|  1 +
>  docs/system/protvirt.rst | 56
>  2 files changed, 57
> insertions(+) create mode 100644 docs/system/protvirt.rst
> 
> diff --git a/docs/system/index.rst b/docs/system/index.rst
> index 6e5f20fa1333ce23..74afbd7cc3fc0296 100644
> --- a/docs/system/index.rst
> +++ b/docs/system/index.rst
> @@ -34,3 +34,4 @@ Contents:
> deprecated
> build-platforms
> license
> +   protvirt
> diff --git a/docs/system/protvirt.rst b/docs/system/protvirt.rst
> new file mode 100644
> index ..6c8cf0f7910eae86
> --- /dev/null
> +++ b/docs/system/protvirt.rst
> @@ -0,0 +1,56 @@
> +Protected Virtualization on s390x
> +=
> +
> +The memory and most of the registers of Protected Virtual Machines
> +(PVMs) are encrypted or inaccessible to the hypervisor, effectively
> +prohibiting VM introspection when the VM is running. At rest, PVMs
> are +encrypted and can only be decrypted by the firmware, represented
> by an +entity called Ultravisor, of specific IBM Z machines.
> +
> +
> +Prerequisites
> +-
> +
> +To run PVMs a machine with the Protected Virtualization feature
> +which is indicated by the Ultravisor Call facility (stfle bit
> +158) is required. The Ultravisor needs to be initialized at boot by
> +setting `prot_virt=1` on the kernel command line.

I'd add "of the host" just to make it extra clear

> +
> +If those requirements are met, the capability
> `KVM_CAP_S390_PROTECTED` +will indicate that KVM can support PVMs on
> that LPAR. +
> +
> +QEMU Settings
> +-
> +
> +To indicate to the VM that it can transition into protected mode, the
> +`Unpack facility` (stfle bit 161 represented by the feature
> +`S390_FEAT_UNPACK`) needs to be part of the cpu model of the VM.
> +
> +All I/O devices need to use the IOMMU.
> +Passthrough (vfio) devices are currently not supported.
> +
> +Host huge page backings are not supported. However guests can use
> huge +pages as indicated by its facilities.
> +
> +
> +Boot Process
> +
> +
> +A secure guest image can either be loaded from disk or supplied on
> the +QEMU command line. Booting from disk is done by the unmodified
> +s390-ccw BIOS. I.e., the bootmap is interpreted, multiple components
> +are read into memory and control is transferred to one of the
> +components (zipl stage3). Stage3 does some fixups and then transfers
> +control to some program residing in guest memory, which is normally
> +the OS kernel. The secure image has another component prepended
> +(stage3a) that uses the new diag308 subcodes 8 and 10 to trigger the
> +transition into secure mode.
> +
> +Booting from the image supplied via the QEMU command line requires
> +that the file passed via -kernel has the same memory layout as would
> +result from the disk boot. This memory layout includes the encrypted
> +components (kernel, initrd, cmdline), the stage3a loader and
> +metadata. In case this boot method is used, the command line
> +options -initrd and -cmdline are ineffective. The preparation of a
> PVM +image is done by genprotimg of the s390-tools package.

Reviewed-by: Claudio Imbrenda

Re: [PATCH v9 02/15] s390x: protvirt: Support unpack facility

2020-03-13 Thread Claudio Imbrenda

return ccw->pv;
> +}
> +
> +/* we have to bail out for the "none" machine */
> +obj = object_dynamic_cast(qdev_get_machine(),
> +  TYPE_S390_CCW_MACHINE);
> +if (!obj) {
> +return false;
> +}
> +ccw = S390_CCW_MACHINE(obj);
> +return ccw->pv;
> +}
> +
> +int s390_pv_vm_enable(void);
> +void s390_pv_vm_disable(void);
> +int s390_pv_set_sec_parms(uint64_t origin, uint64_t length);
> +int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak);
> +void s390_pv_perf_clear_reset(void);
> +int s390_pv_verify(void);
> +void s390_pv_unshare(void);
> +#else
> +static inline bool s390_is_pv(void) { return false; }
> +static inline int s390_pv_vm_enable(void) { return 0; }
> +static inline void s390_pv_vm_disable(void) {}
> +static inline int s390_pv_set_sec_parms(uint64_t origin, uint64_t
> length) { return 0; } +static inline int s390_pv_unpack(uint64_t
> addr, uint64_t size, uint64_t tweak) { return 0; } +static inline
> void s390_pv_perf_clear_reset(void) {} +static inline int
> s390_pv_verify(void) { return 0; } +static inline void
> s390_pv_unshare(void) {} +#endif
> +
> +
> +
> +#endif /* HW_S390_PV_H */
> diff --git a/include/hw/s390x/s390-virtio-ccw.h
> b/include/hw/s390x/s390-virtio-ccw.h index
> 8aa27199c9123bab..cd1dccc6e3ba8645 100644 ---
> a/include/hw/s390x/s390-virtio-ccw.h +++
> b/include/hw/s390x/s390-virtio-ccw.h @@ -28,6 +28,7 @@ typedef struct
> S390CcwMachineState { /*< public >*/
>  bool aes_key_wrap;
>  bool dea_key_wrap;
> +bool pv;
>  uint8_t loadparm[8];
>  } S390CcwMachineState;
>  
> diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
> index 3dd396e870357944..84029f14814b4980 100644
> --- a/target/s390x/cpu.c
> +++ b/target/s390x/cpu.c
> @@ -37,6 +37,8 @@
>  #include "sysemu/hw_accel.h"
>  #include "hw/qdev-properties.h"
>  #ifndef CONFIG_USER_ONLY
> +#include "hw/s390x/s390-virtio-ccw.h"
> +#include "hw/s390x/pv.h"
>  #include "hw/boards.h"
>  #include "sysemu/arch_init.h"
>  #include "sysemu/sysemu.h"
> diff --git a/target/s390x/cpu_features_def.inc.h
> b/target/s390x/cpu_features_def.inc.h index
> 31dff0d84e972451..60db28351d059091 100644 ---
> a/target/s390x/cpu_features_def.inc.h +++
> b/target/s390x/cpu_features_def.inc.h @@ -107,6 +107,7 @@
> DEF_FEAT(DEFLATE_BASE, "deflate-base", STFL, 151, "Deflate-conversion
> facility ( DEF_FEAT(VECTOR_PACKED_DECIMAL_ENH, "vxpdeh", STFL, 152,
> "Vector-Packed-Decimal-Enhancement Facility") DEF_FEAT(MSA_EXT_9,
> "msa9-base", STFL, 155, "Message-security-assist-extension-9 facility
> (excluding subfunctions)") DEF_FEAT(ETOKEN, "etoken", STFL, 156,
> "Etoken facility") +DEF_FEAT(UNPACK, "unpack", STFL, 161, "Unpack
> facility") /* Features exposed via SCLP SCCB Byte 80 - 98  (bit
> numbers relative to byte-80) */ DEF_FEAT(SIE_GSLS, "gsls",
> SCLP_CONF_CHAR, 40, "SIE: Guest-storage-limit-suppression facility")
> diff --git a/target/s390x/diag.c b/target/s390x/diag.c index
> 54e5670b3fd6d960..b245e557037ded06 100644 --- a/target/s390x/diag.c
> +++ b/target/s390x/diag.c @@ -20,6 +20,7 @@
>  #include "sysemu/cpus.h"
>  #include "hw/s390x/ipl.h"
>  #include "hw/s390x/s390-virtio-ccw.h"
> +#include "hw/s390x/pv.h"
>  
>  int handle_diag_288(CPUS390XState *env, uint64_t r1, uint64_t r3)
>  {
> @@ -52,6 +53,7 @@ int handle_diag_288(CPUS390XState *env, uint64_t
> r1, uint64_t r3) #define DIAG_308_RC_OK  0x0001
>  #define DIAG_308_RC_NO_CONF 0x0102
>  #define DIAG_308_RC_INVALID 0x0402
> +#define DIAG_308_RC_NO_PV_CONF  0x0902
>  
>  #define DIAG308_RESET_MOD_CLR   0
>  #define DIAG308_RESET_LOAD_NORM 1
> @@ -59,10 +61,17 @@ int handle_diag_288(CPUS390XState *env, uint64_t
> r1, uint64_t r3) #define DIAG308_LOAD_NORMAL_DUMP4
>  #define DIAG308_SET 5
>  #define DIAG308_STORE   6
> +#define DIAG308_PV_SET  8
> +#define DIAG308_PV_STORE 9
> +#define DIAG308_PV_START 10
>  
>  static int diag308_parm_check(CPUS390XState *env, uint64_t r1,
> uint64_t addr, uintptr_t ra, bool write)
>  {
> +/* Handled by the Ultravisor */
> +if (s390_is_pv()) {
> +return 0;
> +}
>  if ((r1 & 1) || (addr & ~TARGET_PAGE_MASK)) {
>  s390_program_interrupt(env, PGM_SPECIFICATION, ra);
>  return -1;
> @@ -93,6 +102,11 @@ void handle_diag_308(CPUS390XState *env, uint64_t
> r1, uint64_t r3, uintptr_t ra) return;
>  }
>  
> +if (subcode >= DIAG308_PV_SET &&
> !s390_has_feat(S390_FEAT_UNPACK)) {
> +s390_program_interrupt(env, PGM_SPECIFICATION, ra);
> +return;
> +}
> +
>  switch (subcode) {
>  case DIAG308_RESET_MOD_CLR:
>  s390_ipl_reset_request(cs, S390_RESET_MODIFIED_CLEAR);
> @@ -105,6 +119,7 @@ void handle_diag_308(CPUS390XState *env, uint64_t
> r1, uint64_t r3, uintptr_t ra) s390_ipl_reset_request(cs,
> S390_RESET_REIPL); break;
>  case DIAG308_SET:
> +case DIAG308_PV_SET:
>  if (diag308_parm_check(env, r1, addr, ra, false)) {
>  return;
>  }
> @@ -128,10 +143,15 @@ out:
>  g_free(iplb);
>  return;
>  case DIAG308_STORE:
> +case DIAG308_PV_STORE:
>  if (diag308_parm_check(env, r1, addr, ra, true)) {
>  return;
>  }
> -iplb = s390_ipl_get_iplb();
> +if (subcode == DIAG308_PV_STORE) {
> +iplb = s390_ipl_get_iplb_pv();
> +} else {
> +iplb = s390_ipl_get_iplb();
> +}
>  if (iplb) {
>  cpu_physical_memory_write(addr, iplb,
> be32_to_cpu(iplb->len)); env->regs[r1 + 1] = DIAG_308_RC_OK;
> @@ -139,6 +159,15 @@ out:
>  env->regs[r1 + 1] = DIAG_308_RC_NO_CONF;
>  }
>  return;
> +case DIAG308_PV_START:
> +iplb = s390_ipl_get_iplb_pv();
> +if (!iplb) {
> +env->regs[r1 + 1] = DIAG_308_RC_NO_PV_CONF;
> +return;
> +}
> +
> +s390_ipl_reset_request(cs, S390_RESET_PV);
> +break;
>  default:
>  s390_program_interrupt(env, PGM_SPECIFICATION, ra);
>  break;

with the two typos fixed and the fixup:

Reviewed-by: Claudio Imbrenda

Re: [PATCH v9 10/15] s390x: protvirt: Move diag 308 data over SIDA

2020-03-13 Thread Claudio Imbrenda

On Wed, 11 Mar 2020 09:21:46 -0400
Janosch Frank  wrote:

> For protected guests the IPIB is written/read to/from the SIDA, so we
> need those accesses to go through s390_cpu_pv_mem_read/write().
> 
> Signed-off-by: Janosch Frank 
> Reviewed-by: David Hildenbrand 
> ---
>  target/s390x/diag.c | 27 +--
>  1 file changed, 21 insertions(+), 6 deletions(-)
> 
> diff --git a/target/s390x/diag.c b/target/s390x/diag.c
> index b245e557037ded06..a733485caf162111 100644
> --- a/target/s390x/diag.c
> +++ b/target/s390x/diag.c
> @@ -88,6 +88,7 @@ static int diag308_parm_check(CPUS390XState *env,
> uint64_t r1, uint64_t addr, void handle_diag_308(CPUS390XState *env,
> uint64_t r1, uint64_t r3, uintptr_t ra) {
>  CPUState *cs = env_cpu(env);
> +S390CPU *cpu = S390_CPU(cs);
>  uint64_t addr =  env->regs[r1];
>  uint64_t subcode = env->regs[r3];
>  IplParameterBlock *iplb;
> @@ -124,13 +125,22 @@ void handle_diag_308(CPUS390XState *env,
> uint64_t r1, uint64_t r3, uintptr_t ra) return;
>  }
>  iplb = g_new0(IplParameterBlock, 1);
> -cpu_physical_memory_read(addr, iplb, sizeof(iplb->len));
> +if (!s390_is_pv()) {
> +cpu_physical_memory_read(addr, iplb, sizeof(iplb->len));
> +} else {
> +s390_cpu_pv_mem_read(cpu, 0, iplb, sizeof(iplb->len));
> +}
> +
>  if (!iplb_valid_len(iplb)) {
>  env->regs[r1 + 1] = DIAG_308_RC_INVALID;
>  goto out;
>  }
>  
> -cpu_physical_memory_read(addr, iplb, be32_to_cpu(iplb->len));
> +if (!s390_is_pv()) {
> +cpu_physical_memory_read(addr, iplb,
> be32_to_cpu(iplb->len));
> +} else {
> +s390_cpu_pv_mem_read(cpu, 0, iplb,
> be32_to_cpu(iplb->len));
> +}
>  
>  if (!iplb_valid(iplb)) {
>  env->regs[r1 + 1] = DIAG_308_RC_INVALID;
> @@ -152,12 +162,17 @@ out:
>  } else {
>  iplb = s390_ipl_get_iplb();
>  }
> -if (iplb) {
> -cpu_physical_memory_write(addr, iplb,
> be32_to_cpu(iplb->len));
> -env->regs[r1 + 1] = DIAG_308_RC_OK;
> -} else {
> +if (!iplb) {
>  env->regs[r1 + 1] = DIAG_308_RC_NO_CONF;
> +return;
>  }
> +
> +if (!s390_is_pv()) {
> +cpu_physical_memory_write(addr, iplb,
> be32_to_cpu(iplb->len));
> +} else {
> +    s390_cpu_pv_mem_write(cpu, 0, iplb,
> be32_to_cpu(iplb->len));
> +}
> +env->regs[r1 + 1] = DIAG_308_RC_OK;
>  return;
>  case DIAG308_PV_START:
>  iplb = s390_ipl_get_iplb_pv();

Reviewed-by: Claudio Imbrenda

Re: [PATCH v9 08/15] s390x: protvirt: SCLP interpretation

2020-03-13 Thread Claudio Imbrenda

On Wed, 11 Mar 2020 09:21:44 -0400
Janosch Frank  wrote:

> SCLP for a protected guest is done over the SIDAD, so we need to use
> the s390_cpu_pv_mem_* functions to access the SIDAD instead of guest
> memory when reading/writing SCCBs.
> 
> To not confuse the sclp emulation, we set 0x4000 as the SCCB address,
> since the function that injects the sclp external interrupt would
> reject a zero sccb address.
> 
> Signed-off-by: Janosch Frank 
> Reviewed-by: David Hildenbrand 
> ---
>  hw/s390x/sclp.c | 30 ++
>  include/hw/s390x/sclp.h |  2 ++
>  target/s390x/kvm.c  | 24 +++-
>  3 files changed, 51 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c
> index af0bfbc2eca74767..5f3aa30d6283dce5 100644
> --- a/hw/s390x/sclp.c
> +++ b/hw/s390x/sclp.c
> @@ -193,6 +193,36 @@ static void sclp_execute(SCLPDevice *sclp, SCCB
> *sccb, uint32_t code) }
>  }
>  
> +/*
> + * We only need the address to have something valid for the
> + * service_interrupt call.
> + */
> +#define SCLP_PV_DUMMY_ADDR 0x4000
> +int sclp_service_call_protected(CPUS390XState *env, uint64_t sccb,
> +uint32_t code)
> +{
> +SCLPDevice *sclp = get_sclp_device();
> +SCLPDeviceClass *sclp_c = SCLP_GET_CLASS(sclp);
> +SCCB work_sccb;
> +hwaddr sccb_len = sizeof(SCCB);
> +
> +/*
> + * Only a very limited amount of calls is permitted by the
> + * Ultravisor and we support all of them, so we don't check for
> + * them. All other specification exceptions are also interpreted
> + * by the Ultravisor and hence never cause an exit we need to
> + * handle.
> + *
> + * Setting the CC is also done by the Ultravisor.
> + */
> +s390_cpu_pv_mem_read(env_archcpu(env), 0, &work_sccb, sccb_len);
> +sclp_c->execute(sclp, &work_sccb, code);
> +s390_cpu_pv_mem_write(env_archcpu(env), 0, &work_sccb,
> +  be16_to_cpu(work_sccb.h.length));
> +sclp_c->service_interrupt(sclp, SCLP_PV_DUMMY_ADDR);
> +return 0;
> +}
> +
>  int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t
> code) {
>  SCLPDevice *sclp = get_sclp_device();
> diff --git a/include/hw/s390x/sclp.h b/include/hw/s390x/sclp.h
> index c54413b78cf01b27..c0a3faa37d730453 100644
> --- a/include/hw/s390x/sclp.h
> +++ b/include/hw/s390x/sclp.h
> @@ -217,5 +217,7 @@ void s390_sclp_init(void);
>  void sclp_service_interrupt(uint32_t sccb);
>  void raise_irq_cpu_hotplug(void);
>  int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t
> code); +int sclp_service_call_protected(CPUS390XState *env, uint64_t
> sccb,
> +uint32_t code);
>  
>  #endif
> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
> index 8085d5030e7c6454..ff6027036ec2f14a 100644
> --- a/target/s390x/kvm.c
> +++ b/target/s390x/kvm.c
> @@ -1227,12 +1227,26 @@ static void kvm_sclp_service_call(S390CPU
> *cpu, struct kvm_run *run, sccb = env->regs[ipbh0 & 0xf];
>  code = env->regs[(ipbh0 & 0xf0) >> 4];
>  
> -r = sclp_service_call(env, sccb, code);
> -if (r < 0) {
> -kvm_s390_program_interrupt(cpu, -r);
> -return;
> +switch (run->s390_sieic.icptcode) {
> +case ICPT_PV_INSTR_NOTIFICATION:
> +g_assert(s390_is_pv());
> +/* The notification intercepts are currently handled by KVM */
> +error_report("unexpected SCLP PV notification");
> +exit(1);
> +break;
> +case ICPT_PV_INSTR:
> +g_assert(s390_is_pv());
> +sclp_service_call_protected(env, sccb, code);
> +break;
> +case ICPT_INSTRUCTION:
> +g_assert(!s390_is_pv());
> +r = sclp_service_call(env, sccb, code);
> +if (r < 0) {
> +kvm_s390_program_interrupt(cpu, -r);
> +return;
> +}
> +setcc(cpu, r);
>  }
> -setcc(cpu, r);
>  }
>  
>  static int handle_b2(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1)


Reviewed-by: Claudio Imbrenda

1 2 >

1 - 100 of 199 matches

Mail list logo