[Bug 1623998] Re: pulseaudio Invalid argument error

2019-10-04 Thread John Arbuckle
I tried using "-device usb-audio"and the guest was able to detect the
USB audio card, but for some reason no audio would play. I did not see
any messages in the terminal about pulseaudio.

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1623998

Title:
  pulseaudio Invalid argument error

Status in QEMU:
  Incomplete

Bug description:
  When using qemu-system-ppc on Ubuntu Mate 15 with the usb audio card,
  I see these error messages:

  pulseaudio: set_sink_input_volume() failed
  pulseaudio: Reason: Invalid argument
  pulseaudio: set_sink_input_mute() failed
  pulseaudio: Reason: Invalid argument

  No audio plays. When an attempt is made, QEMU seems to freeze for a
  moment.

  I use "-device usb-audio" to add the usb sound card. This issue is
  present in both emulation and KVM mode.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1623998/+subscriptions



[PATCH] target/i386: log guest name and memory error type AO, AR for MCEs

2019-10-04 Thread Mario Smarduch
In a large VPC environment we want to log memory error occurrences
and log them with guest name and type - there are few use cases


- if VM crashes on AR mce inform the user about the reason and
  resolve the case
- if VM hangs notify the user to reboot and resume processing
- if VM continues to run let the user know, he/she maybe able to
  correlate to vm internal outage
- Rawhammer attacks - isolate/determine the attacker possible
  migrating it off the hypervisor
- In general track memory errors on a hyperviosr over time to determine
  trends

Monitoring our fleet we come across quite a few of these and been
able to take action where before there were no clues to the causes.

When memory error occurs we get a log entry in qemu log:

Guest [Droplet-12345678] 2019-08-02T05:00:11.940270Z qemu-system-x86_64:
Guest MCE Memory Error at qemu addr 0x7f3c7622f000 and guest 78e42f000
addr of type BUS_MCEERR_AR injected

with enterprise logging environment we can to take further actions.

Signed-off-by: Mario Smarduch 
---
 target/i386/kvm.c | 27 ++-
 util/qemu-error.c | 24 
 2 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 92069099ab..79ebccc684 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -555,9 +555,9 @@ static void kvm_mce_inject(X86CPU *cpu, hwaddr
paddr, int code)
(MCM_ADDR_PHYS << 6) | 0xc, flags);
 }

-static void hardware_memory_error(void)
+static void hardware_memory_error(void *addr)
 {
-fprintf(stderr, "Hardware memory error!\n");
+error_report("QEMU got Hardware memory error at addr %p", addr);
 exit(1);
 }

@@ -581,15 +581,32 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int
code, void *addr)
 kvm_physical_memory_addr_from_host(c->kvm_state, addr,
)) {
 kvm_hwpoison_page_add(ram_addr);
 kvm_mce_inject(cpu, paddr, code);
+/*
+ * Use different logging severity based on error type.
+ * If mcelog is running qemu va addr will help debug via
mcelog.
+ */
+if (code == BUS_MCEERR_AR) {
+error_report("Guest MCE Memory Error at qemu addr %p and "
+"guest %lx addr of type %s injected", addr, paddr,
+ "BUS_MCEERR_AR");
+} else {
+ warn_report("Guest MCE Memory Error at qemu addr %p and "
+ "guest %lx addr of type %s injected", addr,
+ paddr, "BUS_MCEERR_AO");
+}
+
 return;
 }

-fprintf(stderr, "Hardware memory error for memory used by "
-"QEMU itself instead of guest system!\n");
+if (code == BUS_MCEERR_AO) {
+warn_report("Hardware memory error at addr %p of type %s "
+"for memory used by QEMU itself instead of guest system!",
+addr, "BUS_MCEERR_AO");
+}
 }

 if (code == BUS_MCEERR_AR) {
-hardware_memory_error();
+hardware_memory_error(addr);
 }

 /* Hope we are lucky for AO MCE */
diff --git a/util/qemu-error.c b/util/qemu-error.c
index f373f3b3b0..2ebafd4405 100644
--- a/util/qemu-error.c
+++ b/util/qemu-error.c
@@ -11,6 +11,8 @@
  */

 #include "qemu/osdep.h"
+#include "qemu/option.h"
+#include "qemu/config-file.h"
 #include "monitor/monitor.h"
 #include "qemu/error-report.h"

@@ -35,11 +37,31 @@ int error_printf(const char *fmt, ...)
 return ret;
 }

+static const char *error_get_guestname(void)
+{
+QemuOpts *opts = qemu_opts_find(qemu_find_opts("name"), NULL);
+return qemu_opt_get(opts, "guest");
+}
+
+/*
+ * Print guest name associated with error, to aid debugging errors from
+ * multiple guests in centralized logging environment.
+ */
+static void error_print_guestname(void)
+{
+const char *name;
+name = error_get_guestname();
+if (name != NULL && !cur_mon) {
+error_printf("Guest [%s] ", name);
+}
+}
+
 int error_printf_unless_qmp(const char *fmt, ...)
 {
 va_list ap;
 int ret;

+error_print_guestname();
 va_start(ap, fmt);
 ret = error_vprintf_unless_qmp(fmt, ap);
 va_end(ap);
@@ -274,6 +296,7 @@ void error_report(const char *fmt, ...)
 {
 va_list ap;

+error_print_guestname();
 va_start(ap, fmt);
 vreport(REPORT_TYPE_ERROR, fmt, ap);
 va_end(ap);
@@ -289,6 +312,7 @@ void warn_report(const char *fmt, ...)
 {
 va_list ap;

+error_print_guestname();
 va_start(ap, fmt);
 vreport(REPORT_TYPE_WARNING, fmt, ap);
 va_end(ap);
--
2.17.1



Re: [PATCH v3] migration: Support gtree migration

2019-10-04 Thread Juan Quintela
Eric Auger  wrote:
> Introduce support for GTree migration. A custom save/restore
> is implemented. Each item is made of a key and a data.
>
> If the key is a pointer to an object, 2 VMSDs are passed into
> the GTree VMStateField.
>
> When putting the items, the tree is traversed in sorted order by
> g_tree_foreach.
>
> On the get() path, gtrees must be allocated using the proper
> key compare, key destroy and value destroy. This must be handled
> beforehand, for example in a pre_load method.
>
> Tests are added to test save/dump of structs containing gtrees
> including the virtio-iommu domain/mappings scenario.
>
> Signed-off-by: Eric Auger 

Overal I like the patch.  I think that I found a minor error.


> +static int put_gtree(QEMUFile *f, void *pv, size_t unused_size,
> + const VMStateField *field, QJSON *vmdesc)
> +{
> +bool direct_key = (!field->start);
> +const VMStateDescription *key_vmsd = direct_key ? NULL : >vmsd[0];
> +const VMStateDescription *val_vmsd = direct_key ? >vmsd[0] :
> +  >vmsd[1];
> +struct put_gtree_data capsule = {f, key_vmsd, val_vmsd, 0};

Please, consider change the last line to:

   struct put_gtree_data capsule = {
   .f = f,
   .key_vmsd = key_vmsd,
   .val_vmsd = val_vmsd,
   .ret = 0};

It makes much easier make changes on the future.

Once here, I think that you are missing on the middle a:

  .vmdesc = vmdesc,

No?  At least you use it on put_gtree_elem, and I don't see any place
where you assign it.  When I did the change is when I noticed that it
was missing.

> +GTree **pval = pv;
> +GTree *tree = *pval;
> +int ret;
> +
> +qemu_put_be32(f, g_tree_nnodes(tree));
> +g_tree_foreach(tree, put_gtree_elem, (gpointer));
> +qemu_put_byte(f, false);
> +ret = capsule.ret;
> +if (ret) {
> +error_report("%s : failed to save gtree (%d)", field->name, ret);
> +}
> +return ret;
> +}

As said before, with this changes you have my reviewed-by.

Later, Juan.



Re: ptimer use of bottom-half handlers

2019-10-04 Thread Richard Henderson
On 10/4/19 10:40 AM, Peter Maydell wrote:
> So we should probably cause that to happen in the new scheme
> (conceptually, something like "the trigger callback is
> implicitly considered to be called from within a tx begin/
> commit block, so (a) it doesn't need to do begin/commit itself
> and (b) if it does something resulting in a repeat trigger
> the second call will happen after it returns, not recursively" ?)

That sounds plausible.


r~



Re: Re: target/ppc: bug in optimised vsl/vsr implementation?

2019-10-04 Thread Aleksandar Markovic
>
> In comment #9 in the bug 
> (https://bugs.launchpad.net/qemu/+bug/1841990/comments/9), I note that the 
> issue was produced in running the test suite for the Power Vector Library 
> project (https://github.com/open-power-sdk/pveclib), which makes productive 
> use of dcbenq.
>
> Maybe that could be adopted or adapted to suit?
>
> PC

I will leave adoption/adaptation decision to Alex and others, but just
want to tell you that I took a brief look at pveclib code and
internals, and I got an impression that this is an excellent peace of
software, that required incredible meticulousness in order to
implement, all kudos.

Aleksandar



Re: [PATCH v2] target/arm/arch_dump: Add SVE notes

2019-10-04 Thread no-reply
Patchew URL: https://patchew.org/QEMU/20191004120313.5347-1-drjo...@redhat.com/



Hi,

This series seems to have some coding style problems. See output below for
more information:

Type: series
Message-id: 20191004120313.5347-1-drjo...@redhat.com
Subject: [PATCH v2] target/arm/arch_dump: Add SVE notes

=== TEST SCRIPT BEGIN ===
#!/bin/bash
git rev-parse base > /dev/null || exit 0
git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram
./scripts/checkpatch.pl --mailback base..
=== TEST SCRIPT END ===

Switched to a new branch 'test'
9d21a9a target/arm/arch_dump: Add SVE notes

=== OUTPUT BEGIN ===
ERROR: code indent should never use tabs
#21: FILE: include/elf.h:1653:
+#define NT_ARM_SVE^I0x405^I^I/* ARM Scalable Vector Extension$

WARNING: Block comments use a leading /* on a separate line
#21: FILE: include/elf.h:1653:
+#define NT_ARM_SVE 0x405   /* ARM Scalable Vector Extension

ERROR: code indent should never use tabs
#22: FILE: include/elf.h:1654:
+^I^I^I^I^I   registers */$

WARNING: Block comments use * on subsequent lines
#22: FILE: include/elf.h:1654:
+#define NT_ARM_SVE 0x405   /* ARM Scalable Vector Extension
+  registers */

WARNING: Block comments use a trailing */ on a separate line
#22: FILE: include/elf.h:1654:
+  registers */

total: 2 errors, 3 warnings, 183 lines checked

Commit 9d21a9a5cba2 (target/arm/arch_dump: Add SVE notes) has style problems, 
please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.
=== OUTPUT END ===

Test command exited with code: 1


The full log is available at
http://patchew.org/logs/20191004120313.5347-1-drjo...@redhat.com/testing.checkpatch/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

Re: [PATCH v2 00/11] RFC crypto/luks: encryption key managment using amend interface

2019-10-04 Thread Max Reitz
On 13.09.19 00:30, Maxim Levitsky wrote:
> This patch series is continuation of my work to add encryption
> key managment to luks/qcow2 with luks.
> 
> This is second version of this patch set.
> The changes are mostly addressing the review feedback,
> plus I tested (and fixed sadly) the somewhat ugly code
> that allows to still write share a raw luks device,
> while preveting the key managment from happening in this case,
> as it is unsafe.
> I added a new iotest dedicated to that as well.
> 
> Best regards,
>   Maxim Levitsky

At least for an RFC looks good from my perspective.  I didn’t look at
the crypto things very closely (assuming Dan would do so), and I didn’t
check the iotests in detail.  (But it definitely doesn’t look like they
lack in breadth.  Maybe I’d like to see a test that you cannot have
other useful nodes attached to the LUKS or qcow2 node while the
amendment process is ongoing (because CONSISTENT_READ is unshared).  But
that’s the only thing I can think of.)

Max



signature.asc
Description: OpenPGP digital signature


Re: [PATCH v2 11/11] iotests : add tests for encryption key management

2019-10-04 Thread Max Reitz
On 13.09.19 00:30, Maxim Levitsky wrote:
> Note that currently I add tests 300-302, which are
> placeholders to ease the rebase. In final version
> of these patches I will update these.
> 
> Signed-off-by: Maxim Levitsky 
> ---
>  tests/qemu-iotests/300 | 202 +
>  tests/qemu-iotests/300.out |  98 +++
>  tests/qemu-iotests/301 |  90 +
>  tests/qemu-iotests/301.out |  30 +
>  tests/qemu-iotests/302 | 252 +
>  tests/qemu-iotests/302.out |  18 +++
>  tests/qemu-iotests/303 | 228 +
>  tests/qemu-iotests/303.out |  28 +
>  tests/qemu-iotests/group   |   9 ++
>  9 files changed, 955 insertions(+)
>  create mode 100755 tests/qemu-iotests/300
>  create mode 100644 tests/qemu-iotests/300.out
>  create mode 100755 tests/qemu-iotests/301
>  create mode 100644 tests/qemu-iotests/301.out
>  create mode 100644 tests/qemu-iotests/302
>  create mode 100644 tests/qemu-iotests/302.out
>  create mode 100644 tests/qemu-iotests/303
>  create mode 100644 tests/qemu-iotests/303.out

[...]

> diff --git a/tests/qemu-iotests/303.out b/tests/qemu-iotests/303.out
> new file mode 100644
> index 00..1cf3917208
> --- /dev/null
> +++ b/tests/qemu-iotests/303.out
> @@ -0,0 +1,28 @@
> +qemu-img: Failed to get shared "consistent read" lock
> +Is another process using the image 
> [/home/mlevitsk/USERSPACE/qemu/build-luks/tests/qemu-iotests/scratch/test.img]?

Ah, this should be filtered.

Max



signature.asc
Description: OpenPGP digital signature


[Bug 1846816] [NEW] Booting error on AIX 6.1 "Illegal Trap Instruction Interrupt in Kernel""

2019-10-04 Thread Leonardo
Public bug reported:

# ls -ltr
total 8750584
-rw-rw-r--  1 linux linux 4274997248 Oct  4 18:33 AIX.vol1.iso
-rw-rw-r--  1 linux linux 4293888000 Oct  4 18:45 AIX.vol2.iso
-rw-rw-r--  1 linux linux  391485440 Oct  4 18:50 AIX.vol3.iso
-rw-r--r--  1 root  root  204608 Oct  4 19:00 AIX61.img

# qemu-system-ppc64 -cpu POWER8,compat=power7 -machine pseries -m 8192 -serial 
mon:stdio \
> -drive file=/qemu/BST/AIX61.img,if=none,id=drive-virtio-disk0 \
> -device virtio-scsi-pci,id=scsi -device scsi-hd,drive=drive-virtio-disk0 \
> -cdrom /qemu/BST/AIX.vol1.iso \
> -prom-env boot-command='boot cdrom: -s verbose'


VNC server running on ::1:5900
qemu-system-ppc64: warning: TCG doesn't support requested feature, 
cap-ibs=workaround


SLOF **
QEMU Starting
 Build Date = Jul  3 2019 12:26:14
 FW Version = git-ba1ab360eebe6338
 Press "s" to enter Open Firmware.

Populating /vdevice methods
Populating /vdevice/vty@7100
Populating /vdevice/nvram@7101
Populating /vdevice/l-lan@7102
Populating /vdevice/v-scsi@7103
   SCSI: Looking for devices
  8200 CD-ROM   : "QEMU QEMU CD-ROM  2.5+"
Populating /pci@8002000
 00  (D) : 1234 qemu vga
 00 0800 (D) : 1033 0194serial bus [ usb-xhci ]
 00 1000 (D) : 1af4 1004virtio [ scsi ]
Populating /pci@8002000/scsi@2
   SCSI: Looking for devices
  100 DISK : "QEMU QEMU HARDDISK2.5+"
Installing QEMU fb


Scanning USB
  XHCI: Initializing
USB Keyboard
USB mouse
No console specified using screen & keyboard

  Welcome to Open Firmware

  Copyright (c) 2004, 2017 IBM Corporation All rights reserved.
  This program and the accompanying materials are made available
  under the terms of the BSD License available at
  http://www.opensource.org/licenses/bsd-license.php


Trying to load: -s verbose from: 
/vdevice/v-scsi@7103/disk@8200: ...   Successfully loaded
qemu-system-ppc64: Couldn't negotiate a suitable PVR during CAS
AIX
StarLED{814}

AIX Version 6.1
exec(/etc/init){1,0}

INIT: EXECUTING /sbin/rc.boot 1
exec(/usr/bin/sh,-c,/sbin/rc.boot 1){1114146,1}
exec(/sbin/rc.boot,/sbin/rc.boot,1){1114146,1}
+ PHASE=1
+ + bootinfo -p
exec(/usr/sbin/bootinfo,-p){1179684,1114146}
PLATFORM=chrp
+ [ ! -x /usr/lib/boot/bin/bootinfo_chrp ]
+ [ 1 -eq 1 ]
+ 1> /usr/lib/libc.a
+ init -c unlink /usr/lib/boot/bin/!(*_chrp)
exec(/etc/init,-c,unlink /usr/lib/boot/bin/!(*_chrp)){1179686,1114146}
+ chramfs -t
exec(/usr/sbin/chramfs,-t){1179688,1114146}
+ init -c unlink /usr/sbin/chramfs
+ 1> /dev/null
exec(/etc/init,-c,unlink /usr/sbin/chramfs){1179690,1114146}
+ + bootinfo -t
exec(/usr/sbin/bootinfo,-t){1179692,1114146}
BOOTYPE=3
+ [ 0 -ne 0 ]
+ [ -z 3 ]
+ unset pdev_to_ldev undolt native_netboot_cfg
+ unset disknet_odm_init config_ATM
+ /usr/lib/methods/showled 0x510 DEV CFG 1 START
exec(/usr/lib/methods/showled,0x510,DEV CFG 1 START){1179694,1114146}
+ cfgmgr -f -v
exec(/usr/sbin/cfgmgr,-f,-v){1179696,1114146}
cfgmgr is running in phase 1

Time: 0 LEDS: 0x538
Invoking top level program -- "/etc/methods/defsys"
exec(/bin/sh,-c,/etc/methods/defsys ){1245222,1179696}
exec(/etc/methods/defsys){1245222,1179696}
exec(/bin/sh,-c,/usr/lib/methods/define_rspc -n -c sys -s node -t 
chrp){1310760,1245222}
exec(/usr/lib/methods/define_rspc,-n,-c,sys,-s,node,-t,chrp){1310760,1245222}
Time: 0 LEDS: 0x539
Return code = 0
* stdout *
sys0

*** no stderr 

Attempting to configure device 'sys0'
Time: 0 LEDS: 0x811
Invoking /usr/lib/methods/cfgsys_chrp -1 -l sys0
exec(/bin/sh,-c,/usr/lib/methods/cfgsys_chrp -1 -l sys0){1245224,1179696}
Number of running methods: 1
exec(/usr/lib/methods/cfgsys_chrp,-1,-l,sys0){1245224,1179696}
LED{A20}
Illegal Trap Instruction Interrupt in Kernel
04151A74  tweqir0,0r0=0
KDB(0)>

** Affects: qemu
 Importance: Undecided
 Status: New


** Tags: aix

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1846816

Title:
  Booting error on AIX 6.1 "Illegal Trap Instruction Interrupt in
  Kernel""

Status in QEMU:
  New

Bug description:
  # ls -ltr
  total 8750584
  -rw-rw-r--  1 linux linux 4274997248 Oct  4 18:33 AIX.vol1.iso
  -rw-rw-r--  1 linux linux 4293888000 Oct  4 18:45 AIX.vol2.iso
  -rw-rw-r--  1 linux linux  391485440 Oct  4 18:50 AIX.vol3.iso
  -rw-r--r--  1 root  root  204608 Oct  4 19:00 AIX61.img

  # qemu-system-ppc64 -cpu POWER8,compat=power7 -machine pseries -m 8192 
-serial mon:stdio \
  > -drive file=/qemu/BST/AIX61.img,if=none,id=drive-virtio-disk0 \
  > -device virtio-scsi-pci,id=scsi -device scsi-hd,drive=drive-virtio-disk0 \
  > -cdrom /qemu/BST/AIX.vol1.iso \
  > -prom-env boot-command='boot cdrom: -s verbose'

  
  VNC server running on 

Re: [PATCH 6/7] tests/fw_cfg: Declare one QFWCFG for all tests

2019-10-04 Thread Laszlo Ersek
On 10/04/19 00:54, Philippe Mathieu-Daudé wrote:
> It is pointless to create/remove a QFWCFG object for each test.
> Move it to the test context and create/remove it only once.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  tests/fw_cfg-test.c | 74 +
>  1 file changed, 27 insertions(+), 47 deletions(-)

Reviewed-by: Laszlo Ersek 




Re: [PATCH v2 09/11] block/qcow2: implement blockdev-amend

2019-10-04 Thread Max Reitz
On 13.09.19 00:30, Maxim Levitsky wrote:
> Currently only for changing crypto parameters

Yep, that elegantly avoids most of the problems we’d have otherwise. :-)

> Signed-off-by: Maxim Levitsky 
> ---
>  block/qcow2.c| 71 
>  qapi/block-core.json |  6 ++--
>  2 files changed, 75 insertions(+), 2 deletions(-)
> 
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 26f83aeb44..c8847ec6e2 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -3079,6 +3079,18 @@ qcow2_co_create(BlockdevCreateOptions *create_options, 
> Error **errp)
>  assert(create_options->driver == BLOCKDEV_DRIVER_QCOW2);
>  qcow2_opts = _options->u.qcow2;
>  
> +if (!qcow2_opts->has_size) {
> +error_setg(errp, "Size is manadatory for image creation");
> +return -EINVAL;
> +
> +}
> +
> +if (!qcow2_opts->has_file) {
> +error_setg(errp, "'file' is manadatory for image creation");
> +return -EINVAL;
> +
> +}
> +
>  bs = bdrv_open_blockdev_ref(qcow2_opts->file, errp);
>  if (bs == NULL) {
>  return -EIO;
> @@ -5111,6 +5123,64 @@ static int qcow2_amend_options(BlockDriverState *bs, 
> QemuOpts *opts,
>  return 0;
>  }
>  
> +
> +static int coroutine_fn qcow2_co_amend(BlockDriverState *bs,
> +   BlockdevCreateOptions *opts,
> +   bool force,
> +   Error **errp)
> +{
> +BlockdevCreateOptionsQcow2 *qopts = >u.qcow2;
> +BDRVQcow2State *s = bs->opaque;
> +int ret;
> +
> +/*
> + * This is ugly as hell, in later versions of this patch
> + * something has to be done about this

Well, at least the language of the comment needs adjustment. :-)

> + */
> +if (qopts->has_file || qopts->has_size || qopts->has_data_file ||
> +qopts->has_data_file_raw || qopts->has_version ||
> +qopts->has_backing_file || qopts->has_backing_fmt ||
> +qopts->has_cluster_size || qopts->has_preallocation ||
> +qopts->has_lazy_refcounts || qopts->has_refcount_bits) {
> +
> +error_setg(errp,
> +"Only LUKS encryption options can be amended for qcow2 with 
> blockdev-amend");
> +return -EOPNOTSUPP;
> +
> +}
> +
> +if (qopts->has_encrypt) {
> +if (!s->crypto) {
> +error_setg(errp, "QCOW2 image is not encrypted, can't amend");
> +return -EOPNOTSUPP;
> +}
> +
> +if (qopts->encrypt->format != Q_CRYPTO_BLOCK_FORMAT_LUKS) {
> +error_setg(errp,
> +   "Amend can't be used to change the qcow2 encryption 
> format");
> +return -EOPNOTSUPP;
> +}
> +
> +if (s->crypt_method_header != QCOW_CRYPT_LUKS) {
> +error_setg(errp,
> +   "Only LUKS encryption options can be amended for 
> qcow2 with blockdev-amend");
> +return -EOPNOTSUPP;
> +}
> +
> +ret = qcrypto_block_amend_options(s->crypto,
> +  qcow2_crypto_hdr_read_func,
> +  qcow2_crypto_hdr_write_func,
> +  bs,
> +  qopts->encrypt,
> +  force,
> +  errp);
> +if (ret) {
> +return ret;
> +}
> +}
> +return 0;

I suppose we need to do the same permission stuff as for LUKS, though;
i.e., unshare CONSISTENT_READ while this operation is ongoing.

> +}
> +
>  /*
>   * If offset or size are negative, respectively, they will not be included in
>   * the BLOCK_IMAGE_CORRUPTED event emitted.
> @@ -5303,6 +5373,7 @@ BlockDriver bdrv_qcow2 = {
>  .mutable_opts= mutable_opts,
>  .bdrv_co_check   = qcow2_co_check,
>  .bdrv_amend_options  = qcow2_amend_options,
> +.bdrv_co_amend   = qcow2_co_amend,
>  
>  .bdrv_detach_aio_context  = qcow2_detach_aio_context,
>  .bdrv_attach_aio_context  = qcow2_attach_aio_context,
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 4a6db98938..0eb4e45168 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -4294,6 +4294,7 @@
>  # Driver specific image creation options for qcow2.
>  #
>  # @file Node to create the image format on
> +#   Mandatory for create
>  # @data-fileNode to use as an external data file in which all guest
>  #   data is stored so that only metadata remains in the qcow2
>  #   file (since: 4.0)
> @@ -4301,6 +4302,7 @@
>  #   standalone (read-only) raw image without looking at qcow2
>  #   metadata (default: false; since: 4.0)
>  # @size Size of the virtual disk in bytes
> +#   Mandatory for create
>  # @version  Compatibility level 

Re: [PATCH 5/7] tests/libqos/fw_cfg: Pass QTestState as argument

2019-10-04 Thread Laszlo Ersek
On 10/04/19 00:54, Philippe Mathieu-Daudé wrote:
> Since a QFWCFG object is not tied to a particular test, we can
> call *_fw_cfg_init() once before creating QTests and use the same
> for all the tests, then release the object with g_free() once all
> the tests are run.
> 
> Refactor the qfw_cfg* API to take QTestState as argument.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  tests/boot-order-test.c  | 12 
>  tests/fw_cfg-test.c  | 49 
>  tests/libqos/fw_cfg.c| 61 
>  tests/libqos/fw_cfg.h| 30 +---
>  tests/libqos/malloc-pc.c |  4 +--
>  5 files changed, 77 insertions(+), 79 deletions(-)

Not much fun to review. :)

Reviewed-by: Laszlo Ersek 



Re: [PATCH v2 07/11] block: add x-blockdev-amend qmp command

2019-10-04 Thread Max Reitz
On 13.09.19 00:30, Maxim Levitsky wrote:
> Signed-off-by: Maxim Levitsky 
> ---
>  block/Makefile.objs   |   2 +-
>  block/amend.c | 116 ++
>  include/block/block_int.h |  23 ++--
>  qapi/block-core.json  |  26 +
>  qapi/job.json |   4 +-
>  5 files changed, 163 insertions(+), 8 deletions(-)
>  create mode 100644 block/amend.c

I think I’d move this (and everything to belongs to it) to a different
series.

> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 35f3bca4d9..10d0308792 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -18,7 +18,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o
>  block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o
>  block-obj-$(CONFIG_POSIX) += file-posix.o
>  block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
> -block-obj-y += null.o mirror.o commit.o io.o create.o
> +block-obj-y += null.o mirror.o commit.o io.o create.o amend.o
>  block-obj-y += throttle-groups.o
>  block-obj-$(CONFIG_LINUX) += nvme.o
>  
> diff --git a/block/amend.c b/block/amend.c
> new file mode 100644
> index 00..9bd28e08e7
> --- /dev/null
> +++ b/block/amend.c
> @@ -0,0 +1,116 @@
> +/*
> + * Block layer code related to image options amend
> + *
> + * Copyright (c) 2018 Kevin Wolf 
> + * Copyright (c) 2019 Maxim Levitsky 
> + *
> + * Heavily based on create.c
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a 
> copy
> + * of this software and associated documentation files (the "Software"), to 
> deal
> + * in the Software without restriction, including without limitation the 
> rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
> FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "block/block_int.h"
> +#include "qemu/job.h"
> +#include "qemu/main-loop.h"
> +#include "qapi/qapi-commands-block-core.h"
> +#include "qapi/qapi-visit-block-core.h"
> +#include "qapi/clone-visitor.h"
> +#include "qapi/error.h"
> +
> +typedef struct BlockdevAmendJob {
> +Job common;
> +BlockdevCreateOptions *opts;
> +BlockDriverState *bs;
> +bool force;
> +} BlockdevAmendJob;
> +
> +static int coroutine_fn blockdev_amend_run(Job *job, Error **errp)
> +{
> +BlockdevAmendJob *s = container_of(job, BlockdevAmendJob, common);
> +int ret;
> +
> +job_progress_set_remaining(>common, 1);
> +ret = s->bs->drv->bdrv_co_amend(s->bs, s->opts, s->force, errp);
> +job_progress_update(>common, 1);

It would be nice if the amend job could make use of the progress
reporting that we have in place for amend.

> +
> +qapi_free_BlockdevCreateOptions(s->opts);
> +
> +return ret;
> +}
> +
> +static const JobDriver blockdev_amend_job_driver = {
> +.instance_size = sizeof(BlockdevAmendJob),
> +.job_type  = JOB_TYPE_AMEND,
> +.run   = blockdev_amend_run,
> +};
> +
> +void qmp_x_blockdev_amend(const char *job_id,
> +const char *node_name,
> +BlockdevCreateOptions *options,
> +bool has_force,
> +bool force,
> +Error **errp)
> +{
> +BlockdevAmendJob *s;
> +const char *fmt = BlockdevDriver_str(options->driver);
> +BlockDriver *drv = bdrv_find_format(fmt);
> +BlockDriverState *bs = bdrv_find_node(node_name);
> +
> +/*
> + * If the driver is in the schema, we know that it exists. But it may not
> + * be whitelisted.
> + */
> +assert(drv);
> +if (bdrv_uses_whitelist() && !bdrv_is_whitelisted(drv, false)) {
> +error_setg(errp, "Driver is not whitelisted");
> +return;
> +}
> +
> +if (bs->drv != drv) {
> +error_setg(errp,
> +   "x-blockdev-amend doesn't support changing the block 
> driver");
> +return;
> +
> +}
> +
> +/* Error out if the driver doesn't support .bdrv_co_amend */
> +if (!drv->bdrv_co_amend) {
> +error_setg(errp, "Driver does not support x-blockdev-amend");
> +return;
> +}
> +
> +/*
> + * Create the block job
> + * TODO Running in the main context. Block drivers 

Re: [Virtio-fs] [PATCH] virtiofsd: Add pread64() to seccomp list for posix_fallocate()

2019-10-04 Thread Dr. David Alan Gilbert
* Misono Tomohiro (misono.tomoh...@jp.fujitsu.com) wrote:
> I test virtiofs with NFS 4.0 as backend and notice that fallocate
> causes system hang (kernel: 5.4-rc1, qemu: virtio-fs-dev branch):
>  $ mount -t virtiofs myfs /mnt
>  $ dd if=/dev/urandom bs=1000 seek=1 count=1 of=/mnt/file
>  $ fallocate -l 2000 /mnt/file # system hang
> 
> This is because:
>  1. virtiofs supports fallocate syscall while NFS 4.0 does not.
>  2. virtiofsd uses posix_fallocate() and it fallbacks to pread64()/
> pwrite64() sequence to reserve blocks if fallocate syscall is
> not available.
>  3. pread64() syscall is prohibited by seccomp and virtiofsd thread
> is killed by SIGSYS.
> 
> So, just add pread64() to seccomp white list to fix this problem.
> 
> Signed-off-by: Misono Tomohiro 

Thanks, I've squashed this into our seccomp commit.
(It'll be a little while before I push it out, I need to finish a tidyup
I'm doing).

Dave

> ---
>  contrib/virtiofsd/seccomp.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/contrib/virtiofsd/seccomp.c b/contrib/virtiofsd/seccomp.c
> index 93b679271d..88b61bca42 100644
> --- a/contrib/virtiofsd/seccomp.c
> +++ b/contrib/virtiofsd/seccomp.c
> @@ -61,6 +61,7 @@ static const int syscall_whitelist[] = {
>   SCMP_SYS(ppoll),
>   SCMP_SYS(prctl), /* TODO restrict to just PR_SET_NAME? */
>   SCMP_SYS(preadv),
> + SCMP_SYS(pread64),
>   SCMP_SYS(pwritev),
>   SCMP_SYS(pwrite64),
>   SCMP_SYS(read),
> -- 
> 2.21.0
> 
> ___
> Virtio-fs mailing list
> virtio...@redhat.com
> https://www.redhat.com/mailman/listinfo/virtio-fs
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK



Re: [PATCH 4/7] tests/fw_cfg: Let the tests use a context

2019-10-04 Thread Laszlo Ersek
On 10/04/19 00:54, Philippe Mathieu-Daudé wrote:
> Introduce the local QTestCtx structure, and register the tests
> with qtest_add_data_func(ctx).
> For now the context only contains the machine name (which is
> fixed to the 'pc' machine, since this test only runs on the
> x86 architecture).
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  tests/fw_cfg-test.c | 93 -
>  1 file changed, 59 insertions(+), 34 deletions(-)
> 
> diff --git a/tests/fw_cfg-test.c b/tests/fw_cfg-test.c
> index 53ae82f7c8..77a833d50e 100644
> --- a/tests/fw_cfg-test.c
> +++ b/tests/fw_cfg-test.c
> @@ -23,13 +23,18 @@ static uint16_t max_cpus = 1;
>  static uint64_t nb_nodes = 0;
>  static uint16_t boot_menu = 0;
>  
> -static void test_fw_cfg_signature(void)
> +typedef struct {
> +const char *machine_name;
> +} QTestCtx;
> +
> +static void test_fw_cfg_signature(const void *opaque)
>  {
> +QTestCtx *ctx = (QTestCtx *)opaque;

(1) I'd suggest:

const QTestCtx *ctx = opaque;

(I think that should work fine with the current patch; not sure if it
will work with the rest of the series, when you perhaps introduce more
members to QTestCtx.)

This request applies a few more times to this patch, of course.

Another (different) comment:

>  QFWCFG *fw_cfg;
>  QTestState *s;
>  char buf[5];
>  
> -s = qtest_init("");
> +s = qtest_initf("-M %s", ctx->machine_name);
>  fw_cfg = pc_fw_cfg_init(s);
>  
>  qfw_cfg_get(fw_cfg, FW_CFG_SIGNATURE, buf, 4);
> @@ -40,13 +45,14 @@ static void test_fw_cfg_signature(void)
>  qtest_quit(s);
>  }
>  
> -static void test_fw_cfg_id(void)
> +static void test_fw_cfg_id(const void *opaque)
>  {
> +QTestCtx *ctx = (QTestCtx *)opaque;
>  QFWCFG *fw_cfg;
>  QTestState *s;
>  uint32_t id;
>  
> -s = qtest_init("");
> +s = qtest_initf("-M %s", ctx->machine_name);
>  fw_cfg = pc_fw_cfg_init(s);
>  
>  id = qfw_cfg_get_u32(fw_cfg, FW_CFG_ID);
> @@ -56,8 +62,9 @@ static void test_fw_cfg_id(void)
>  qtest_quit(s);
>  }
>  
> -static void test_fw_cfg_uuid(void)
> +static void test_fw_cfg_uuid(const void *opaque)
>  {
> +QTestCtx *ctx = (QTestCtx *)opaque;
>  QFWCFG *fw_cfg;
>  QTestState *s;
>  
> @@ -67,7 +74,7 @@ static void test_fw_cfg_uuid(void)
>  0x8a, 0xcb, 0x81, 0xc6, 0xea, 0x54, 0xf2, 0xd8,
>  };
>  
> -s = qtest_init("-uuid 4600cb32-38ec-4b2f-8acb-81c6ea54f2d8");
> +s = qtest_initf("-M %s -uuid 4600cb32-38ec-4b2f-8acb-81c6ea54f2d8", 
> ctx->machine_name);
>  fw_cfg = pc_fw_cfg_init(s);
>  
>  qfw_cfg_get(fw_cfg, FW_CFG_UUID, buf, 16);
> @@ -78,12 +85,13 @@ static void test_fw_cfg_uuid(void)
>  
>  }
>  
> -static void test_fw_cfg_ram_size(void)
> +static void test_fw_cfg_ram_size(const void *opaque)
>  {
> +QTestCtx *ctx = (QTestCtx *)opaque;
>  QFWCFG *fw_cfg;
>  QTestState *s;
>  
> -s = qtest_init("");
> +s = qtest_initf("-M %s", ctx->machine_name);
>  fw_cfg = pc_fw_cfg_init(s);
>  
>  g_assert_cmpint(qfw_cfg_get_u64(fw_cfg, FW_CFG_RAM_SIZE), ==, ram_size);
> @@ -92,12 +100,13 @@ static void test_fw_cfg_ram_size(void)
>  qtest_quit(s);
>  }
>  
> -static void test_fw_cfg_nographic(void)
> +static void test_fw_cfg_nographic(const void *opaque)
>  {
> +QTestCtx *ctx = (QTestCtx *)opaque;
>  QFWCFG *fw_cfg;
>  QTestState *s;
>  
> -s = qtest_init("");
> +s = qtest_initf("-M %s", ctx->machine_name);
>  fw_cfg = pc_fw_cfg_init(s);
>  
>  g_assert_cmpint(qfw_cfg_get_u16(fw_cfg, FW_CFG_NOGRAPHIC), ==, 0);
> @@ -106,12 +115,13 @@ static void test_fw_cfg_nographic(void)
>  qtest_quit(s);
>  }
>  
> -static void test_fw_cfg_nb_cpus(void)
> +static void test_fw_cfg_nb_cpus(const void *opaque)
>  {
> +QTestCtx *ctx = (QTestCtx *)opaque;
>  QFWCFG *fw_cfg;
>  QTestState *s;
>  
> -s = qtest_init("");
> +s = qtest_initf("-M %s", ctx->machine_name);
>  fw_cfg = pc_fw_cfg_init(s);
>  
>  g_assert_cmpint(qfw_cfg_get_u16(fw_cfg, FW_CFG_NB_CPUS), ==, nb_cpus);
> @@ -120,12 +130,13 @@ static void test_fw_cfg_nb_cpus(void)
>  qtest_quit(s);
>  }
>  
> -static void test_fw_cfg_max_cpus(void)
> +static void test_fw_cfg_max_cpus(const void *opaque)
>  {
> +QTestCtx *ctx = (QTestCtx *)opaque;
>  QFWCFG *fw_cfg;
>  QTestState *s;
>  
> -s = qtest_init("");
> +s = qtest_initf("-M %s", ctx->machine_name);
>  fw_cfg = pc_fw_cfg_init(s);
>  
>  g_assert_cmpint(qfw_cfg_get_u16(fw_cfg, FW_CFG_MAX_CPUS), ==, max_cpus);
> @@ -133,14 +144,15 @@ static void test_fw_cfg_max_cpus(void)
>  qtest_quit(s);
>  }
>  
> -static void test_fw_cfg_numa(void)
> +static void test_fw_cfg_numa(const void *opaque)
>  {
> +QTestCtx *ctx = (QTestCtx *)opaque;
>  QFWCFG *fw_cfg;
>  QTestState *s;
>  uint64_t *cpu_mask;
>  uint64_t *node_mask;
>  
> -s = qtest_init("");
> +s = qtest_initf("-M %s", ctx->machine_name);
>  fw_cfg = 

Re: [PATCH v2] target/ppc: Fix for optimized vsl/vsr instructions

2019-10-04 Thread Paul Clarke
On 10/4/19 8:43 AM, Stefan Brankovic wrote:
> In previous implementation, invocation of TCG shift function could request
> shift of TCG variable by 64 bits when variable 'sh' is 0, which is not
> supported in TCG (values can be shifted by 0 to 63 bits). This patch fixes
> this by using two separate invocation of TCG shift functions, with maximum
> shift amount of 32.
> 
> Name of variable 'shifted' is changed to 'carry' so variable naming
> is similar to old helper implementation.
> 
> Variables 'avrA' and 'avrB' are replaced with variable 'avr'.
> 
> Fixes: 4e6d0920e7547e6af4bbac5ffe9adfe6ea621822
> Reported-by: Paul Clark 

Preferred: "Paul A. Clarke" (for historical consistency)

> Reported-by: Mark Cave-Ayland 
> Suggested-by: Aleksandar Markovic 
> Signed-off-by: Stefan Brankovic 

Applying this patch on top of dce5a787c05fe1a3e54d92871cdeba2af6798e0d 
eliminated the failures that I reported in 
https://bugs.launchpad.net/qemu/+bug/1841990 associated with vsl/vsr.

Tested-by: Paul A. Clarke  

PC



Re: [PATCH v2] target/ppc: Fix for optimized vsl/vsr instructions

2019-10-04 Thread Aleksandar Markovic
> Reported-by: Paul Clark 

Stefan,

Paul's full name is Paul A. Clarke.

Thanks for the fix!

Aleksandar



Re: [PATCH v2 05/11] block/crypto: implement the encryption key management

2019-10-04 Thread Max Reitz
On 13.09.19 00:30, Maxim Levitsky wrote:
> This implements the encryption key management
> using the generic code in qcrypto layer
> (currently only for qemu-img amend)
> 
> This code adds another 'write_func' because the initialization
> write_func works directly on the underlying file,
> because during the creation, there is no open instance
> of the luks driver, but during regular use, we have it,
> and should use it instead.
> 
> 
> This commit also adds a   'hack/workaround' I and Kevin Wolf (thanks)
> made to   make the driver still support write sharing,
> but be safe against concurrent  metadata update (the keys)
> Eventually write sharing for luks driver will be deprecated
> and removed together with this hack.
> 
> The hack is that we ask   (as a format driver) for
> BLK_PERM_CONSISTENT_READ always
> (technically always unless opened with BDRV_O_NO_IO)
> 
> and then when we want to update   the keys, we
> unshare   that permission. So if someone else
> has the   image open, even readonly, this will fail.
> 
> Also thanks to Daniel Berrange for the variant of
> that hack that involves   asking for read,
> rather that write permission
> 
> Signed-off-by: Maxim Levitsky 
> ---
>  block/crypto.c | 118 +++--
>  1 file changed, 115 insertions(+), 3 deletions(-)
> 
> diff --git a/block/crypto.c b/block/crypto.c
> index a6a3e1f1d8..f42fa057e6 100644
> --- a/block/crypto.c
> +++ b/block/crypto.c
> @@ -36,6 +36,7 @@ typedef struct BlockCrypto BlockCrypto;
>  
>  struct BlockCrypto {
>  QCryptoBlock *block;
> +bool updating_keys;
>  };
>  
>  
> @@ -70,6 +71,24 @@ static ssize_t block_crypto_read_func(QCryptoBlock *block,
>  return ret;
>  }
>  
> +static ssize_t block_crypto_write_func(QCryptoBlock *block,
> +   size_t offset,
> +   const uint8_t *buf,
> +   size_t buflen,
> +   void *opaque,
> +   Error **errp)

There’s already a function of this name for creation.

> +{
> +BlockDriverState *bs = opaque;
> +ssize_t ret;
> +
> +ret = bdrv_pwrite(bs->file, offset, buf, buflen);
> +if (ret < 0) {
> +error_setg_errno(errp, -ret, "Could not write encryption header");
> +return ret;
> +}
> +return ret;
> +}
> +
>  
>  struct BlockCryptoCreateData {
>  BlockBackend *blk;

[...]

> +static void
> +block_crypto_child_perms(BlockDriverState *bs, BdrvChild *c,
> + const BdrvChildRole *role,
> + BlockReopenQueue *reopen_queue,
> + uint64_t perm, uint64_t shared,
> + uint64_t *nperm, uint64_t *nshared)
> +{
> +
> +BlockCrypto *crypto = bs->opaque;
> +
> +/*
> + * Ask for consistent read permission so that if
> + * someone else tries to open this image with this permission
> + * neither will be able to edit encryption keys
> + */
> +if (!(bs->open_flags & BDRV_O_NO_IO)) {
> +perm |= BLK_PERM_CONSISTENT_READ;
> +}
> +
> +/*
> + * This driver doesn't modify LUKS metadata except
> + * when updating the encryption slots.
> + * Thus unlike a proper format driver we don't ask for
> + * shared write permission. However we need it
> + * when we area updating keys, to ensure that only we
> + * had opened the device r/w
> + *
> + * Encryption update will set the crypto->updating_keys
> + * during that period and refresh permissions
> + *
> + */
> +
> +if (crypto->updating_keys) {
> +/*need exclusive write access for header update  */
> +perm |= BLK_PERM_WRITE;
> +shared &= ~(BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE);
> +}
> +
> +bdrv_filter_default_perms(bs, c, role, reopen_queue,
> +perm, shared, nperm, nshared);
> +}

This will probably work, but usually drivers do it the other way around:
First call any of the default_perms(), and then adjust *nperm and
*nshared as required.

(perm/shared are what the parents need, *nperm/*nshared is what this
driver needs, so it makes more sense that way; and this way nobody has
to check whether the settings survived the default_perms() call.)

((But the permissions themselves do look correct.))

Max



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 3/7] tests/libqos/fw_cfg: Document pc_fw_cfg_init to drop pc_fw_cfg_uninit

2019-10-04 Thread Laszlo Ersek
On 10/04/19 00:54, Philippe Mathieu-Daudé wrote:
> Document pc_fw_cfg_init() return value must be released
> with g_free(). Directly calling g_free() we don't really
> need pc_fw_cfg_uninit(): remove it.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  tests/fw_cfg-test.c  | 22 +++---
>  tests/libqos/fw_cfg.h| 14 +-
>  tests/libqos/malloc-pc.c |  2 +-
>  3 files changed, 21 insertions(+), 17 deletions(-)
> 
> diff --git a/tests/fw_cfg-test.c b/tests/fw_cfg-test.c
> index 1d3147f821..53ae82f7c8 100644
> --- a/tests/fw_cfg-test.c
> +++ b/tests/fw_cfg-test.c
> @@ -36,7 +36,7 @@ static void test_fw_cfg_signature(void)
>  buf[4] = 0;
>  
>  g_assert_cmpstr(buf, ==, "QEMU");
> -pc_fw_cfg_uninit(fw_cfg);
> +g_free(fw_cfg);
>  qtest_quit(s);
>  }
>  
> @@ -52,7 +52,7 @@ static void test_fw_cfg_id(void)
>  id = qfw_cfg_get_u32(fw_cfg, FW_CFG_ID);
>  g_assert((id == 1) ||
>   (id == 3));
> -pc_fw_cfg_uninit(fw_cfg);
> +g_free(fw_cfg);
>  qtest_quit(s);
>  }
>  
> @@ -73,7 +73,7 @@ static void test_fw_cfg_uuid(void)
>  qfw_cfg_get(fw_cfg, FW_CFG_UUID, buf, 16);
>  g_assert(memcmp(buf, uuid, sizeof(buf)) == 0);
>  
> -pc_fw_cfg_uninit(fw_cfg);
> +g_free(fw_cfg);
>  qtest_quit(s);
>  
>  }
> @@ -88,7 +88,7 @@ static void test_fw_cfg_ram_size(void)
>  
>  g_assert_cmpint(qfw_cfg_get_u64(fw_cfg, FW_CFG_RAM_SIZE), ==, ram_size);
>  
> -pc_fw_cfg_uninit(fw_cfg);
> +g_free(fw_cfg);
>  qtest_quit(s);
>  }
>  
> @@ -102,7 +102,7 @@ static void test_fw_cfg_nographic(void)
>  
>  g_assert_cmpint(qfw_cfg_get_u16(fw_cfg, FW_CFG_NOGRAPHIC), ==, 0);
>  
> -pc_fw_cfg_uninit(fw_cfg);
> +g_free(fw_cfg);
>  qtest_quit(s);
>  }
>  
> @@ -116,7 +116,7 @@ static void test_fw_cfg_nb_cpus(void)
>  
>  g_assert_cmpint(qfw_cfg_get_u16(fw_cfg, FW_CFG_NB_CPUS), ==, nb_cpus);
>  
> -pc_fw_cfg_uninit(fw_cfg);
> +g_free(fw_cfg);
>  qtest_quit(s);
>  }
>  
> @@ -129,7 +129,7 @@ static void test_fw_cfg_max_cpus(void)
>  fw_cfg = pc_fw_cfg_init(s);
>  
>  g_assert_cmpint(qfw_cfg_get_u16(fw_cfg, FW_CFG_MAX_CPUS), ==, max_cpus);
> -pc_fw_cfg_uninit(fw_cfg);
> +g_free(fw_cfg);
>  qtest_quit(s);
>  }
>  
> @@ -158,7 +158,7 @@ static void test_fw_cfg_numa(void)
>  
>  g_free(node_mask);
>  g_free(cpu_mask);
> -pc_fw_cfg_uninit(fw_cfg);
> +g_free(fw_cfg);
>  qtest_quit(s);
>  }
>  
> @@ -171,7 +171,7 @@ static void test_fw_cfg_boot_menu(void)
>  fw_cfg = pc_fw_cfg_init(s);
>  
>  g_assert_cmpint(qfw_cfg_get_u16(fw_cfg, FW_CFG_BOOT_MENU), ==, 
> boot_menu);
> -pc_fw_cfg_uninit(fw_cfg);
> +g_free(fw_cfg);
>  qtest_quit(s);
>  }
>  
> @@ -190,7 +190,7 @@ static void test_fw_cfg_reboot_timeout(void)
>  g_assert_cmpint(filesize, ==, sizeof(reboot_timeout));
>  reboot_timeout = le32_to_cpu(reboot_timeout);
>  g_assert_cmpint(reboot_timeout, ==, 15);
> -pc_fw_cfg_uninit(fw_cfg);
> +g_free(fw_cfg);
>  qtest_quit(s);
>  }
>  
> @@ -209,7 +209,7 @@ static void test_fw_cfg_splash_time(void)
>  g_assert_cmpint(filesize, ==, sizeof(splash_time));
>  splash_time = le16_to_cpu(splash_time);
>  g_assert_cmpint(splash_time, ==, 12);
> -pc_fw_cfg_uninit(fw_cfg);
> +g_free(fw_cfg);
>  qtest_quit(s);
>  }
>  
> diff --git a/tests/libqos/fw_cfg.h b/tests/libqos/fw_cfg.h
> index 3247fd4000..6316f4c354 100644
> --- a/tests/libqos/fw_cfg.h
> +++ b/tests/libqos/fw_cfg.h
> @@ -54,14 +54,18 @@ QFWCFG *mm_fw_cfg_init(QTestState *qts, uint64_t base);
>   */
>  QFWCFG *io_fw_cfg_init(QTestState *qts, uint16_t base);
>  
> +/**
> + * pc_fw_cfg_init():
> + * @qts: The #QTestState that will be referred to by the QFWCFG object.
> + *
> + * This function is specific to the PC machine (X86).
> + *
> + * Returns a newly allocated QFWCFG object which must be released
> + * with a call to g_free() when no longer required.
> + */
>  static inline QFWCFG *pc_fw_cfg_init(QTestState *qts)
>  {
>  return io_fw_cfg_init(qts, 0x510);
>  }
>  
> -static inline void pc_fw_cfg_uninit(QFWCFG *fw_cfg)
> -{
> -g_free(fw_cfg);
> -}
> -
>  #endif
> diff --git a/tests/libqos/malloc-pc.c b/tests/libqos/malloc-pc.c
> index 6f92ce4135..949a99361d 100644
> --- a/tests/libqos/malloc-pc.c
> +++ b/tests/libqos/malloc-pc.c
> @@ -29,5 +29,5 @@ void pc_alloc_init(QGuestAllocator *s, QTestState *qts, 
> QAllocOpts flags)
>  alloc_init(s, flags, 1 << 20, MIN(ram_size, 0xE000), PAGE_SIZE);
>  
>  /* clean-up */
> -pc_fw_cfg_uninit(fw_cfg);
> +g_free(fw_cfg);
>  }
> 

I'm getting the impression we're trying to remove "PC" (x86) references
from the cleanup action :)

Reviewed-by: Laszlo Ersek 



Re: [PATCH 2/7] tests/libqos/fw_cfg: Document mm_fw_cfg_init to drop mm_fw_cfg_uninit

2019-10-04 Thread Laszlo Ersek
On 10/04/19 00:54, Philippe Mathieu-Daudé wrote:
> Document mm_fw_cfg_init() return value must be released
> with g_free(). mm_fw_cfg_uninit() was never used, remove it.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  tests/libqos/fw_cfg.c |  5 -
>  tests/libqos/fw_cfg.h | 10 +-
>  2 files changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/tests/libqos/fw_cfg.c b/tests/libqos/fw_cfg.c
> index 37c3f2cf4d..ddeec821db 100644
> --- a/tests/libqos/fw_cfg.c
> +++ b/tests/libqos/fw_cfg.c
> @@ -126,11 +126,6 @@ QFWCFG *mm_fw_cfg_init(QTestState *qts, uint64_t base)
>  return fw_cfg;
>  }
>  
> -void mm_fw_cfg_uninit(QFWCFG *fw_cfg)
> -{
> -g_free(fw_cfg);
> -}
> -
>  static void io_fw_cfg_select(QFWCFG *fw_cfg, uint16_t key)
>  {
>  qtest_outw(fw_cfg->qts, fw_cfg->base, key);
> diff --git a/tests/libqos/fw_cfg.h b/tests/libqos/fw_cfg.h
> index 15604040bd..3247fd4000 100644
> --- a/tests/libqos/fw_cfg.h
> +++ b/tests/libqos/fw_cfg.h
> @@ -34,8 +34,16 @@ uint64_t qfw_cfg_get_u64(QFWCFG *fw_cfg, uint16_t key);
>  size_t qfw_cfg_get_file(QFWCFG *fw_cfg, const char *filename,
>  void *data, size_t buflen);
>  
> +/**
> + * mm_fw_cfg_init():
> + * @qts: The #QTestState that will be referred to by the QFWCFG object.
> + * @base: The MMIO base address of the fw_cfg device in the guest.
> + *
> + * Returns a newly allocated QFWCFG object which must be released
> + * with a call to g_free() when no longer required.
> + */
>  QFWCFG *mm_fw_cfg_init(QTestState *qts, uint64_t base);
> -void mm_fw_cfg_uninit(QFWCFG *fw_cfg);
> +
>  /**
>   * io_fw_cfg_init():
>   * @qts: The #QTestState that will be referred to by the QFWCFG object.
> 

Sigh, we've never even called this function :/

Reviewed-by: Laszlo Ersek 



Re: [PATCH 1/7] tests/libqos/fw_cfg: Document io_fw_cfg_init to drop io_fw_cfg_uninit

2019-10-04 Thread Laszlo Ersek
On 10/04/19 00:54, Philippe Mathieu-Daudé wrote:
> Document io_fw_cfg_init() return value must be released
> with g_free(). Directly calling g_free() we don't really
> need io_fw_cfg_uninit(): remove it.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  tests/libqos/fw_cfg.c |  5 -
>  tests/libqos/fw_cfg.h | 11 +--
>  2 files changed, 9 insertions(+), 7 deletions(-)
> 
> diff --git a/tests/libqos/fw_cfg.c b/tests/libqos/fw_cfg.c
> index 1f46258f96..37c3f2cf4d 100644
> --- a/tests/libqos/fw_cfg.c
> +++ b/tests/libqos/fw_cfg.c
> @@ -157,8 +157,3 @@ QFWCFG *io_fw_cfg_init(QTestState *qts, uint16_t base)
>  
>  return fw_cfg;
>  }
> -
> -void io_fw_cfg_uninit(QFWCFG *fw_cfg)
> -{
> -g_free(fw_cfg);
> -}
> diff --git a/tests/libqos/fw_cfg.h b/tests/libqos/fw_cfg.h
> index 13325cc4ff..15604040bd 100644
> --- a/tests/libqos/fw_cfg.h
> +++ b/tests/libqos/fw_cfg.h
> @@ -36,8 +36,15 @@ size_t qfw_cfg_get_file(QFWCFG *fw_cfg, const char 
> *filename,
>  
>  QFWCFG *mm_fw_cfg_init(QTestState *qts, uint64_t base);
>  void mm_fw_cfg_uninit(QFWCFG *fw_cfg);
> +/**
> + * io_fw_cfg_init():
> + * @qts: The #QTestState that will be referred to by the QFWCFG object.
> + * @base: The I/O address of the fw_cfg device in the guest.
> + *
> + * Returns a newly allocated QFWCFG object which must be released
> + * with a call to g_free() when no longer required.
> + */
>  QFWCFG *io_fw_cfg_init(QTestState *qts, uint16_t base);
> -void io_fw_cfg_uninit(QFWCFG *fw_cfg);
>  
>  static inline QFWCFG *pc_fw_cfg_init(QTestState *qts)
>  {
> @@ -46,7 +53,7 @@ static inline QFWCFG *pc_fw_cfg_init(QTestState *qts)
>  
>  static inline void pc_fw_cfg_uninit(QFWCFG *fw_cfg)
>  {
> -io_fw_cfg_uninit(fw_cfg);
> +g_free(fw_cfg);
>  }
>  
>  #endif
> 


This more or less reverts 0729d833d6d6 ("tests/libqos: Add
io_fw_cfg_uninit() and mm_fw_cfg_uninit()", 2019-05-23).

I'm fine either way.

Reviewed-by: Laszlo Ersek 



Re: [PATCH v9 3/3] iotests: test nbd reconnect

2019-10-04 Thread Eric Blake

On 9/24/19 3:31 AM, Vladimir Sementsov-Ogievskiy wrote:


+def qemu_nbd_popen(*args):
+'''Run qemu-nbd in daemon mode and return the parent's exit code'''
+return subprocess.Popen(qemu_nbd_args + ['--persistent'] + list(args))
+


Should you also use a pid file here, and wait for the existence of the
pid file before returning (rather than hard-coding sleep(1))?


What do you mean / how to do it?

We want to wait until listening socket is prepared..


In shell:

qemu-nbd --pid-file=/path/to/file ...
while [ ! -e /path/to/file ]; do
  sleep ... # fractional second, or exponential, or whatever...
done
# Now the listening socket is indeed prepared

You'd have to translate that idiom to python.

Or:

pre-open Unix socket at /path/to/socket
LISTEN_PID=... LISTEN_FDS=1 qemu-nbd ... 3<>/path/to/socket

Now the socket is pre-created and passed into qemu-nbd via systemd 
socket activation, so you know the listening socket is ready without 
having to do any loop at all.  Here's a patch in libnbd where we just 
switched from waiting for the port to appear (because the test predated 
qemu-nbd pidfile support) to instead using socket activation, for reference:

https://github.com/libguestfs/libnbd/commit/352331d177

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org



Re: [PATCH v2 02/11] qcrypto-luks: extend the create options for upcoming encryption key management

2019-10-04 Thread Max Reitz
On 13.09.19 00:30, Maxim Levitsky wrote:
> Now you can specify which slot to put the encryption key to
> Plus add 'active' option which will let  user erase the key secret
> instead of adding it.
> Check that active=true it when creating.
> 
> Signed-off-by: Maxim Levitsky 
> ---
>  block/crypto.c |  2 ++
>  block/crypto.h | 16 +++
>  block/qcow2.c  |  2 ++
>  crypto/block-luks.c| 26 +++---
>  qapi/crypto.json   | 19 ++
>  tests/qemu-iotests/082.out | 54 ++
>  6 files changed, 115 insertions(+), 4 deletions(-)

(Just doing a cursory RFC-style review)

I think we also want to reject unlock-secret if it’s given for creation;
and I suppose it’d be more important to print which slots are OK than
the slot the user has given.  (It isn’t like we shouldn’t print that
slot index, but it’s more likely the user knows that than what the
limits are.  I think.)

Max



signature.asc
Description: OpenPGP digital signature


Re: ptimer use of bottom-half handlers

2019-10-04 Thread Peter Maydell
On Fri, 27 Sep 2019 at 11:01, Peter Maydell  wrote:
> The ptimer API is basically a down-counter (with a lot of bells
> and whistles), with functions to do things like "read current count"
> and "write current count". When the counter hits zero it typically
> reloads, and the ptimer code notifies a callback function in the
> device that's using ptimer. In the current implementation this
> is done using a QEMU bottom-half handler, so ptimer_trigger()
> calls replay_bh_schedule_event() on a QEMUBH that it was passed
> by the device when it was initialized. The comment says this is
> "to avoid reentrancy issues". That's a bit vague, but I assume the
> idea is to avoid the code of the device's 'triggered' callback
> being called from within eg ptimer_set_count(), because the
> device might be in the middle of changing multiple parts of the
> ptimer's state, its own state might not be consistent, etc.

In the course of trying to do some conversions of devices to
the new API I've proposed, I figured out the other part of what
this "to avoid reentrancy issues" probably is referring to:
if the device's 'triggered' callback itself calls a ptimer
function like ptimer_run() then potentially it could recurse:
 ptimer_trigger() -> trigger callback -> ptimer_run() ->
  ptimer_reload() -> ptimer_trigger() -> ...

I need to think a bit more carefully about what is supposed
to happen here and what we want to have happen. I guess at
the moment we'll just schedule the QEMU BH to run again,
so the trigger callback is called again, but not recursively.
So we should probably cause that to happen in the new scheme
(conceptually, something like "the trigger callback is
implicitly considered to be called from within a tx begin/
commit block, so (a) it doesn't need to do begin/commit itself
and (b) if it does something resulting in a repeat trigger
the second call will happen after it returns, not recursively" ?)

thanks
-- PMM



[PATCH v4 1/3] tests/migration: mem leak fix

2019-10-04 Thread Mao Zhongyi
‘data’ has the possibility of memory leaks, so use the
glib macros g_autofree recommended by CODING_STYLE.rst
to automatically release the memory that returned from
g_malloc().

Signed-off-by: Mao Zhongyi 
Reviewed-by: Alex Bennée 
---
 tests/migration/stress.c | 21 ++---
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/tests/migration/stress.c b/tests/migration/stress.c
index d9aa4afe92..d8a6f64af0 100644
--- a/tests/migration/stress.c
+++ b/tests/migration/stress.c
@@ -170,26 +170,14 @@ static unsigned long long now(void)
 static int stressone(unsigned long long ramsizeMB)
 {
 size_t pagesPerMB = 1024 * 1024 / PAGE_SIZE;
-char *ram = malloc(ramsizeMB * 1024 * 1024);
+g_autofree char *ram = g_malloc(ramsizeMB * 1024 * 1024);
 char *ramptr;
 size_t i, j, k;
-char *data = malloc(PAGE_SIZE);
+g_autofree char *data = g_malloc(PAGE_SIZE);
 char *dataptr;
 size_t nMB = 0;
 unsigned long long before, after;
 
-if (!ram) {
-fprintf(stderr, "%s (%05d): ERROR: cannot allocate %llu MB of RAM: 
%s\n",
-argv0, gettid(), ramsizeMB, strerror(errno));
-return -1;
-}
-if (!data) {
-fprintf(stderr, "%s (%d): ERROR: cannot allocate %d bytes of RAM: 
%s\n",
-argv0, gettid(), PAGE_SIZE, strerror(errno));
-free(ram);
-return -1;
-}
-
 /* We don't care about initial state, but we do want
  * to fault it all into RAM, otherwise the first iter
  * of the loop below will be quite slow. We cna't use
@@ -198,8 +186,6 @@ static int stressone(unsigned long long ramsizeMB)
 memset(ram, 0xfe, ramsizeMB * 1024 * 1024);
 
 if (random_bytes(data, PAGE_SIZE) < 0) {
-free(ram);
-free(data);
 return -1;
 }
 
@@ -227,9 +213,6 @@ static int stressone(unsigned long long ramsizeMB)
 }
 }
 }
-
-free(data);
-free(ram);
 }
 
 
-- 
2.17.1






[PATCH v4 3/3] tests/migration:fix unreachable path in stress test

2019-10-04 Thread Mao Zhongyi
If stressone() or stress() exits it's because of a failure
because the test runs forever otherwise, so change stressone
and stress type to void to make the exit_failure() as the exit
function of main().

Signed-off-by: Mao Zhongyi 
---
 tests/migration/stress.c | 13 +
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/tests/migration/stress.c b/tests/migration/stress.c
index f9626d50ee..a062ef6b55 100644
--- a/tests/migration/stress.c
+++ b/tests/migration/stress.c
@@ -167,7 +167,7 @@ static unsigned long long now(void)
 return (tv.tv_sec * 1000ull) + (tv.tv_usec / 1000ull);
 }
 
-static int stressone(unsigned long long ramsizeMB)
+static void stressone(unsigned long long ramsizeMB)
 {
 size_t pagesPerMB = 1024 * 1024 / PAGE_SIZE;
 g_autofree char *ram = g_malloc(ramsizeMB * 1024 * 1024);
@@ -186,7 +186,7 @@ static int stressone(unsigned long long ramsizeMB)
 memset(ram, 0xfe, ramsizeMB * 1024 * 1024);
 
 if (random_bytes(data, PAGE_SIZE) < 0) {
-return -1;
+return;
 }
 
 before = now();
@@ -225,7 +225,7 @@ static void *stressthread(void *arg)
 return NULL;
 }
 
-static int stress(unsigned long long ramsizeGB, int ncpus)
+static void stress(unsigned long long ramsizeGB, int ncpus)
 {
 size_t i;
 unsigned long long ramsizeMB = ramsizeGB * 1024 / ncpus;
@@ -238,8 +238,6 @@ static int stress(unsigned long long ramsizeGB, int ncpus)
 }
 
 stressone(ramsizeMB);
-
-return 0;
 }
 
 
@@ -335,8 +333,7 @@ int main(int argc, char **argv)
 fprintf(stdout, "%s (%05d): INFO: RAM %llu GiB across %d CPUs\n",
 argv0, gettid(), ramsizeGB, ncpus);
 
-if (stress(ramsizeGB, ncpus) < 0)
-exit_failure();
+stress(ramsizeGB, ncpus);
 
-exit_success();
+exit_failure();
 }
-- 
2.17.1






[PATCH v4 2/3] tests/migration: fix a typo in comment

2019-10-04 Thread Mao Zhongyi
Signed-off-by: Mao Zhongyi 
Reviewed-by: Alex Bennée 
Reviewed-by: Laurent Vivier 
Reviewed-by: Philippe Mathieu-Daudé 
---
 tests/migration/stress.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/migration/stress.c b/tests/migration/stress.c
index d8a6f64af0..f9626d50ee 100644
--- a/tests/migration/stress.c
+++ b/tests/migration/stress.c
@@ -180,7 +180,7 @@ static int stressone(unsigned long long ramsizeMB)
 
 /* We don't care about initial state, but we do want
  * to fault it all into RAM, otherwise the first iter
- * of the loop below will be quite slow. We cna't use
+ * of the loop below will be quite slow. We can't use
  * 0x0 as the byte as gcc optimizes that away into a
  * calloc instead :-) */
 memset(ram, 0xfe, ramsizeMB * 1024 * 1024);
-- 
2.17.1






[PATCH v4 0/3] some fix in tests/migration

2019-10-04 Thread Mao Zhongyi
This patchset mainly fixes memory leak, typo and return
value of stress function in stress test.

v4-v3:
p1:
- remove redundant g_malloc return value judgment.
 [Laurent Vivier]
p3:
- always use exit_failure() as the exit function of main().
 [Laurent Vivier] 
- update the commit message.

v3->v2:
p1: 
- replace malloc with g_malloc   [Laurent Vivier]
p3:
- change stressone type to void and stree return value
  to -1 to make the path of 'if (stress(ramsizeGB, ncpus) < 0)'
  can be reached.[Laurent Vivier]
- update the commit message.

v2->v1:
- use g_autofree to release memory automatically instead
  of free(). [Alex Bennée]
  
Cc: arm...@redhat.com 
Cc: laur...@vivier.eu 
Cc: tony.ngu...@bt.com
Cc: alex.ben...@linaro.org

Mao Zhongyi (3):
  tests/migration: mem leak fix
  tests/migration: fix a typo in comment
  tests/migration:fix unreachable path in stress test

 tests/migration/stress.c | 36 
 1 file changed, 8 insertions(+), 28 deletions(-)

-- 
2.17.1






Re: [PATCH v9 2/3] block/nbd: nbd reconnect

2019-10-04 Thread Vladimir Sementsov-Ogievskiy
23 Sep 2019 22:23  Eric Blake  wrote:

On 9/17/19 12:13 PM, Vladimir Sementsov-Ogievskiy wrote:
> Implement reconnect. To achieve this:
>
> 1. add new modes:
>connecting-wait: means, that reconnecting is in progress, and there
>  were small number of reconnect attempts, so all requests are
>  waiting for the connection.
>connecting-nowait: reconnecting is in progress, there were a lot of
>  attempts of reconnect, all requests will return errors.
>
>two old modes are used too:
>connected: normal state
>quit: exiting after fatal error or on close
>
> Possible transitions are:
>
>* -> quit
>connecting-* -> connected
>connecting-wait -> connecting-nowait (transition is done after
>   reconnect-delay seconds in connecting-wait mode)
>connected -> connecting-wait
>
> 2. Implement reconnect in connection_co. So, in connecting-* mode,
> connection_co, tries to reconnect unlimited times.
>
> 3. Retry nbd queries on channel error, if we are in connecting-wait
> state.
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/nbd.c | 332 ++--
>  1 file changed, 269 insertions(+), 63 deletions(-)
>

> +
> +static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
> +{
> +Error *local_err = NULL;
> +
> +if (!nbd_client_connecting(s)) {
> +return;
> +}
> +
> +/* Wait for completion of all in-flight requests */
> +
> +qemu_co_mutex_lock(>send_mutex);
> +
> +while (s->in_flight > 0) {
> +qemu_co_mutex_unlock(>send_mutex);
> +nbd_recv_coroutines_wake_all(s);
> +s->wait_in_flight = true;
> +qemu_coroutine_yield();
> +s->wait_in_flight = false;
> +qemu_co_mutex_lock(>send_mutex);
> +}
> +
> +qemu_co_mutex_unlock(>send_mutex);
> +
> +if (!nbd_client_connecting(s)) {
> +return;
> +}
> +
> +/*
> + * Now we are sure that nobody is accessing the channel, and no one will
> + * try until we set the state to CONNECTED.
> + */
> +
> +/* Finalize previous connection if any */
> +if (s->ioc) {
> +nbd_client_detach_aio_context(s->bs);
> +object_unref(OBJECT(s->sioc));
> +s->sioc = NULL;
> +object_unref(OBJECT(s->ioc));
> +s->ioc = NULL;
> +}
> +
> +s->connect_status = nbd_client_connect(s->bs, _err);
> +error_free(s->connect_err);
> +s->connect_err = NULL;
> +error_propagate(>connect_err, local_err);
> +local_err = NULL;
> +

Looks like a dead assignment to local_err.  But I see elsewhere you add
it, because you convert straight-line code into loops where it matters.

> +if (s->connect_status < 0) {
> +/* failed attempt */
> +return;
> +}
> +
> +/* successfully connected */
> +s->state = NBD_CLIENT_CONNECTED;
> +qemu_co_queue_restart_all(>free_sema);
> +}
> +
> +static coroutine_fn void nbd_reconnect_loop(BDRVNBDState *s)

Coroutine functions should generally have '_co_' in their name.  I'd
prefer nbd_co_reconnect_loop.

> +{
> +uint64_t start_time_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> +uint64_t delay_ns = s->reconnect_delay * NANOSECONDS_PER_SECOND;
> +uint64_t timeout = 1 * NANOSECONDS_PER_SECOND;
> +uint64_t max_timeout = 16 * NANOSECONDS_PER_SECOND;
> +
> +nbd_reconnect_attempt(s);
> +
> +while (nbd_client_connecting(s)) {
> +if (s->state == NBD_CLIENT_CONNECTING_WAIT &&
> +qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time_ns > 
> delay_ns)
> +{
> +s->state = NBD_CLIENT_CONNECTING_NOWAIT;
> +qemu_co_queue_restart_all(>free_sema);
> +}
> +
> +qemu_co_sleep_ns_wakeable(QEMU_CLOCK_REALTIME, timeout,
> +  >connection_co_sleep_ns_state);
> +if (s->drained) {
> +bdrv_dec_in_flight(s->bs);
> +s->wait_drained_end = true;
> +while (s->drained) {
> +/*
> + * We may be entered once from 
> nbd_client_attach_aio_context_bh
> + * and then from nbd_client_co_drain_end. So here is a loop.
> + */
> +qemu_coroutine_yield();
> +}
> +bdrv_inc_in_flight(s->bs);
> +}
> +if (timeout < max_timeout) {
> +timeout *= 2;
> +}

Exponential backup, ok.  If I read the loop correctly, you've hardcoded
the max_timeout at 16s, which means the overall timeout is about 30s
when adding in the time of the earlier iterations.  Does that need to be
tunable?  But for now I can live with it.

> +
> +nbd_reconnect_attempt(s);
> +}
>  }
>
>  static coroutine_fn void nbd_connection_entry(void *opaque)
> @@ -177,16 +330,26 @@ static coroutine_fn void nbd_connection_entry(void 
> *opaque)
>   * Therefore we keep an additional in_flight reference all the time 
> and
>   * only 

Re: [PATCH v6 00/10] Introduce the microvm machine type

2019-10-04 Thread no-reply
Patchew URL: https://patchew.org/QEMU/20191004093752.16564-1-...@redhat.com/



Hi,

This series seems to have some coding style problems. See output below for
more information:

Type: series
Message-id: 20191004093752.16564-1-...@redhat.com
Subject: [PATCH v6 00/10] Introduce the microvm machine type

=== TEST SCRIPT BEGIN ===
#!/bin/bash
git rev-parse base > /dev/null || exit 0
git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram
./scripts/checkpatch.pl --mailback base..
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
From https://github.com/patchew-project/qemu
 * [new tag] patchew/20191004171204.21040-1-eric.au...@redhat.com -> 
patchew/20191004171204.21040-1-eric.au...@redhat.com
Switched to a new branch 'test'
82de93f hw/i386: Introduce the microvm machine type
fda0032 docs/microvm.rst: document the new microvm machine type
8dc483d roms: add microvm-bios (qboot) as binary and git submodule
16f12bc hw/intc/apic: reject pic ints if isa_pic == NULL
22f8cab fw_cfg: add "modify" functions for all types
7cdaa3f hw/i386: make x86.c independent from PCMachineState
052084d hw/i386: split PCMachineState deriving X86MachineState from it
afc0d5a hw/i386/pc: move shared x86 functions to x86.c and export them
9c1dc683 hw/i386/pc: rename functions shared with non-PC machines
bd6947a hw/virtio: Factorize virtio-mmio headers

=== OUTPUT BEGIN ===
1/10 Checking commit bd6947a2e366 (hw/virtio: Factorize virtio-mmio headers)
WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
#77: 
new file mode 100644

total: 0 errors, 1 warnings, 131 lines checked

Patch 1/10 has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.
2/10 Checking commit 9c1dc683f829 (hw/i386/pc: rename functions shared with 
non-PC machines)
3/10 Checking commit afc0d5a54977 (hw/i386/pc: move shared x86 functions to 
x86.c and export them)
WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
#749: 
new file mode 100644

WARNING: Block comments use a leading /* on a separate line
#809: FILE: hw/i386/x86.c:56:
+/* Calculates initial APIC ID for a specific CPU index

WARNING: Block comments use a leading /* on a separate line
#866: FILE: hw/i386/x86.c:113:
+/* Calculates the limit to CPU APIC ID values

WARNING: Block comments should align the * on each line
#913: FILE: hw/i386/x86.c:160:
+ * -smp hasn't been parsed after it
+*/

WARNING: line over 80 characters
#926: FILE: hw/i386/x86.c:173:
+ms->possible_cpus->cpus[i].arch_id = x86_cpu_apic_id_from_index(pcms, 
i);

ERROR: spaces required around that '+' (ctx:VxV)
#1087: FILE: hw/i386/x86.c:334:
+cmdline_size = (strlen(kernel_cmdline)+16) & ~15;
   ^

ERROR: do not use assignment in if condition
#1091: FILE: hw/i386/x86.c:338:
+if (!f || !(kernel_size = get_file_size(f)) ||

ERROR: if this code is redundant consider removing it
#1100: FILE: hw/i386/x86.c:347:
+#if 0

ERROR: spaces required around that '+' (ctx:VxV)
#1101: FILE: hw/i386/x86.c:348:
+fprintf(stderr, "header magic: %#x\n", ldl_p(header+0x202));
^

ERROR: spaces required around that '+' (ctx:VxV)
#1103: FILE: hw/i386/x86.c:350:
+if (ldl_p(header+0x202) == 0x53726448) {
 ^

ERROR: spaces required around that '+' (ctx:VxV)
#1104: FILE: hw/i386/x86.c:351:
+protocol = lduw_p(header+0x206);
 ^

ERROR: if this code is redundant consider removing it
#1194: FILE: hw/i386/x86.c:441:
+#if 0

ERROR: spaces required around that '+' (ctx:VxV)
#1206: FILE: hw/i386/x86.c:453:
+lduw_p(header+0x236) & XLF_CAN_BE_LOADED_ABOVE_4G) {
  ^

ERROR: spaces required around that '+' (ctx:VxV)
#1225: FILE: hw/i386/x86.c:472:
+initrd_max = ldl_p(header+0x22c);
  ^

ERROR: spaces required around that '+' (ctx:VxV)
#1235: FILE: hw/i386/x86.c:482:
+fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(kernel_cmdline)+1);
   ^

ERROR: spaces required around that '+' (ctx:VxV)
#1239: FILE: hw/i386/x86.c:486:
+stl_p(header+0x228, cmdline_addr);
 ^

ERROR: spaces required around that '+' (ctx:VxV)
#1241: FILE: hw/i386/x86.c:488:
+stw_p(header+0x20, 0xA33F);
 ^

ERROR: spaces required around that '+' (ctx:VxV)
#1242: FILE: hw/i386/x86.c:489:
+stw_p(header+0x22, cmdline_addr-real_addr);
 ^

ERROR: spaces required around that '-' (ctx:VxV)
#1242: FILE: hw/i386/x86.c:489:
+stw_p(header+0x22, cmdline_addr-real_addr);
^

ERROR: consider using qemu_strtol in preference to strtol
#1258: FILE: 

Re: [PATCH v6 00/10] Introduce the microvm machine type

2019-10-04 Thread no-reply
Patchew URL: https://patchew.org/QEMU/20191004093752.16564-1-...@redhat.com/



Hi,

This series failed the docker-quick@centos7 build test. Please find the testing 
commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

=== TEST SCRIPT BEGIN ===
#!/bin/bash
make docker-image-centos7 V=1 NETWORK=1
time make docker-test-quick@centos7 SHOW_ENV=1 J=14 NETWORK=1
=== TEST SCRIPT END ===

qemu-system-x86_64: missing kernel image file name, required by microvm
Broken pipe
/tmp/qemu-test/src/tests/libqtest.c:140: kill_qemu() tried to terminate QEMU 
process but encountered exit status 1 (expected 0)
ERROR - too few tests run (expected 7, got 4)
make: *** [check-qtest-x86_64] Error 1
make: *** Waiting for unfinished jobs
  TESTiotest-qcow2: 159
  TESTiotest-qcow2: 161
---
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', 
'--label', 'com.qemu.instance.uuid=479a42ee8d764327bfb3924069c8e2dc', '-u', 
'1001', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', 
'-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 
'SHOW_ENV=1', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', 
'/home/patchew/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', 
'/var/tmp/patchew-tester-tmp-0dfb_dst/src/docker-src.2019-10-04-13.10.50.4894:/var/tmp/qemu:z,ro',
 'qemu:centos7', '/var/tmp/qemu/run', 'test-quick']' returned non-zero exit 
status 2.
filter=--filter=label=com.qemu.instance.uuid=479a42ee8d764327bfb3924069c8e2dc
make[1]: *** [docker-run] Error 1
make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-0dfb_dst/src'
make: *** [docker-run-test-quick@centos7] Error 2

real10m42.831s
user0m8.500s


The full log is available at
http://patchew.org/logs/20191004093752.16564-1-...@redhat.com/testing.docker-quick@centos7/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

[PULL 29/29] target/i386/kvm: Silence warning from Valgrind about uninitialized bytes

2019-10-04 Thread Paolo Bonzini
From: Thomas Huth 

When I run QEMU with KVM under Valgrind, I currently get this warning:

 Syscall param ioctl(generic) points to uninitialised byte(s)
at 0x95BA45B: ioctl (in /usr/lib64/libc-2.28.so)
by 0x429DC3: kvm_ioctl (kvm-all.c:2365)
by 0x51B249: kvm_arch_get_supported_msr_feature (kvm.c:469)
by 0x4C2A49: x86_cpu_get_supported_feature_word (cpu.c:3765)
by 0x4C4116: x86_cpu_expand_features (cpu.c:5065)
by 0x4C7F8D: x86_cpu_realizefn (cpu.c:5242)
by 0x5961F3: device_set_realized (qdev.c:835)
by 0x7038F6: property_set_bool (object.c:2080)
by 0x707EFE: object_property_set_qobject (qom-qobject.c:26)
by 0x705814: object_property_set_bool (object.c:1338)
by 0x498435: pc_new_cpu (pc.c:1549)
by 0x49C67D: pc_cpus_init (pc.c:1681)
  Address 0x1ffeffee74 is on thread 1's stack
  in frame #2, created by kvm_arch_get_supported_msr_feature (kvm.c:445)

It's harmless, but a little bit annoying, so silence it by properly
initializing the whole structure with zeroes.

Signed-off-by: Thomas Huth 
Signed-off-by: Paolo Bonzini 
---
 target/i386/kvm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index be4bbfb..11b9c85 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -188,7 +188,7 @@ static int kvm_get_tsc(CPUState *cs)
 struct {
 struct kvm_msrs info;
 struct kvm_msr_entry entries[1];
-} msr_data;
+} msr_data = {};
 int ret;
 
 if (env->tsc_valid) {
@@ -448,7 +448,7 @@ uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, 
uint32_t index)
 struct {
 struct kvm_msrs info;
 struct kvm_msr_entry entries[1];
-} msr_data;
+} msr_data = {};
 uint64_t value;
 uint32_t ret, can_be_one, must_be_one;
 
-- 
1.8.3.1




[PATCH] migration: Support QLIST migration

2019-10-04 Thread Eric Auger
94869d5c52 ("migration: migrate QTAILQ") introduced support
for QTAILQ migration. Let's do the same for QLIST.

Signed-off-by: Eric Auger 
---
 include/migration/vmstate.h |  21 ++
 include/qemu/queue.h|  30 
 migration/vmstate-types.c   |  60 
 tests/test-vmstate.c| 133 
 4 files changed, 244 insertions(+)

diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 1fbfd099dd..54ec9caeb0 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -224,6 +224,7 @@ extern const VMStateInfo vmstate_info_unused_buffer;
 extern const VMStateInfo vmstate_info_tmp;
 extern const VMStateInfo vmstate_info_bitmap;
 extern const VMStateInfo vmstate_info_qtailq;
+extern const VMStateInfo vmstate_info_qlist;
 
 #define type_check_2darray(t1,t2,n,m) ((t1(*)[n][m])0 - (t2*)0)
 /*
@@ -754,6 +755,26 @@ extern const VMStateInfo vmstate_info_qtailq;
 .start= offsetof(_type, _next),  \
 }
 
+/*
+ * For migrating a QLIST
+ * Target QLIST needs be properly initialized.
+ * _type: type of QLIST element
+ * _next: name of QLIST_ENTRY entry field in QLIST element
+ * _vmsd: VMSD for QLIST element
+ * size: size of QLIST element
+ * start: offset of QLIST_ENTRY in QTAILQ element
+ */
+#define VMSTATE_QLIST_V(_field, _state, _version, _vmsd, _type, _next)  \
+{\
+.name = (stringify(_field)), \
+.version_id   = (_version),  \
+.vmsd = &(_vmsd),\
+.size = sizeof(_type),   \
+.info = _info_qlist, \
+.offset   = offsetof(_state, _field),\
+.start= offsetof(_type, _next),  \
+}
+
 /* _f : field name
_f_n : num of elements field_name
_n : num of elements
diff --git a/include/qemu/queue.h b/include/qemu/queue.h
index 73bf4a984d..e965b4d18d 100644
--- a/include/qemu/queue.h
+++ b/include/qemu/queue.h
@@ -491,4 +491,34 @@ union {
 \
 QTAILQ_RAW_TQH_CIRC(head)->tql_prev = QTAILQ_RAW_TQE_CIRC(elm, entry); 
 \
 } while (/*CONSTCOND*/0)
 
+#define QLIST_RAW_FIRST(head)  
\
+field_at_offset(head, 0, void *)
+
+#define QLIST_RAW_NEXT(elm, entry) 
\
+field_at_offset(elm, entry, void *)
+
+#define QLIST_RAW_PREVIOUS(elm, entry) 
\
+field_at_offset(elm, entry + sizeof(void *), void *)
+
+#define QLIST_RAW_FOREACH(elm, head, entry)
\
+for ((elm) = *QLIST_RAW_FIRST(head);   
\
+ (elm);
\
+ (elm) = *QLIST_RAW_NEXT(elm, entry))
+
+#define QLIST_RAW_INSERT_TAIL(head, elm, entry) do {   
\
+void *iter, *last = NULL;  
\
+*QLIST_RAW_NEXT(elm, entry) = NULL;
\
+if (!*QLIST_RAW_FIRST(head)) { 
\
+*QLIST_RAW_FIRST(head) = elm;  
\
+*QLIST_RAW_PREVIOUS(elm, entry) = head;
\
+break; 
\
+}  
\
+for (iter = *QLIST_RAW_FIRST(head);
\
+ iter; last = iter, iter = *QLIST_RAW_NEXT(iter, entry))   
\
+{ }
\
+*QLIST_RAW_NEXT(last, entry) = elm;
\
+*QLIST_RAW_PREVIOUS(elm, entry) = last;
\
+} while (0)
+
+
 #endif /* QEMU_SYS_QUEUE_H */
diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c
index bee658a1b2..0eedc614f1 100644
--- a/migration/vmstate-types.c
+++ b/migration/vmstate-types.c
@@ -691,3 +691,63 @@ const VMStateInfo vmstate_info_qtailq = {
 .get  = get_qtailq,
 .put  = put_qtailq,
 };
+
+static int put_qlist(QEMUFile *f, void *pv, size_t unused_size,
+ const VMStateField *field, QJSON *vmdesc)
+{
+const VMStateDescription *vmsd = field->vmsd;
+/* offset of the QTAILQ entry in a QTAILQ element*/
+size_t entry_offset = field->start;
+void *elm;
+int ret;
+
+QLIST_RAW_FOREACH(elm, pv, entry_offset) {
+qemu_put_byte(f, true);
+ret = 

[PULL 27/29] target/i386: add VMX features

2019-10-04 Thread Paolo Bonzini
Add code to convert the VMX feature words back into MSR values,
allowing the user to enable/disable VMX features as they wish.  The same
infrastructure enables support for limiting VMX features in named
CPU models.

Signed-off-by: Paolo Bonzini 
---
 target/i386/cpu.c | 225 ++
 target/i386/cpu.h |   9 +++
 target/i386/kvm.c | 162 ++-
 3 files changed, 394 insertions(+), 2 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 0a5d182..44f1bbd 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -1232,6 +1232,163 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] 
= {
 .index = MSR_IA32_CORE_CAPABILITY,
 },
 },
+
+[FEAT_VMX_PROCBASED_CTLS] = {
+.type = MSR_FEATURE_WORD,
+.feat_names = {
+NULL, NULL, "vmx-vintr-pending", "vmx-tsc-offset",
+NULL, NULL, NULL, "vmx-hlt-exit",
+NULL, "vmx-invlpg-exit", "vmx-mwait-exit", "vmx-rdpmc-exit",
+"vmx-rdtsc-exit", NULL, NULL, "vmx-cr3-load-noexit",
+"vmx-cr3-store-noexit", NULL, NULL, "vmx-cr8-load-exit",
+"vmx-cr8-store-exit", "vmx-flexpriority", "vmx-vnmi-pending", 
"vmx-movdr-exit",
+"vmx-io-exit", "vmx-io-bitmap", NULL, "vmx-mtf",
+"vmx-msr-bitmap", "vmx-monitor-exit", "vmx-pause-exit", 
"vmx-secondary-ctls",
+},
+.msr = {
+.index = MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+}
+},
+
+[FEAT_VMX_SECONDARY_CTLS] = {
+.type = MSR_FEATURE_WORD,
+.feat_names = {
+"vmx-apicv-xapic", "vmx-ept", "vmx-desc-exit", "vmx-rdtscp-exit",
+"vmx-apicv-x2apic", "vmx-vpid", "vmx-wbinvd-exit", 
"vmx-unrestricted-guest",
+"vmx-apicv-register", "vmx-apicv-vid", "vmx-ple", 
"vmx-rdrand-exit",
+"vmx-invpcid-exit", "vmx-vmfunc", "vmx-shadow-vmcs", 
"vmx-encls-exit",
+"vmx-rdseed-exit", "vmx-pml", NULL, NULL,
+"vmx-xsaves", NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+},
+.msr = {
+.index = MSR_IA32_VMX_PROCBASED_CTLS2,
+}
+},
+
+[FEAT_VMX_PINBASED_CTLS] = {
+.type = MSR_FEATURE_WORD,
+.feat_names = {
+"vmx-intr-exit", NULL, NULL, "vmx-nmi-exit",
+NULL, "vmx-vnmi", "vmx-preemption-timer", "vmx-posted-intr",
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+},
+.msr = {
+.index = MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+}
+},
+
+[FEAT_VMX_EXIT_CTLS] = {
+.type = MSR_FEATURE_WORD,
+/*
+ * VMX_VM_EXIT_HOST_ADDR_SPACE_SIZE is copied from
+ * the LM CPUID bit.
+ */
+.feat_names = {
+NULL, NULL, "vmx-exit-nosave-debugctl", NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL /* vmx-exit-host-addr-space-size */, NULL, NULL,
+"vmx-exit-load-perf-global-ctrl", NULL, NULL, "vmx-exit-ack-intr",
+NULL, NULL, "vmx-exit-save-pat", "vmx-exit-load-pat",
+"vmx-exit-save-efer", "vmx-exit-load-efer",
+"vmx-exit-save-preemption-timer", "vmx-exit-clear-bndcfgs",
+NULL, "vmx-exit-clear-rtit-ctl", NULL, NULL,
+NULL, NULL, NULL, NULL,
+},
+.msr = {
+.index = MSR_IA32_VMX_TRUE_EXIT_CTLS,
+}
+},
+
+[FEAT_VMX_ENTRY_CTLS] = {
+.type = MSR_FEATURE_WORD,
+.feat_names = {
+NULL, NULL, "vmx-entry-noload-debugctl", NULL,
+NULL, NULL, NULL, NULL,
+NULL, "vmx-entry-ia32e-mode", NULL, NULL,
+NULL, "vmx-entry-load-perf-global-ctrl", "vmx-entry-load-pat", 
"vmx-entry-load-efer",
+"vmx-entry-load-bndcfgs", NULL, "vmx-entry-load-rtit-ctl", NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+},
+.msr = {
+.index = MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+}
+},
+
+[FEAT_VMX_MISC] = {
+.type = MSR_FEATURE_WORD,
+.feat_names = {
+NULL, NULL, NULL, NULL,
+NULL, "vmx-store-lma", "vmx-activity-hlt", "vmx-activity-shutdown",
+"vmx-activity-wait-sipi", NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, "vmx-vmwrite-vmexit-fields", "vmx-zero-len-inject", NULL,
+},
+.msr = {
+.index = MSR_IA32_VMX_MISC,
+}
+},
+
+[FEAT_VMX_EPT_VPID_CAPS] = {
+.type = MSR_FEATURE_WORD,
+.feat_names = {
+"vmx-ept-execonly", NULL, NULL, 

[PULL 26/29] vmxcap: correct the name of the variables

2019-10-04 Thread Paolo Bonzini
The low bits are 1 if the control must be one, the high bits
are 1 if the control can be one.  Correct the variable names
as they are very confusing.

Signed-off-by: Paolo Bonzini 
---
 scripts/kvm/vmxcap | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/scripts/kvm/vmxcap b/scripts/kvm/vmxcap
index d8c7d6d..5dfeb2e 100755
--- a/scripts/kvm/vmxcap
+++ b/scripts/kvm/vmxcap
@@ -51,15 +51,15 @@ class Control(object):
 return (val & 0x, val >> 32)
 def show(self):
 print(self.name)
-mbz, mb1 = self.read2(self.cap_msr)
-tmbz, tmb1 = 0, 0
+mb1, cb1 = self.read2(self.cap_msr)
+tmb1, tcb1 = 0, 0
 if self.true_cap_msr:
-tmbz, tmb1 = self.read2(self.true_cap_msr)
+tmb1, tcb1 = self.read2(self.true_cap_msr)
 for bit in sorted(self.bits.keys()):
-zero = not (mbz & (1 << bit))
-one = mb1 & (1 << bit)
-true_zero = not (tmbz & (1 << bit))
-true_one = tmb1 & (1 << bit)
+zero = not (mb1 & (1 << bit))
+one = cb1 & (1 << bit)
+true_zero = not (tmb1 & (1 << bit))
+true_one = tcb1 & (1 << bit)
 s= '?'
 if (self.true_cap_msr and true_zero and true_one
 and one and not zero):
-- 
1.8.3.1





[PULL 24/29] target/i386: expand feature words to 64 bits

2019-10-04 Thread Paolo Bonzini
VMX requires 64-bit feature words for the IA32_VMX_EPT_VPID_CAP
and IA32_VMX_BASIC MSRs.  (The VMX control MSRs are 64-bit wide but
actually have only 32 bits of information).

Signed-off-by: Paolo Bonzini 
---
 include/sysemu/kvm.h |  2 +-
 target/i386/cpu.c| 71 +++-
 target/i386/cpu.h|  2 +-
 target/i386/kvm.c|  2 +-
 4 files changed, 40 insertions(+), 37 deletions(-)

diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index fd67477..9d14328 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -462,7 +462,7 @@ int kvm_vm_check_extension(KVMState *s, unsigned int 
extension);
 
 uint32_t kvm_arch_get_supported_cpuid(KVMState *env, uint32_t function,
   uint32_t index, int reg);
-uint32_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index);
+uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index);
 
 
 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len);
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 35d868d..0a5d182 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -789,7 +789,7 @@ typedef struct FeatureWordInfo {
  * In cases of disagreement between feature naming conventions,
  * aliases may be added.
  */
-const char *feat_names[32];
+const char *feat_names[64];
 union {
 /* If type==CPUID_FEATURE_WORD */
 struct {
@@ -803,11 +803,11 @@ typedef struct FeatureWordInfo {
 uint32_t index;
 } msr;
 };
-uint32_t tcg_features; /* Feature flags supported by TCG */
-uint32_t unmigratable_flags; /* Feature flags known to be unmigratable */
-uint32_t migratable_flags; /* Feature flags known to be migratable */
+uint64_t tcg_features; /* Feature flags supported by TCG */
+uint64_t unmigratable_flags; /* Feature flags known to be unmigratable */
+uint64_t migratable_flags; /* Feature flags known to be migratable */
 /* Features that shouldn't be auto-enabled by "-cpu host" */
-uint32_t no_autoenable_flags;
+uint64_t no_autoenable_flags;
 } FeatureWordInfo;
 
 static FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
@@ -1236,7 +1236,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = 
{
 
 typedef struct FeatureMask {
 FeatureWord index;
-uint32_t mask;
+uint64_t mask;
 } FeatureMask;
 
 typedef struct FeatureDep {
@@ -1246,11 +1246,11 @@ typedef struct FeatureDep {
 static FeatureDep feature_dependencies[] = {
 {
 .from = { FEAT_7_0_EDX, CPUID_7_0_EDX_ARCH_CAPABILITIES },
-.to = { FEAT_ARCH_CAPABILITIES, ~0u },
+.to = { FEAT_ARCH_CAPABILITIES, ~0ull },
 },
 {
 .from = { FEAT_7_0_EDX, CPUID_7_0_EDX_CORE_CAPABILITY },
-.to = { FEAT_CORE_CAPABILITY,   ~0u },
+.to = { FEAT_CORE_CAPABILITY,   ~0ull },
 },
 };
 
@@ -1362,14 +1362,14 @@ const char *get_register_name_32(unsigned int reg)
  * Returns the set of feature flags that are supported and migratable by
  * QEMU, for a given FeatureWord.
  */
-static uint32_t x86_cpu_get_migratable_flags(FeatureWord w)
+static uint64_t x86_cpu_get_migratable_flags(FeatureWord w)
 {
 FeatureWordInfo *wi = _word_info[w];
-uint32_t r = 0;
+uint64_t r = 0;
 int i;
 
-for (i = 0; i < 32; i++) {
-uint32_t f = 1U << i;
+for (i = 0; i < 64; i++) {
+uint64_t f = 1ULL << i;
 
 /* If the feature name is known, it is implicitly considered 
migratable,
  * unless it is explicitly set in unmigratable_flags */
@@ -2931,7 +2931,7 @@ void x86_cpu_change_kvm_default(const char *prop, const 
char *value)
 assert(pv->prop);
 }
 
-static uint32_t x86_cpu_get_supported_feature_word(FeatureWord w,
+static uint64_t x86_cpu_get_supported_feature_word(FeatureWord w,
bool migratable_only);
 
 static bool lmce_supported(void)
@@ -3117,7 +3117,7 @@ static bool x86_cpu_have_filtered_features(X86CPU *cpu)
 return false;
 }
 
-static void mark_unavailable_features(X86CPU *cpu, FeatureWord w, uint32_t 
mask,
+static void mark_unavailable_features(X86CPU *cpu, FeatureWord w, uint64_t 
mask,
   const char *verbose_prefix)
 {
 CPUX86State *env = >env;
@@ -3134,8 +3134,8 @@ static void mark_unavailable_features(X86CPU *cpu, 
FeatureWord w, uint32_t mask,
 return;
 }
 
-for (i = 0; i < 32; ++i) {
-if ((1UL << i) & mask) {
+for (i = 0; i < 64; ++i) {
+if ((1ULL << i) & mask) {
 feat_word_str = feature_word_description(f, i);
 warn_report("%s: %s%s%s [bit %d]",
 verbose_prefix,
@@ -3378,7 +3378,7 @@ static void x86_cpu_get_feature_words(Object *obj, 
Visitor *v,
   const char *name, void *opaque,
   Error **errp)
 {
-   

[PULL 22/29] target/i386: handle filtered_features in a new function mark_unavailable_features

2019-10-04 Thread Paolo Bonzini
The next patch will add a different reason for filtering features, unrelated
to host feature support.  Extract a new function that takes care of disabling
the features and optionally reporting them.

Signed-off-by: Paolo Bonzini 
---
 target/i386/cpu.c | 87 ++-
 1 file changed, 48 insertions(+), 39 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 80cfab0..83f8981 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -3096,17 +3096,41 @@ static char *feature_word_description(FeatureWordInfo 
*f, uint32_t bit)
 return NULL;
 }
 
-static void report_unavailable_features(FeatureWord w, uint32_t mask)
+static bool x86_cpu_have_filtered_features(X86CPU *cpu)
 {
+FeatureWord w;
+
+for (w = 0; w < FEATURE_WORDS; w++) {
+if (cpu->filtered_features[w]) {
+return true;
+}
+}
+
+return false;
+}
+
+static void mark_unavailable_features(X86CPU *cpu, FeatureWord w, uint32_t 
mask,
+  const char *verbose_prefix)
+{
+CPUX86State *env = >env;
 FeatureWordInfo *f = _word_info[w];
 int i;
 char *feat_word_str;
 
+if (!cpu->force_features) {
+env->features[w] &= ~mask;
+}
+cpu->filtered_features[w] |= mask;
+
+if (!verbose_prefix) {
+return;
+}
+
 for (i = 0; i < 32; ++i) {
 if ((1UL << i) & mask) {
 feat_word_str = feature_word_description(f, i);
-warn_report("%s doesn't support requested feature: %s%s%s [bit 
%d]",
-accel_uses_host_cpuid() ? "host" : "TCG",
+warn_report("%s: %s%s%s [bit %d]",
+verbose_prefix,
 feat_word_str,
 f->feat_names[i] ? "." : "",
 f->feat_names[i] ? f->feat_names[i] : "", i);
@@ -3511,7 +3535,7 @@ static void x86_cpu_parse_featurestr(const char 
*typename, char *features,
 }
 
 static void x86_cpu_expand_features(X86CPU *cpu, Error **errp);
-static int x86_cpu_filter_features(X86CPU *cpu);
+static void x86_cpu_filter_features(X86CPU *cpu, bool verbose);
 
 /* Build a list with the name of all features on a feature word array */
 static void x86_cpu_list_feature_names(FeatureWordArray features,
@@ -3576,7 +3600,7 @@ static void 
x86_cpu_class_check_missing_features(X86CPUClass *xcc,
 next = >next;
 }
 
-x86_cpu_filter_features(xc);
+x86_cpu_filter_features(xc, false);
 
 x86_cpu_list_feature_names(xc->filtered_features, next);
 
@@ -3784,15 +3808,6 @@ static uint32_t 
x86_cpu_get_supported_feature_word(FeatureWord w,
 return r;
 }
 
-static void x86_cpu_report_filtered_features(X86CPU *cpu)
-{
-FeatureWord w;
-
-for (w = 0; w < FEATURE_WORDS; w++) {
-report_unavailable_features(w, cpu->filtered_features[w]);
-}
-}
-
 static void x86_cpu_apply_props(X86CPU *cpu, PropValue *props)
 {
 PropValue *pv;
@@ -5154,24 +5169,24 @@ out:
  *
  * Returns: 0 if all flags are supported by the host, non-zero otherwise.
  */
-static int x86_cpu_filter_features(X86CPU *cpu)
+static void x86_cpu_filter_features(X86CPU *cpu, bool verbose)
 {
 CPUX86State *env = >env;
 FeatureWord w;
-int rv = 0;
+const char *prefix = NULL;
+
+if (verbose) {
+prefix = accel_uses_host_cpuid()
+ ? "host doesn't support requested feature"
+ : "TCG doesn't support requested feature";
+}
 
 for (w = 0; w < FEATURE_WORDS; w++) {
 uint32_t host_feat =
 x86_cpu_get_supported_feature_word(w, false);
 uint32_t requested_features = env->features[w];
-uint32_t available_features = requested_features & host_feat;
-if (!cpu->force_features) {
-env->features[w] = available_features;
-}
-cpu->filtered_features[w] = requested_features & ~available_features;
-if (cpu->filtered_features[w]) {
-rv = 1;
-}
+uint32_t unavailable_features = requested_features & ~host_feat;
+mark_unavailable_features(cpu, w, unavailable_features, prefix);
 }
 
 if ((env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) &&
@@ -5197,13 +5212,9 @@ static int x86_cpu_filter_features(X86CPU *cpu)
  * host can't emulate the capabilities we report on
  * cpu_x86_cpuid(), intel-pt can't be enabled on the current host.
  */
-env->features[FEAT_7_0_EBX] &= ~CPUID_7_0_EBX_INTEL_PT;
-cpu->filtered_features[FEAT_7_0_EBX] |= CPUID_7_0_EBX_INTEL_PT;
-rv = 1;
+mark_unavailable_features(cpu, FEAT_7_0_EBX, 
CPUID_7_0_EBX_INTEL_PT, prefix);
 }
 }
-
-return rv;
 }
 
 static void x86_cpu_realizefn(DeviceState *dev, Error **errp)
@@ -5244,16 +5255,14 @@ static void x86_cpu_realizefn(DeviceState *dev, Error 
**errp)
 goto out;
 }
 
-if (x86_cpu_filter_features(cpu) &&
-   

[PULL 28/29] target/i386: work around KVM_GET_MSRS bug for secondary execution controls

2019-10-04 Thread Paolo Bonzini
Some secondary controls are automatically enabled/disabled based on the CPUID
values that are set for the guest.  However, they are still available at a
global level and therefore should be present when KVM_GET_MSRS is sent to
/dev/kvm.

Unfortunately KVM forgot to include those, so fix that.

Signed-off-by: Paolo Bonzini 
---
 target/i386/kvm.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 45f0a1f..be4bbfb 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -479,6 +479,23 @@ uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, 
uint32_t index)
 value = msr_data.entries[0].data;
 switch (index) {
 case MSR_IA32_VMX_PROCBASED_CTLS2:
+/* KVM forgot to add these bits for some time, do this ourselves.  */
+if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) & 
CPUID_XSAVE_XSAVES) {
+value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32;
+}
+if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) & CPUID_EXT_RDRAND) {
+value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32;
+}
+if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) & 
CPUID_7_0_EBX_INVPCID) {
+value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32;
+}
+if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) & 
CPUID_7_0_EBX_RDSEED) {
+value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32;
+}
+if (kvm_arch_get_supported_cpuid(s, 0x8001, 0, R_EDX) & 
CPUID_EXT2_RDTSCP) {
+value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32;
+}
+/* fall through */
 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
-- 
1.8.3.1





[PULL 23/29] target/i386: introduce generic feature dependency mechanism

2019-10-04 Thread Paolo Bonzini
Sometimes a CPU feature does not make sense unless another is
present.  In the case of VMX features, KVM does not even allow
setting the VMX controls to some invalid combinations.

Therefore, this patch adds a generic mechanism that looks for bits
that the user explicitly cleared, and uses them to remove other bits
from the expanded CPU definition.  If these dependent bits were also
explicitly *set* by the user, this will be a warning for "-cpu check"
and an error for "-cpu enforce".  If not, then the dependent bits are
cleared silently, for convenience.

With VMX features, this will be used so that for example
"-cpu host,-rdrand" will also hide support for RDRAND exiting.

Signed-off-by: Paolo Bonzini 
---
 target/i386/cpu.c | 72 ---
 1 file changed, 48 insertions(+), 24 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 83f8981..35d868d 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -801,10 +801,6 @@ typedef struct FeatureWordInfo {
 /* If type==MSR_FEATURE_WORD */
 struct {
 uint32_t index;
-struct {   /*CPUID that enumerate this MSR*/
-FeatureWord cpuid_class;
-uint32_tcpuid_flag;
-} cpuid_dep;
 } msr;
 };
 uint32_t tcg_features; /* Feature flags supported by TCG */
@@ -1218,10 +1214,6 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] 
= {
 },
 .msr = {
 .index = MSR_IA32_ARCH_CAPABILITIES,
-.cpuid_dep = {
-FEAT_7_0_EDX,
-CPUID_7_0_EDX_ARCH_CAPABILITIES
-}
 },
 },
 [FEAT_CORE_CAPABILITY] = {
@@ -1238,14 +1230,30 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] 
= {
 },
 .msr = {
 .index = MSR_IA32_CORE_CAPABILITY,
-.cpuid_dep = {
-FEAT_7_0_EDX,
-CPUID_7_0_EDX_CORE_CAPABILITY,
-},
 },
 },
 };
 
+typedef struct FeatureMask {
+FeatureWord index;
+uint32_t mask;
+} FeatureMask;
+
+typedef struct FeatureDep {
+FeatureMask from, to;
+} FeatureDep;
+
+static FeatureDep feature_dependencies[] = {
+{
+.from = { FEAT_7_0_EDX, CPUID_7_0_EDX_ARCH_CAPABILITIES },
+.to = { FEAT_ARCH_CAPABILITIES, ~0u },
+},
+{
+.from = { FEAT_7_0_EDX, CPUID_7_0_EDX_CORE_CAPABILITY },
+.to = { FEAT_CORE_CAPABILITY,   ~0u },
+},
+};
+
 typedef struct X86RegisterInfo32 {
 /* Name of register */
 const char *name;
@@ -5063,9 +5071,26 @@ static void x86_cpu_expand_features(X86CPU *cpu, Error 
**errp)
 {
 CPUX86State *env = >env;
 FeatureWord w;
+int i;
 GList *l;
 Error *local_err = NULL;
 
+for (l = plus_features; l; l = l->next) {
+const char *prop = l->data;
+object_property_set_bool(OBJECT(cpu), true, prop, _err);
+if (local_err) {
+goto out;
+}
+}
+
+for (l = minus_features; l; l = l->next) {
+const char *prop = l->data;
+object_property_set_bool(OBJECT(cpu), false, prop, _err);
+if (local_err) {
+goto out;
+}
+}
+
 /*TODO: Now cpu->max_features doesn't overwrite features
  * set using QOM properties, and we can convert
  * plus_features & minus_features to global properties
@@ -5083,19 +5108,18 @@ static void x86_cpu_expand_features(X86CPU *cpu, Error 
**errp)
 }
 }
 
-for (l = plus_features; l; l = l->next) {
-const char *prop = l->data;
-object_property_set_bool(OBJECT(cpu), true, prop, _err);
-if (local_err) {
-goto out;
-}
-}
+for (i = 0; i < ARRAY_SIZE(feature_dependencies); i++) {
+FeatureDep *d = _dependencies[i];
+if (!(env->features[d->from.index] & d->from.mask)) {
+uint32_t unavailable_features = env->features[d->to.index] & 
d->to.mask;
 
-for (l = minus_features; l; l = l->next) {
-const char *prop = l->data;
-object_property_set_bool(OBJECT(cpu), false, prop, _err);
-if (local_err) {
-goto out;
+/* Not an error unless the dependent feature was added explicitly. 
 */
+mark_unavailable_features(cpu, d->to.index,
+  unavailable_features & 
env->user_features[d->to.index],
+  "This feature depends on other features 
that were not requested");
+
+env->user_features[d->to.index] |= unavailable_features;
+env->features[d->to.index] &= ~unavailable_features;
 }
 }
 
-- 
1.8.3.1





[PULL 25/29] target/i386: add VMX definitions

2019-10-04 Thread Paolo Bonzini
These will be used to compile the list of VMX features for named
CPU models, and/or by the code that sets up the VMX MSRs.

Signed-off-by: Paolo Bonzini 
---
 target/i386/cpu.h | 130 ++
 1 file changed, 130 insertions(+)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 312d70f..d908e98 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -452,6 +452,25 @@ typedef enum X86Seg {
 #define MSR_IA32_BNDCFGS0x0d90
 #define MSR_IA32_XSS0x0da0
 
+#define MSR_IA32_VMX_BASIC  0x0480
+#define MSR_IA32_VMX_PINBASED_CTLS  0x0481
+#define MSR_IA32_VMX_PROCBASED_CTLS 0x0482
+#define MSR_IA32_VMX_EXIT_CTLS  0x0483
+#define MSR_IA32_VMX_ENTRY_CTLS 0x0484
+#define MSR_IA32_VMX_MISC   0x0485
+#define MSR_IA32_VMX_CR0_FIXED0 0x0486
+#define MSR_IA32_VMX_CR0_FIXED1 0x0487
+#define MSR_IA32_VMX_CR4_FIXED0 0x0488
+#define MSR_IA32_VMX_CR4_FIXED1 0x0489
+#define MSR_IA32_VMX_VMCS_ENUM  0x048a
+#define MSR_IA32_VMX_PROCBASED_CTLS20x048b
+#define MSR_IA32_VMX_EPT_VPID_CAP   0x048c
+#define MSR_IA32_VMX_TRUE_PINBASED_CTLS  0x048d
+#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x048e
+#define MSR_IA32_VMX_TRUE_EXIT_CTLS  0x048f
+#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x0490
+#define MSR_IA32_VMX_VMFUNC 0x0491
+
 #define XSTATE_FP_BIT   0
 #define XSTATE_SSE_BIT  1
 #define XSTATE_YMM_BIT  2
@@ -752,6 +771,117 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS];
 
 #define MSR_CORE_CAP_SPLIT_LOCK_DETECT  (1U << 5)
 
+/* VMX MSR features */
+#define MSR_VMX_BASIC_VMCS_REVISION_MASK 0x7FFFull
+#define MSR_VMX_BASIC_VMXON_REGION_SIZE_MASK (0x1FFFull << 32)
+#define MSR_VMX_BASIC_VMCS_MEM_TYPE_MASK (0x003Cull << 32)
+#define MSR_VMX_BASIC_DUAL_MONITOR   (1ULL << 49)
+#define MSR_VMX_BASIC_INS_OUTS   (1ULL << 54)
+#define MSR_VMX_BASIC_TRUE_CTLS  (1ULL << 55)
+
+#define MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK 0x1Full
+#define MSR_VMX_MISC_STORE_LMA   (1ULL << 5)
+#define MSR_VMX_MISC_ACTIVITY_HLT(1ULL << 6)
+#define MSR_VMX_MISC_ACTIVITY_SHUTDOWN   (1ULL << 7)
+#define MSR_VMX_MISC_ACTIVITY_WAIT_SIPI  (1ULL << 8)
+#define MSR_VMX_MISC_MAX_MSR_LIST_SIZE_MASK  0x0E00ull
+#define MSR_VMX_MISC_VMWRITE_VMEXIT  (1ULL << 29)
+#define MSR_VMX_MISC_ZERO_LEN_INJECT (1ULL << 30)
+
+#define MSR_VMX_EPT_EXECONLY (1ULL << 0)
+#define MSR_VMX_EPT_PAGE_WALK_LENGTH_4   (1ULL << 6)
+#define MSR_VMX_EPT_PAGE_WALK_LENGTH_5   (1ULL << 7)
+#define MSR_VMX_EPT_UC   (1ULL << 8)
+#define MSR_VMX_EPT_WB   (1ULL << 14)
+#define MSR_VMX_EPT_2MB  (1ULL << 16)
+#define MSR_VMX_EPT_1GB  (1ULL << 17)
+#define MSR_VMX_EPT_INVEPT   (1ULL << 20)
+#define MSR_VMX_EPT_AD_BITS  (1ULL << 21)
+#define MSR_VMX_EPT_ADVANCED_VMEXIT_INFO (1ULL << 22)
+#define MSR_VMX_EPT_INVEPT_SINGLE_CONTEXT(1ULL << 25)
+#define MSR_VMX_EPT_INVEPT_ALL_CONTEXT   (1ULL << 26)
+#define MSR_VMX_EPT_INVVPID  (1ULL << 32)
+#define MSR_VMX_EPT_INVVPID_SINGLE_ADDR  (1ULL << 40)
+#define MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT   (1ULL << 41)
+#define MSR_VMX_EPT_INVVPID_ALL_CONTEXT  (1ULL << 42)
+#define MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT_NOGLOBALS (1ULL << 43)
+
+#define MSR_VMX_VMFUNC_EPT_SWITCHING (1ULL << 0)
+
+
+/* VMX controls */
+#define VMX_CPU_BASED_VIRTUAL_INTR_PENDING  0x0004
+#define VMX_CPU_BASED_USE_TSC_OFFSETING 0x0008
+#define VMX_CPU_BASED_HLT_EXITING   0x0080
+#define VMX_CPU_BASED_INVLPG_EXITING0x0200
+#define VMX_CPU_BASED_MWAIT_EXITING 0x0400
+#define VMX_CPU_BASED_RDPMC_EXITING 0x0800
+#define VMX_CPU_BASED_RDTSC_EXITING 0x1000
+#define VMX_CPU_BASED_CR3_LOAD_EXITING  0x8000
+#define VMX_CPU_BASED_CR3_STORE_EXITING 0x0001
+#define VMX_CPU_BASED_CR8_LOAD_EXITING  0x0008
+#define VMX_CPU_BASED_CR8_STORE_EXITING 0x0010
+#define VMX_CPU_BASED_TPR_SHADOW0x0020
+#define VMX_CPU_BASED_VIRTUAL_NMI_PENDING   0x0040
+#define VMX_CPU_BASED_MOV_DR_EXITING0x0080
+#define VMX_CPU_BASED_UNCOND_IO_EXITING 0x0100
+#define VMX_CPU_BASED_USE_IO_BITMAPS0x0200
+#define 

[PULL v3 00/29] Misc patches for 2019-10-02

2019-10-04 Thread Paolo Bonzini
The following changes since commit 7f21573c822805a8e6be379d9bcf3ad9effef3dc:

  Merge remote-tracking branch 
'remotes/huth-gitlab/tags/pull-request-2019-10-01' into staging (2019-10-01 
13:13:38 +0100)

are available in the git repository at:


  git://github.com/bonzini/qemu.git tags/for-upstream

for you to fetch changes up to a1834d975f7d329b128965dd69bc3aaa7195f5a2:

  target/i386/kvm: Silence warning from Valgrind about uninitialized bytes 
(2019-10-04 18:49:20 +0200)


* Compilation fix for KVM (Alex)
* SMM fix (Dmitry)
* VFIO error reporting (Eric)
* win32 fixes and workarounds (Marc-André)
* qemu-pr-helper crash bugfix (Maxim)
* Memory leak fixes (myself)
* VMX features (myself)
* Record-replay deadlock (Pavel)
* i386 CPUID bits (Sebastian)
* kconfig tweak (Thomas)
* Valgrind fix (Thomas)
* Autoconverge test (Yury)


Alex Bennée (1):
  accel/kvm: ensure ret always set

Dmitry Poletaev (1):
  Fix wrong behavior of cpu_memory_rw_debug() function in SMM

Eric Auger (2):
  vfio: Turn the container error into an Error handle
  memory: allow memory_region_register_iommu_notifier() to fail

Marc-André Lureau (3):
  util: WSAEWOULDBLOCK on connect should map to EINPROGRESS
  tests: skip serial test on windows
  win32: work around main-loop busy loop on socket/fd event

Maxim Levitsky (1):
  qemu-pr-helper: fix crash in mpath_reconstruct_sense

Paolo Bonzini (16):
  ide: fix leak from qemu_allocate_irqs
  microblaze: fix leak of fdevice tree blob
  mcf5208: fix leak from qemu_allocate_irqs
  hppa: fix leak from g_strdup_printf
  mips: fix memory leaks in board initialization
  cris: do not leak struct cris_disasm_data
  lm32: do not leak memory on object_new/object_unref
  docker: test-debug: disable LeakSanitizer
  tests/docker: only enable ubsan for test-clang
  target/i386: handle filtered_features in a new function 
mark_unavailable_features
  target/i386: introduce generic feature dependency mechanism
  target/i386: expand feature words to 64 bits
  target/i386: add VMX definitions
  vmxcap: correct the name of the variables
  target/i386: add VMX features
  target/i386: work around KVM_GET_MSRS bug for secondary execution controls

Pavel Dovgaluk (1):
  replay: don't synchronize memory operations in replay mode

Sebastian Andrzej Siewior (1):
  i386: Add CPUID bit for CLZERO and XSAVEERPTR

Thomas Huth (2):
  hw/isa: Introduce a CONFIG_ISA_SUPERIO switch for isa-superio.c
  target/i386/kvm: Silence warning from Valgrind about uninitialized bytes

Yury Kotov (1):
  tests/migration: Add a test for auto converge

 accel/kvm/kvm-all.c   |   6 +-
 disas/cris.c  |  59 +++---
 exec.c|  23 ++-
 hw/arm/smmuv3.c   |  18 +-
 hw/hppa/dino.c|   1 +
 hw/hppa/machine.c |   4 +-
 hw/i386/amd_iommu.c   |  17 +-
 hw/i386/intel_iommu.c |   8 +-
 hw/ide/cmd646.c   |   1 +
 hw/isa/Kconfig|  10 +-
 hw/isa/Makefile.objs  |   2 +-
 hw/m68k/mcf5208.c |   2 +
 hw/microblaze/boot.c  |   1 +
 hw/mips/Kconfig   |   1 +
 hw/mips/mips_int.c|   1 +
 hw/mips/mips_jazz.c   |   2 +
 hw/ppc/spapr_iommu.c  |   8 +-
 hw/timer/lm32_timer.c |   6 +-
 hw/timer/milkymist-sysctl.c   |  10 +-
 hw/vfio/common.c  |  52 +++--
 hw/vfio/spapr.c   |   4 +-
 hw/virtio/vhost.c |   9 +-
 include/exec/memory.h |  21 +-
 include/hw/vfio/vfio-common.h |   2 +-
 include/sysemu/kvm.h  |   2 +-
 memory.c  |  31 +--
 scripts/kvm/vmxcap|  14 +-
 scsi/qemu-pr-helper.c |   6 +-
 target/i386/cpu.c | 447 +-
 target/i386/cpu.h | 146 +-
 target/i386/helper.c  |   5 +-
 target/i386/kvm.c | 185 -
 tests/docker/test-clang   |   4 +-
 tests/docker/test-debug   |   1 +
 tests/migration-test.c| 157 +--
 tests/test-char.c |   4 +-
 util/async.c  |   6 +-
 util/oslib-win32.c|   6 +-
 38 files changed, 1038 insertions(+), 244 deletions(-)
-- 
1.8.3.1




Re: [PULL 12/30] Makefile: Remove generated files when doing 'distclean'

2019-10-04 Thread Paolo Bonzini
On 04/10/19 14:20, Peter Maydell wrote:
> On Wed, 2 Oct 2019 at 18:07, Paolo Bonzini  wrote:
>>
>> From: Thomas Huth 
>>
>> When running "make distclean" we currently leave a lot of generated
>> files in the build directory. Fix that.
>>
>> Signed-off-by: Thomas Huth 
>> Reviewed-by: John Snow 
>> Signed-off-by: Paolo Bonzini 
>> ---
> 
>> diff --git a/tests/Makefile.include b/tests/Makefile.include
>> index 3543451..48b52da 100644
>> --- a/tests/Makefile.include
>> +++ b/tests/Makefile.include
>> @@ -1176,11 +1176,21 @@ check: check-block check-qapi-schema check-unit 
>> check-softfloat check-qtest chec
>>  check-clean:
>> rm -rf $(check-unit-y) tests/*.o $(QEMU_IOTESTS_HELPERS-y)
>> rm -rf $(sort $(foreach target,$(SYSEMU_TARGET_LIST), 
>> $(check-qtest-$(target)-y)) $(check-qtest-generic-y))
>> -   rm -f tests/test-qapi-gen-timestamp
>> rm -rf $(TESTS_VENV_DIR) $(TESTS_RESULTS_DIR)
>> +   rm -f tests/qemu-iotests/common.env tests/qemu-iotests/check.*
>> +   rm -f tests/test-qapi-gen-timestamp tests/qht-bench$(EXESUF) \
>> +   tests/fp/fp-test tests/fp/*.out tests/qapi-schema/*.test.*
>>
>>  clean: check-clean
> 
> Hi; this change breaks the sequence
>  'make clean; make; make check'
> 
> because now 'make clean' removes tests/qemu-iotests/common.env.
> But this file is created by 'configure', not by 'make', so if there's
> no other reason why 'make' needs to re-run configure then we get
> to the 'make check' stage with the file not existing, and then
> when we try to run the iotests they fail with:
> 
> ./check: line 60:
> /home/petmay01/linaro/qemu-for-merges/build/alldbg/tests/qemu-iotests/common.env:
> No such file or directory
> check: failed to source common.env (make sure the qemu-iotests are run
> from tests/qemu-iotests in the build tree)
> /home/petmay01/linaro/qemu-for-merges/tests/Makefile.include:1102:
> recipe for target 'check-tests/check-block.sh' failed
> 
> thanks
> -- PMM
> 

I've dropped this patch and will send v3 that adds back the VMX patches.

Paolo



Re: libvirt? qemu change that mmaps ELF files breaks libvirt svirt handling for os.kernel

2019-10-04 Thread Paolo Bonzini
On 04/10/19 14:47, Christian Borntraeger wrote:
>> Please file a bz entry against the selinux-policy component for
>> whatever distro you're using, and/or Fedora rawhide, and CC me
>> on it too.
> 
> Done. This was on Fedora 30.
> https://bugzilla.redhat.com/show_bug.cgi?id=1758525
> 
>  Now sure about others like RHEL. RHV.
> 

We'll take care of that.  Thanks!

Paolo



Re: [PATCH 4/4] Revert "mirror: Only mirror granularity-aligned chunks"

2019-10-04 Thread Max Reitz
On 12.09.19 17:13, Vladimir Sementsov-Ogievskiy wrote:
> This reverts commit 9adc1cb49af8d4e54f57980b1eed5c0a4b2dafa6.
> "mirror: Only mirror granularity-aligned chunks"
> 
> Since previous commit unaligned chunks are supported by
> do_sync_target_write.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/mirror.c | 29 -
>  1 file changed, 29 deletions(-)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 3/4] block/mirror: support unaligned write in active mirror

2019-10-04 Thread Max Reitz
On 12.09.19 17:13, Vladimir Sementsov-Ogievskiy wrote:
> Prior 9adc1cb49af8d do_sync_target_write had a bug: it reset aligned-up
> region in the dirty bitmap, which means that we may not copy some bytes
> and assume them copied, which actually leads to producing corrupted
> target.
> 
> So 9adc1cb49af8d forced dirty bitmap granularity to be
> request_alignment for mirror-top filter, so we are not working with
> unaligned requests. However forcing large alignment obviously decreases
> performance of unaligned requests.
> 
> This commit provides another solution for the problem: if unaligned
> padding is already dirty, we can safely ignore it, as
> 1. It's dirty, it will be copied by mirror_iteration anyway
> 2. It's dirty, so skipping it now we don't increase dirtiness of the
>bitmap and therefore don't damage "synchronicity" of the
>write-blocking mirror.
> 
> If unaligned padding is not dirty, we just write it, no reason to touch
> dirty bitmap if we succeed (on failure we'll set the whole region
> ofcourse, but we loss "synchronicity" on failure anyway).
> 
> Note: we need to disable dirty_bitmap, otherwise we will not be able to
> see in do_sync_target_write bitmap state before current operation. We
> may of course check dirty bitmap before the operation in
> bdrv_mirror_top_do_write and remember it, but we don't need active
> dirty bitmap for write-blocking mirror anyway.
> 
> New code-path is unused until the following commit reverts
> 9adc1cb49af8d.
> 
> Suggested-by: Denis V. Lunev 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/mirror.c | 39 ++-
>  1 file changed, 38 insertions(+), 1 deletion(-)
> 
> diff --git a/block/mirror.c b/block/mirror.c
> index d176bf5920..d192f6a96b 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -1204,6 +1204,39 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod 
> method,
>   QEMUIOVector *qiov, int flags)
>  {
>  int ret;
> +size_t qiov_offset = 0;
> +
> +if (!QEMU_IS_ALIGNED(offset, job->granularity) &&
> +bdrv_dirty_bitmap_get(job->dirty_bitmap, offset)) {
> +/*
> + * Dirty unaligned padding
> + * 1. It's already dirty, no damage to "actively_synced" if we 
> just
> + *skip unaligned part.
> + * 2. If we copy it, we can't reset corresponding bit in
> + *dirty_bitmap as there may be some "dirty" bytes still not
> + *copied.
> + * So, just ignore it.
> + */
> +qiov_offset = QEMU_ALIGN_UP(offset, job->granularity) - offset;
> +if (bytes <= qiov_offset) {
> +/* nothing to do after shrink */
> +return;
> +}
> +offset += qiov_offset;
> +bytes -= qiov_offset;
> +}
> +
> +if (!QEMU_IS_ALIGNED(offset + bytes, job->granularity) &&
> +bdrv_dirty_bitmap_get(job->dirty_bitmap, offset + bytes - 1))
> +{
> +uint64_t tail = (offset + bytes) % job->granularity;
> +
> +if (bytes <= tail) {
> +/* nothing to do after shrink */
> +return;
> +}
> +bytes -= tail;
> +}
>  
>  bdrv_reset_dirty_bitmap(job->dirty_bitmap, offset, bytes);
>  

The bdrv_set_dirty_bitmap() in the error case below needs to use the
original offset/bytes, I suppose.

Apart from that, looks good to me.

Max



signature.asc
Description: OpenPGP digital signature


[PATCH 4/5] hw/arm/boot: Expose the pmem nodes in the DT

2019-10-04 Thread Shameer Kolothum
From: Eric Auger 

In case of NV-DIMM slots, let's add /pmem DT nodes.

Signed-off-by: Eric Auger 
Signed-off-by: Shameer Kolothum 
---
 hw/arm/boot.c | 45 +
 1 file changed, 45 insertions(+)

diff --git a/hw/arm/boot.c b/hw/arm/boot.c
index c264864c11..bd6d72b33e 100644
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -20,6 +20,7 @@
 #include "hw/boards.h"
 #include "sysemu/reset.h"
 #include "hw/loader.h"
+#include "hw/mem/memory-device.h"
 #include "elf.h"
 #include "sysemu/device_tree.h"
 #include "qemu/config-file.h"
@@ -523,6 +524,44 @@ static void fdt_add_psci_node(void *fdt)
 qemu_fdt_setprop_cell(fdt, "/psci", "migrate", migrate_fn);
 }
 
+static int fdt_add_pmem_node(void *fdt, uint32_t acells, uint32_t scells)
+{
+MemoryDeviceInfoList *info, *info_list = qmp_memory_device_list();
+MemoryDeviceInfo *mi;
+int ret;
+
+for (info = info_list; info != NULL; info = info->next) {
+mi = info->value;
+
+if (mi->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
+PCDIMMDeviceInfo *di = mi->u.nvdimm.data;
+char *nodename;
+
+nodename = g_strdup_printf("/pmem@%" PRIx64, di->addr);
+qemu_fdt_add_subnode(fdt, nodename);
+qemu_fdt_setprop_string(fdt, nodename, "compatible", 
"pmem-region");
+ret = qemu_fdt_setprop_sized_cells(fdt, nodename, "reg", acells,
+   di->addr, scells, di->size);
+/* only set the NUMA ID if it is specified */
+if (!ret && di->node >= 0) {
+ret = qemu_fdt_setprop_cell(fdt, nodename, "numa-node-id",
+di->node);
+}
+
+g_free(nodename);
+
+if (ret < 0) {
+fprintf(stderr, "couldn't add NVDIMM /memory@%"PRIx64" node\n",
+di->addr);
+goto out;
+}
+}
+}
+out:
+qapi_free_MemoryDeviceInfoList(info_list);
+return ret;
+}
+
 int arm_load_dtb(hwaddr addr, const struct arm_boot_info *binfo,
  hwaddr addr_limit, AddressSpace *as, MachineState *ms)
 {
@@ -622,6 +661,12 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info 
*binfo,
 }
 }
 
+rc = fdt_add_pmem_node(fdt, acells, scells);
+if (rc < 0) {
+fprintf(stderr, "couldn't add pmem memory nodes\n");
+goto fail;
+}
+
 rc = fdt_path_offset(fdt, "/chosen");
 if (rc < 0) {
 qemu_fdt_add_subnode(fdt, "/chosen");
-- 
2.17.1





[PATCH 2/5] nvdimm: Use configurable ACPI IO base and size

2019-10-04 Thread Shameer Kolothum
From: Kwangwoo Lee 

This patch makes IO base and size configurable to create NPIO AML for
ACPI NFIT. Since a different architecture like AArch64 does not use
port-mapped IO, a configurable IO base is required to create correct
mapping of ACPI IO address and size.

Signed-off-by: Kwangwoo Lee 
Signed-off-by: Eric Auger 
Signed-off-by: Shameer Kolothum 
---
 hw/acpi/nvdimm.c| 32 ++--
 hw/i386/acpi-build.c|  6 ++
 hw/i386/acpi-build.h|  3 +++
 hw/i386/pc_piix.c   |  2 ++
 hw/i386/pc_q35.c|  2 ++
 include/hw/mem/nvdimm.h |  3 +++
 6 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 9fdad6dc3f..f91eea3802 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -926,11 +926,13 @@ void nvdimm_acpi_plug_cb(HotplugHandler *hotplug_dev, 
DeviceState *dev)
 }
 
 void nvdimm_init_acpi_state(NVDIMMState *state, MemoryRegion *io,
+struct AcpiGenericAddress dsm_io,
 FWCfgState *fw_cfg, Object *owner)
 {
+state->dsm_io = dsm_io;
 memory_region_init_io(>io_mr, owner, _dsm_ops, state,
-  "nvdimm-acpi-io", NVDIMM_ACPI_IO_LEN);
-memory_region_add_subregion(io, NVDIMM_ACPI_IO_BASE, >io_mr);
+  "nvdimm-acpi-io", dsm_io.bit_width >> 3);
+memory_region_add_subregion(io, dsm_io.address, >io_mr);
 
 state->dsm_mem = g_array_new(false, true /* clear */, 1);
 acpi_data_push(state->dsm_mem, sizeof(NvdimmDsmIn));
@@ -959,12 +961,14 @@ void nvdimm_init_acpi_state(NVDIMMState *state, 
MemoryRegion *io,
 
 #define NVDIMM_QEMU_RSVD_UUID   "648B9CF2-CDA1-4312-8AD9-49C4AF32BD62"
 
-static void nvdimm_build_common_dsm(Aml *dev)
+static void nvdimm_build_common_dsm(Aml *dev,
+NVDIMMState *nvdimm_state)
 {
 Aml *method, *ifctx, *function, *handle, *uuid, *dsm_mem, *elsectx2;
 Aml *elsectx, *unsupport, *unpatched, *expected_uuid, *uuid_invalid;
 Aml *pckg, *pckg_index, *pckg_buf, *field, *dsm_out_buf, *dsm_out_buf_size;
 uint8_t byte_list[1];
+AmlRegionSpace rs;
 
 method = aml_method(NVDIMM_COMMON_DSM, 5, AML_SERIALIZED);
 uuid = aml_arg(0);
@@ -975,9 +979,16 @@ static void nvdimm_build_common_dsm(Aml *dev)
 
 aml_append(method, aml_store(aml_name(NVDIMM_ACPI_MEM_ADDR), dsm_mem));
 
+if (nvdimm_state->dsm_io.space_id == AML_AS_SYSTEM_IO) {
+rs = AML_SYSTEM_IO;
+} else {
+rs = AML_SYSTEM_MEMORY;
+}
+
 /* map DSM memory and IO into ACPI namespace. */
-aml_append(method, aml_operation_region(NVDIMM_DSM_IOPORT, AML_SYSTEM_IO,
-   aml_int(NVDIMM_ACPI_IO_BASE), NVDIMM_ACPI_IO_LEN));
+aml_append(method, aml_operation_region(NVDIMM_DSM_IOPORT, rs,
+   aml_int(nvdimm_state->dsm_io.address),
+   nvdimm_state->dsm_io.bit_width >> 3));
 aml_append(method, aml_operation_region(NVDIMM_DSM_MEMORY,
AML_SYSTEM_MEMORY, dsm_mem, sizeof(NvdimmDsmIn)));
 
@@ -992,7 +1003,7 @@ static void nvdimm_build_common_dsm(Aml *dev)
 field = aml_field(NVDIMM_DSM_IOPORT, AML_DWORD_ACC, AML_NOLOCK,
   AML_PRESERVE);
 aml_append(field, aml_named_field(NVDIMM_DSM_NOTIFY,
-   NVDIMM_ACPI_IO_LEN * BITS_PER_BYTE));
+  (nvdimm_state->dsm_io.bit_width >> 3) * BITS_PER_BYTE));
 aml_append(method, field);
 
 /*
@@ -1260,7 +1271,8 @@ static void nvdimm_build_nvdimm_devices(Aml *root_dev, 
uint32_t ram_slots)
 }
 
 static void nvdimm_build_ssdt(GArray *table_offsets, GArray *table_data,
-  BIOSLinker *linker, GArray *dsm_dma_area,
+  BIOSLinker *linker,
+  NVDIMMState *nvdimm_state,
   uint32_t ram_slots)
 {
 Aml *ssdt, *sb_scope, *dev;
@@ -1288,7 +1300,7 @@ static void nvdimm_build_ssdt(GArray *table_offsets, 
GArray *table_data,
  */
 aml_append(dev, aml_name_decl("_HID", aml_string("ACPI0012")));
 
-nvdimm_build_common_dsm(dev);
+nvdimm_build_common_dsm(dev, nvdimm_state);
 
 /* 0 is reserved for root device. */
 nvdimm_build_device_dsm(dev, 0);
@@ -1307,7 +1319,7 @@ static void nvdimm_build_ssdt(GArray *table_offsets, 
GArray *table_data,
NVDIMM_ACPI_MEM_ADDR);
 
 bios_linker_loader_alloc(linker,
- NVDIMM_DSM_MEM_FILE, dsm_dma_area,
+ NVDIMM_DSM_MEM_FILE, nvdimm_state->dsm_mem,
  sizeof(NvdimmDsmIn), false /* high memory */);
 bios_linker_loader_add_pointer(linker,
 ACPI_BUILD_TABLE_FILE, mem_addr_offset, sizeof(uint32_t),
@@ -1329,7 +1341,7 @@ void nvdimm_build_acpi(GArray *table_offsets, GArray 
*table_data,
 return;
 }
 
-nvdimm_build_ssdt(table_offsets, table_data, linker, state->dsm_mem,
+

[PATCH 5/5] hw/arm/virt: Add nvdimm hotplug support

2019-10-04 Thread Shameer Kolothum
This adds support for nvdimm hotplug events through GED
and enables nvdimm for the arm/virt. Now Guests with DT boot
can have nvdimm cold plug and with ACPI both cold/hot plug.

Hot removal functionality is not yet supported.

Signed-off-by: Shameer Kolothum 
---
 docs/specs/acpi_hw_reduced_hotplug.rst |  1 +
 hw/acpi/generic_event_device.c | 13 +
 hw/arm/virt.c  | 20 +++-
 include/hw/acpi/generic_event_device.h |  1 +
 4 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/docs/specs/acpi_hw_reduced_hotplug.rst 
b/docs/specs/acpi_hw_reduced_hotplug.rst
index 911a98255b..e3abe975bf 100644
--- a/docs/specs/acpi_hw_reduced_hotplug.rst
+++ b/docs/specs/acpi_hw_reduced_hotplug.rst
@@ -63,6 +63,7 @@ GED IO interface (4 byte access)
 bits:
0: Memory hotplug event
1: System power down event
+   2: NVDIMM hotplug event
 2-31: Reserved
 
 **write_access:**
diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
index 9cee90cc70..ad1b684304 100644
--- a/hw/acpi/generic_event_device.c
+++ b/hw/acpi/generic_event_device.c
@@ -16,6 +16,7 @@
 #include "hw/acpi/generic_event_device.h"
 #include "hw/irq.h"
 #include "hw/mem/pc-dimm.h"
+#include "hw/mem/nvdimm.h"
 #include "hw/qdev-properties.h"
 #include "migration/vmstate.h"
 #include "qemu/error-report.h"
@@ -23,6 +24,7 @@
 static const uint32_t ged_supported_events[] = {
 ACPI_GED_MEM_HOTPLUG_EVT,
 ACPI_GED_PWR_DOWN_EVT,
+ACPI_GED_NVDIMM_HOTPLUG_EVT,
 };
 
 /*
@@ -110,6 +112,11 @@ void build_ged_aml(Aml *table, const char *name, 
HotplugHandler *hotplug_dev,
aml_notify(aml_name(ACPI_POWER_BUTTON_DEVICE),
   aml_int(0x80)));
 break;
+case ACPI_GED_NVDIMM_HOTPLUG_EVT:
+aml_append(if_ctx,
+   aml_notify(aml_name("\\_SB.NVDR"),
+  aml_int(0x80)));
+break;
 default:
 /*
  * Please make sure all the events in ged_supported_events[]
@@ -175,7 +182,11 @@ static void acpi_ged_device_plug_cb(HotplugHandler 
*hotplug_dev,
 AcpiGedState *s = ACPI_GED(hotplug_dev);
 
 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) {
+nvdimm_acpi_plug_cb(hotplug_dev, dev);
+} else {
 acpi_memory_plug_cb(hotplug_dev, >memhp_state, dev, errp);
+}
 } else {
 error_setg(errp, "virt: device plug request for unsupported device"
" type: %s", object_get_typename(OBJECT(dev)));
@@ -192,6 +203,8 @@ static void acpi_ged_send_event(AcpiDeviceIf *adev, 
AcpiEventStatusBits ev)
 sel = ACPI_GED_MEM_HOTPLUG_EVT;
 } else if (ev & ACPI_POWER_DOWN_STATUS) {
 sel = ACPI_GED_PWR_DOWN_EVT;
+} else if (ev & ACPI_NVDIMM_HOTPLUG_STATUS) {
+sel = ACPI_GED_NVDIMM_HOTPLUG_EVT;
 } else {
 /* Unknown event. Return without generating interrupt. */
 warn_report("GED: Unsupported event %d. No irq injected", ev);
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 30bc8a7803..acdcba9638 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -543,6 +543,10 @@ static inline DeviceState 
*create_acpi_ged(VirtMachineState *vms, qemu_irq *pic)
 event |= ACPI_GED_MEM_HOTPLUG_EVT;
 }
 
+if (ms->nvdimms_state->is_enabled) {
+event |= ACPI_GED_NVDIMM_HOTPLUG_EVT;
+}
+
 dev = qdev_create(NULL, TYPE_ACPI_GED);
 qdev_prop_set_uint32(dev, "ged-event", event);
 
@@ -1938,9 +1942,12 @@ static void virt_memory_pre_plug(HotplugHandler 
*hotplug_dev, DeviceState *dev,
 }
 
 if (!vms->acpi_dev) {
-error_setg(errp,
-   "memory hotplug is not enabled: missing acpi-ged device");
-return;
+/* Allow nvdimm DT or cold plug */
+if (!(is_nvdimm && !dev->hotplugged)) {
+error_setg(errp,
+   "memory hotplug is not enabled: missing acpi-ged 
device");
+return;
+ }
 }
 
 pc_dimm_pre_plug(PC_DIMM(dev), MACHINE(hotplug_dev), NULL, errp);
@@ -1964,8 +1971,10 @@ static void virt_memory_plug(HotplugHandler *hotplug_dev,
 nvdimm_plug(ms->nvdimms_state);
 }
 
-hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev);
-hhc->plug(HOTPLUG_HANDLER(vms->acpi_dev), dev, _abort);
+if (vms->acpi_dev) {
+hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev);
+hhc->plug(HOTPLUG_HANDLER(vms->acpi_dev), dev, _abort);
+}
 out:
 error_propagate(errp, local_err);
 }
@@ -2073,6 +2082,7 @@ static void virt_machine_class_init(ObjectClass *oc, void 
*data)
 hc->plug = virt_machine_device_plug_cb;
 hc->unplug_request = virt_machine_device_unplug_request_cb;
 mc->numa_mem_supported = true;
+mc->nvdimm_supported = true;
 mc->auto_enable_numa_with_memhp 

[PATCH 3/5] hw/arm/virt: Add nvdimm hot-plug infrastructure

2019-10-04 Thread Shameer Kolothum
From: Kwangwoo Lee 

Pre-plug and plug handlers are prepared for NVDIMM support.
Please note nvdimm_support is not yet enabled.

Signed-off-by: Eric Auger 
Signed-off-by: Kwangwoo Lee 
Signed-off-by: Shameer Kolothum 
---
 hw/arm/Kconfig   |  1 +
 hw/arm/virt-acpi-build.c |  6 ++
 hw/arm/virt.c| 22 +-
 hw/mem/Kconfig   |  2 +-
 include/hw/arm/virt.h|  1 +
 5 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig
index c6e7782580..851dd81289 100644
--- a/hw/arm/Kconfig
+++ b/hw/arm/Kconfig
@@ -24,6 +24,7 @@ config ARM_VIRT
 select DIMM
 select ACPI_MEMORY_HOTPLUG
 select ACPI_HW_REDUCED
+select ACPI_NVDIMM
 
 config CHEETAH
 bool
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 074e0c858e..4e63f5da48 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -44,6 +44,7 @@
 #include "hw/pci/pcie_host.h"
 #include "hw/pci/pci.h"
 #include "hw/arm/virt.h"
+#include "hw/mem/nvdimm.h"
 #include "sysemu/numa.h"
 #include "sysemu/reset.h"
 #include "kvm_arm.h"
@@ -835,6 +836,11 @@ void virt_acpi_build(VirtMachineState *vms, 
AcpiBuildTables *tables)
 }
 }
 
+if (ms->nvdimms_state->is_enabled) {
+nvdimm_build_acpi(table_offsets, tables_blob, tables->linker,
+  ms->nvdimms_state, ms->ram_slots);
+}
+
 if (its_class_name() && !vmc->no_its) {
 acpi_add_table(table_offsets, tables_blob);
 build_iort(tables_blob, tables->linker, vms);
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index d4bedc2607..30bc8a7803 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -143,6 +143,7 @@ static const MemMapEntry base_memmap[] = {
 [VIRT_SMMU] =   { 0x0905, 0x0002 },
 [VIRT_PCDIMM_ACPI] ={ 0x0907, MEMORY_HOTPLUG_IO_LEN },
 [VIRT_ACPI_GED] =   { 0x0908, ACPI_GED_EVT_SEL_LEN },
+[VIRT_NVDIMM_ACPI] ={ 0x0909, NVDIMM_ACPI_IO_LEN},
 [VIRT_MMIO] =   { 0x0a00, 0x0200 },
 /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */
 [VIRT_PLATFORM_BUS] =   { 0x0c00, 0x0200 },
@@ -1750,6 +1751,18 @@ static void machvirt_init(MachineState *machine)
 
 create_platform_bus(vms, pic);
 
+if (machine->nvdimms_state->is_enabled) {
+const struct AcpiGenericAddress arm_virt_nvdimm_acpi_dsmio = {
+.space_id = AML_AS_SYSTEM_MEMORY,
+.address = vms->memmap[VIRT_NVDIMM_ACPI].base,
+.bit_width = NVDIMM_ACPI_IO_LEN << 3
+};
+
+nvdimm_init_acpi_state(machine->nvdimms_state, sysmem,
+   arm_virt_nvdimm_acpi_dsmio,
+   vms->fw_cfg, OBJECT(vms));
+}
+
 vms->bootinfo.ram_size = machine->ram_size;
 vms->bootinfo.nb_cpus = smp_cpus;
 vms->bootinfo.board_id = -1;
@@ -1916,9 +1929,10 @@ static void virt_memory_pre_plug(HotplugHandler 
*hotplug_dev, DeviceState *dev,
  Error **errp)
 {
 VirtMachineState *vms = VIRT_MACHINE(hotplug_dev);
+MachineState *ms = MACHINE(hotplug_dev);
 const bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
 
-if (is_nvdimm) {
+if (is_nvdimm && (!ms->nvdimms_state->is_enabled)) {
 error_setg(errp, "nvdimm is not yet supported");
 return;
 }
@@ -1937,6 +1951,8 @@ static void virt_memory_plug(HotplugHandler *hotplug_dev,
 {
 HotplugHandlerClass *hhc;
 VirtMachineState *vms = VIRT_MACHINE(hotplug_dev);
+MachineState *ms = MACHINE(hotplug_dev);
+bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
 Error *local_err = NULL;
 
 pc_dimm_plug(PC_DIMM(dev), MACHINE(vms), _err);
@@ -1944,6 +1960,10 @@ static void virt_memory_plug(HotplugHandler *hotplug_dev,
 goto out;
 }
 
+if (is_nvdimm) {
+nvdimm_plug(ms->nvdimms_state);
+}
+
 hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev);
 hhc->plug(HOTPLUG_HANDLER(vms->acpi_dev), dev, _abort);
 out:
diff --git a/hw/mem/Kconfig b/hw/mem/Kconfig
index 620fd4cb59..0d5f8f321a 100644
--- a/hw/mem/Kconfig
+++ b/hw/mem/Kconfig
@@ -8,4 +8,4 @@ config MEM_DEVICE
 config NVDIMM
 bool
 default y
-depends on PC
+depends on PC || ARM_VIRT
diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
index 0b41083e9d..06d5e75611 100644
--- a/include/hw/arm/virt.h
+++ b/include/hw/arm/virt.h
@@ -79,6 +79,7 @@ enum {
 VIRT_SECURE_MEM,
 VIRT_PCDIMM_ACPI,
 VIRT_ACPI_GED,
+VIRT_NVDIMM_ACPI,
 VIRT_LOWMEMMAP_LAST,
 };
 
-- 
2.17.1





[PATCH 0/5] ARM virt: Add NVDIMM support

2019-10-04 Thread Shameer Kolothum
This series adds NVDIMM support to arm/virt platform.
This has a dependency on [0] and make use of the GED
device for NVDIMM hotplug events. The series reuses
some of the patches posted by Eric in his earlier
attempt here[1].

Patch 1/5 is a fix to the Guest reboot issue on NVDIMM
hot add case described here[2].

I have done basic sanity testing of NVDIMM deviecs with
both ACPI and DT Guest boot. Further testing is always
welcome.

Please let me know your feedback.

Thanks,
Shameer

[0] https://patchwork.kernel.org/cover/11150345/
[1] https://patchwork.kernel.org/cover/10830777/
[2] https://patchwork.kernel.org/patch/11154757/

Eric Auger (1):
  hw/arm/boot: Expose the pmem nodes in the DT

Kwangwoo Lee (2):
  nvdimm: Use configurable ACPI IO base and size
  hw/arm/virt: Add nvdimm hot-plug infrastructure

Shameer Kolothum (2):
  hw/arm: Align ACPI blob len to PAGE size
  hw/arm/virt: Add nvdimm hotplug support

 docs/specs/acpi_hw_reduced_hotplug.rst |  1 +
 hw/acpi/generic_event_device.c | 13 
 hw/acpi/nvdimm.c   | 32 --
 hw/arm/Kconfig |  1 +
 hw/arm/boot.c  | 45 ++
 hw/arm/virt-acpi-build.c   | 20 
 hw/arm/virt.c  | 42 
 hw/i386/acpi-build.c   |  6 
 hw/i386/acpi-build.h   |  3 ++
 hw/i386/pc_piix.c  |  2 ++
 hw/i386/pc_q35.c   |  2 ++
 hw/mem/Kconfig |  2 +-
 include/hw/acpi/generic_event_device.h |  1 +
 include/hw/arm/virt.h  |  1 +
 include/hw/mem/nvdimm.h|  3 ++
 15 files changed, 157 insertions(+), 17 deletions(-)

-- 
2.17.1





[PATCH 1/5] block: Mark commit and mirror as filter drivers

2019-10-04 Thread Vladimir Sementsov-Ogievskiy
From: Max Reitz 

The commit and mirror block nodes are filters, so they should be marked
as such.

Signed-off-by: Max Reitz 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
   [squash comment fix from another Max's patch and adjust commit msg]
---
 include/block/block_int.h | 8 +---
 block/commit.c| 2 ++
 block/mirror.c| 2 ++
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 0422acdf1c..4af0fa8fe4 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -89,9 +89,11 @@ struct BlockDriver {
 int instance_size;
 
 /* set to true if the BlockDriver is a block filter. Block filters pass
- * certain callbacks that refer to data (see block.c) to their bs->file if
- * the driver doesn't implement them. Drivers that do not wish to forward
- * must implement them and return -ENOTSUP.
+ * certain callbacks that refer to data (see block.c) to their bs->file
+ * or bs->backing (whichever one exists) if the driver doesn't implement
+ * them. Drivers that do not wish to forward must implement them and return
+ * -ENOTSUP.
+ * Note that filters are not allowed to modify data.
  */
 bool is_filter;
 /* for snapshots block filter like Quorum can implement the
diff --git a/block/commit.c b/block/commit.c
index bc8454463d..07e7a3cd0a 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -253,6 +253,8 @@ static BlockDriver bdrv_commit_top = {
 .bdrv_co_block_status   = bdrv_co_block_status_from_backing,
 .bdrv_refresh_filename  = bdrv_commit_top_refresh_filename,
 .bdrv_child_perm= bdrv_commit_top_child_perm,
+
+.is_filter  = true,
 };
 
 void commit_start(const char *job_id, BlockDriverState *bs,
diff --git a/block/mirror.c b/block/mirror.c
index fe984efb90..838781dfaa 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1487,6 +1487,8 @@ static BlockDriver bdrv_mirror_top = {
 .bdrv_refresh_filename  = bdrv_mirror_top_refresh_filename,
 .bdrv_child_perm= bdrv_mirror_top_child_perm,
 .bdrv_refresh_limits= bdrv_mirror_top_refresh_limits,
+
+.is_filter  = true,
 };
 
 static BlockJob *mirror_start_job(
-- 
2.21.0




[PATCH 0/5] fix migration with bitmaps and mirror

2019-10-04 Thread Vladimir Sementsov-Ogievskiy
Hi all!

It's a continuation for
"bitmap migration bug with -drive while block mirror runs"
<315cff78-dcdb-a3ce-2742-da3cc9f0c...@redhat.com>
https://lists.gnu.org/archive/html/qemu-devel/2019-09/msg07241.html

The problem is that bitmaps migrated to node with same node-name or
blk-parent name. And currently only the latter actually work in libvirt.
And with mirror-top filter it doesn't work, because
bdrv_get_device_or_node_name don't go through filters.

Fix this by handling filtered children of block backends in separate.

Max Reitz (1):
  block: Mark commit and mirror as filter drivers

Vladimir Sementsov-Ogievskiy (4):
  migretion/block-dirty-bitmap: refactor init_dirty_bitmap_migration
  block/dirty-bitmap: add bdrv_has_named_bitmaps helper
  migration/block-dirty-bitmap: fix bitmaps migration during mirror job
  iotests: 194: test also migration of dirty bitmap

 include/block/block_int.h  |   8 ++-
 include/block/dirty-bitmap.h   |   1 +
 block/commit.c |   2 +
 block/dirty-bitmap.c   |  13 
 block/mirror.c |   2 +
 migration/block-dirty-bitmap.c | 118 +++--
 tests/qemu-iotests/194 |  14 ++--
 tests/qemu-iotests/194.out |   6 ++
 8 files changed, 121 insertions(+), 43 deletions(-)

-- 
2.21.0




[PATCH 1/5] hw/arm: Align ACPI blob len to PAGE size

2019-10-04 Thread Shameer Kolothum
If ACPI blob length modifications happens after the initial
virt_acpi_build() call, and the changed blob length is within
the PAGE size boundary, then the revised size is not seen by
the firmware on Guest reboot. The is because in the
virt_acpi_build_update() -> acpi_ram_update() -> qemu_ram_resize()
path, qemu_ram_resize() uses ram_block size which is aligned
to PAGE size and the "resize callback" to update the size seen
by firmware is not getting invoked. Hence align ACPI blob sizes
to PAGE boundary.

Signed-off-by: Shameer Kolothum 
---
More details on this issue can be found here,
https://patchwork.kernel.org/patch/11154757/

---
 hw/arm/virt-acpi-build.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 4cd50175e0..074e0c858e 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -790,6 +790,7 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables 
*tables)
 GArray *table_offsets;
 unsigned dsdt, xsdt;
 GArray *tables_blob = tables->table_data;
+GArray *cmd_blob = tables->linker->cmd_blob;
 MachineState *ms = MACHINE(vms);
 
 table_offsets = g_array_new(false, true /* clear */,
@@ -854,6 +855,19 @@ void virt_acpi_build(VirtMachineState *vms, 
AcpiBuildTables *tables)
 build_rsdp(tables->rsdp, tables->linker, _data);
 }
 
+/*
+ * Align the ACPI blob lengths to PAGE size so that on ACPI table
+ * regeneration, the length that firmware sees really gets updated
+ * through 'resize' callback in qemu_ram_resize() in the
+ * virt_acpi_build_update() -> acpi_ram_update() -> qemu_ram_resize()
+ * path.
+ */
+g_array_set_size(tables_blob,
+ TARGET_PAGE_ALIGN(acpi_data_len(tables_blob)));
+g_array_set_size(tables->rsdp,
+ TARGET_PAGE_ALIGN(acpi_data_len(tables->rsdp)));
+g_array_set_size(cmd_blob,
+ TARGET_PAGE_ALIGN(acpi_data_len(cmd_blob)));
 /* Cleanup memory that's no longer used. */
 g_array_free(table_offsets, true);
 }
-- 
2.17.1





[PATCH 4/5] migration/block-dirty-bitmap: fix bitmaps migration during mirror job

2019-10-04 Thread Vladimir Sementsov-Ogievskiy
Important thing for bitmap migration is to select destination block
node to obtain the migrated bitmap.

Prepatch, on source we use bdrv_get_device_or_node_name() to identify
the node, and on target we do bdrv_lookup_bs.
bdrv_get_device_or_node_name() returns blk name only for direct
children of blk. So, bitmaps of direct children of blks are migrated by
blk name and others - by node name.

Libvirt currently is unprepared to bitmap migration by node-name,
node-names are mostly auto-generated. So actually only migration by blk
name works.

Now, consider classic libvirt migrations assisted by mirror block job:
mirror block job inserts filter, so our source is not a direct child of
blk, and bitmaps are migrated by node-names. And this just don't work.

Let's fix it by allowing use blk-name even if some implicit filters are
inserted.

Note, that we possibly want to allow explicit filters skipping too, but
this is another story.

Note2: we, of course, can't skip filters and use blk name to migrate
bitmaps in filtered node by blk name for this blk if these filters have
named bitmaps which should be migrated.

Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1652424
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 migration/block-dirty-bitmap.c | 39 +-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index 46641b7861..3105479c50 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -321,14 +321,48 @@ static int init_dirty_bitmap_migration(void)
 {
 BlockDriverState *bs;
 DirtyBitmapMigBitmapState *dbms;
+GHashTable *handled_by_blk = g_hash_table_new(NULL, NULL);
+BlockBackend *blk;
 
 dirty_bitmap_mig_state.bulk_completed = false;
 dirty_bitmap_mig_state.prev_bs = NULL;
 dirty_bitmap_mig_state.prev_bitmap = NULL;
 dirty_bitmap_mig_state.no_bitmaps = false;
 
+/*
+ * Use blockdevice name for direct (or filtered) children of named block
+ * backends.
+ */
+for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
+const char *name = blk_name(blk);
+
+if (!name || strcmp(name, "") == 0) {
+continue;
+}
+
+bs = blk_bs(blk);
+
+/* Skip filters without bitmaos */
+while (bs && bs->drv && bs->drv->is_filter &&
+   !bdrv_has_named_bitmaps(bs))
+{
+bs = bs->backing->bs ?: bs->file->bs;
+}
+
+if (bs && bs->drv && !bs->drv->is_filter) {
+if (add_bitmaps_to_list(bs, name)) {
+goto fail;
+}
+g_hash_table_add(handled_by_blk, bs);
+}
+}
+
 for (bs = bdrv_next_all_states(NULL); bs; bs = bdrv_next_all_states(bs)) {
-if (add_bitmaps_to_list(bs, bdrv_get_device_or_node_name(bs))) {
+if (g_hash_table_contains(handled_by_blk, bs)) {
+continue;
+}
+
+if (add_bitmaps_to_list(bs, bdrv_get_node_name(bs))) {
 goto fail;
 }
 }
@@ -342,9 +376,12 @@ static int init_dirty_bitmap_migration(void)
 dirty_bitmap_mig_state.no_bitmaps = true;
 }
 
+g_hash_table_destroy(handled_by_blk);
+
 return 0;
 
 fail:
+g_hash_table_destroy(handled_by_blk);
 dirty_bitmap_mig_cleanup();
 
 return -1;
-- 
2.21.0




[PATCH 3/5] block/dirty-bitmap: add bdrv_has_named_bitmaps helper

2019-10-04 Thread Vladimir Sementsov-Ogievskiy
To be used for bitmap migration in further commit.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/dirty-bitmap.h |  1 +
 block/dirty-bitmap.c | 13 +
 2 files changed, 14 insertions(+)

diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index 4b4b731b46..4c0ebe5c2a 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -101,6 +101,7 @@ int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap);
 void bdrv_dirty_bitmap_truncate(BlockDriverState *bs, int64_t bytes);
 bool bdrv_dirty_bitmap_readonly(const BdrvDirtyBitmap *bitmap);
 bool bdrv_has_readonly_bitmaps(BlockDriverState *bs);
+bool bdrv_has_named_bitmaps(BlockDriverState *bs);
 bool bdrv_dirty_bitmap_get_autoload(const BdrvDirtyBitmap *bitmap);
 bool bdrv_dirty_bitmap_get_persistence(BdrvDirtyBitmap *bitmap);
 bool bdrv_dirty_bitmap_inconsistent(const BdrvDirtyBitmap *bitmap);
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 134e0c9a0c..cfc957ebc2 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -742,6 +742,19 @@ bool bdrv_has_readonly_bitmaps(BlockDriverState *bs)
 return false;
 }
 
+bool bdrv_has_named_bitmaps(BlockDriverState *bs)
+{
+BdrvDirtyBitmap *bm;
+
+QLIST_FOREACH(bm, >dirty_bitmaps, list) {
+if (bdrv_dirty_bitmap_name(bm)) {
+return true;
+}
+}
+
+return false;
+}
+
 /* Called with BQL taken. */
 void bdrv_dirty_bitmap_set_persistence(BdrvDirtyBitmap *bitmap, bool 
persistent)
 {
-- 
2.21.0




[PATCH 5/5] iotests: 194: test also migration of dirty bitmap

2019-10-04 Thread Vladimir Sementsov-Ogievskiy
Test that dirty bitmap migration works when we deal with mirror.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/194 | 14 ++
 tests/qemu-iotests/194.out |  6 ++
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/tests/qemu-iotests/194 b/tests/qemu-iotests/194
index d746ab1e21..bcf55a03e1 100755
--- a/tests/qemu-iotests/194
+++ b/tests/qemu-iotests/194
@@ -42,6 +42,8 @@ with iotests.FilePath('source.img') as source_img_path, \
 .add_incoming('unix:{0}'.format(migration_sock_path))
 .launch())
 
+source_vm.qmp_log('block-dirty-bitmap-add', node='drive0', name='bitmap0')
+
 iotests.log('Launching NBD server on destination...')
 iotests.log(dest_vm.qmp('nbd-server-start', addr={'type': 'unix', 'data': 
{'path': nbd_sock_path}}))
 iotests.log(dest_vm.qmp('nbd-server-add', device='drive0', writable=True))
@@ -61,12 +63,14 @@ with iotests.FilePath('source.img') as source_img_path, \
 filters=[iotests.filter_qmp_event])
 
 iotests.log('Starting migration...')
-source_vm.qmp('migrate-set-capabilities',
-  capabilities=[{'capability': 'events', 'state': True}])
-dest_vm.qmp('migrate-set-capabilities',
-capabilities=[{'capability': 'events', 'state': True}])
+capabilities = [{'capability': 'events', 'state': True},
+{'capability': 'dirty-bitmaps', 'state': True}]
+source_vm.qmp('migrate-set-capabilities', capabilities=capabilities)
+dest_vm.qmp('migrate-set-capabilities', capabilities=capabilities)
 iotests.log(source_vm.qmp('migrate', 
uri='unix:{0}'.format(migration_sock_path)))
 
+source_vm.qmp_log('migrate-start-postcopy')
+
 while True:
 event1 = source_vm.event_wait('MIGRATION')
 iotests.log(event1, filters=[iotests.filter_qmp_event])
@@ -82,3 +86,5 @@ with iotests.FilePath('source.img') as source_img_path, \
 iotests.log('Stopping the NBD server on destination...')
 iotests.log(dest_vm.qmp('nbd-server-stop'))
 break
+
+iotests.log(source_vm.qmp('query-block')['return'][0]['dirty-bitmaps'])
diff --git a/tests/qemu-iotests/194.out b/tests/qemu-iotests/194.out
index 71857853fb..dd60dcc14f 100644
--- a/tests/qemu-iotests/194.out
+++ b/tests/qemu-iotests/194.out
@@ -1,4 +1,6 @@
 Launching VMs...
+{"execute": "block-dirty-bitmap-add", "arguments": {"name": "bitmap0", "node": 
"drive0"}}
+{"return": {}}
 Launching NBD server on destination...
 {"return": {}}
 {"return": {}}
@@ -8,11 +10,15 @@ Waiting for `drive-mirror` to complete...
 {"data": {"device": "mirror-job0", "len": 1073741824, "offset": 1073741824, 
"speed": 0, "type": "mirror"}, "event": "BLOCK_JOB_READY", "timestamp": 
{"microseconds": "USECS", "seconds": "SECS"}}
 Starting migration...
 {"return": {}}
+{"execute": "migrate-start-postcopy", "arguments": {}}
+{"return": {}}
 {"data": {"status": "setup"}, "event": "MIGRATION", "timestamp": 
{"microseconds": "USECS", "seconds": "SECS"}}
 {"data": {"status": "active"}, "event": "MIGRATION", "timestamp": 
{"microseconds": "USECS", "seconds": "SECS"}}
+{"data": {"status": "postcopy-active"}, "event": "MIGRATION", "timestamp": 
{"microseconds": "USECS", "seconds": "SECS"}}
 {"data": {"status": "completed"}, "event": "MIGRATION", "timestamp": 
{"microseconds": "USECS", "seconds": "SECS"}}
 Gracefully ending the `drive-mirror` job on source...
 {"return": {}}
 {"data": {"device": "mirror-job0", "len": 1073741824, "offset": 1073741824, 
"speed": 0, "type": "mirror"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": 
{"microseconds": "USECS", "seconds": "SECS"}}
 Stopping the NBD server on destination...
 {"return": {}}
+[{"busy": false, "count": 0, "granularity": 65536, "name": "bitmap0", 
"persistent": false, "recording": true, "status": "active"}]
-- 
2.21.0




[PATCH 2/5] migretion/block-dirty-bitmap: refactor init_dirty_bitmap_migration

2019-10-04 Thread Vladimir Sementsov-Ogievskiy
Split out handling one bs, it is needed for the following commit, which
will handle BlockBackends in separate.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 migration/block-dirty-bitmap.c | 93 +++---
 1 file changed, 51 insertions(+), 42 deletions(-)

diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index 5121f86d73..46641b7861 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -268,59 +268,68 @@ static void dirty_bitmap_mig_cleanup(void)
 }
 
 /* Called with iothread lock taken. */
-static int init_dirty_bitmap_migration(void)
+static int add_bitmaps_to_list(BlockDriverState *bs, const char *bs_name)
 {
-BlockDriverState *bs;
 BdrvDirtyBitmap *bitmap;
 DirtyBitmapMigBitmapState *dbms;
 Error *local_err = NULL;
 
-dirty_bitmap_mig_state.bulk_completed = false;
-dirty_bitmap_mig_state.prev_bs = NULL;
-dirty_bitmap_mig_state.prev_bitmap = NULL;
-dirty_bitmap_mig_state.no_bitmaps = false;
+for (bitmap = bdrv_dirty_bitmap_next(bs, NULL); bitmap;
+ bitmap = bdrv_dirty_bitmap_next(bs, bitmap))
+{
+if (!bdrv_dirty_bitmap_name(bitmap)) {
+continue;
+}
 
-for (bs = bdrv_next_all_states(NULL); bs; bs = bdrv_next_all_states(bs)) {
-const char *name = bdrv_get_device_or_node_name(bs);
+if (!bs_name || strcmp(bs_name, "") == 0) {
+error_report("Found bitmap '%s' in unnamed node %p. It can't "
+ "be migrated", bdrv_dirty_bitmap_name(bitmap), bs);
+return -1;
+}
 
-for (bitmap = bdrv_dirty_bitmap_next(bs, NULL); bitmap;
- bitmap = bdrv_dirty_bitmap_next(bs, bitmap))
-{
-if (!bdrv_dirty_bitmap_name(bitmap)) {
-continue;
-}
+if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_DEFAULT, _err)) {
+error_report_err(local_err);
+return -1;
+}
 
-if (!name || strcmp(name, "") == 0) {
-error_report("Found bitmap '%s' in unnamed node %p. It can't "
- "be migrated", bdrv_dirty_bitmap_name(bitmap), 
bs);
-goto fail;
-}
+bdrv_ref(bs);
+bdrv_dirty_bitmap_set_busy(bitmap, true);
+
+dbms = g_new0(DirtyBitmapMigBitmapState, 1);
+dbms->bs = bs;
+dbms->node_name = bs_name;
+dbms->bitmap = bitmap;
+dbms->total_sectors = bdrv_nb_sectors(bs);
+dbms->sectors_per_chunk = CHUNK_SIZE * 8 *
+bdrv_dirty_bitmap_granularity(bitmap) >> BDRV_SECTOR_BITS;
+if (bdrv_dirty_bitmap_enabled(bitmap)) {
+dbms->flags |= DIRTY_BITMAP_MIG_START_FLAG_ENABLED;
+}
+if (bdrv_dirty_bitmap_get_persistence(bitmap)) {
+dbms->flags |= DIRTY_BITMAP_MIG_START_FLAG_PERSISTENT;
+}
 
-if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_DEFAULT,
-_err)) {
-error_report_err(local_err);
-goto fail;
-}
+QSIMPLEQ_INSERT_TAIL(_bitmap_mig_state.dbms_list,
+ dbms, entry);
+}
 
-bdrv_ref(bs);
-bdrv_dirty_bitmap_set_busy(bitmap, true);
-
-dbms = g_new0(DirtyBitmapMigBitmapState, 1);
-dbms->bs = bs;
-dbms->node_name = name;
-dbms->bitmap = bitmap;
-dbms->total_sectors = bdrv_nb_sectors(bs);
-dbms->sectors_per_chunk = CHUNK_SIZE * 8 *
-bdrv_dirty_bitmap_granularity(bitmap) >> BDRV_SECTOR_BITS;
-if (bdrv_dirty_bitmap_enabled(bitmap)) {
-dbms->flags |= DIRTY_BITMAP_MIG_START_FLAG_ENABLED;
-}
-if (bdrv_dirty_bitmap_get_persistence(bitmap)) {
-dbms->flags |= DIRTY_BITMAP_MIG_START_FLAG_PERSISTENT;
-}
+return 0;
+}
+
+/* Called with iothread lock taken. */
+static int init_dirty_bitmap_migration(void)
+{
+BlockDriverState *bs;
+DirtyBitmapMigBitmapState *dbms;
+
+dirty_bitmap_mig_state.bulk_completed = false;
+dirty_bitmap_mig_state.prev_bs = NULL;
+dirty_bitmap_mig_state.prev_bitmap = NULL;
+dirty_bitmap_mig_state.no_bitmaps = false;
 
-QSIMPLEQ_INSERT_TAIL(_bitmap_mig_state.dbms_list,
- dbms, entry);
+for (bs = bdrv_next_all_states(NULL); bs; bs = bdrv_next_all_states(bs)) {
+if (add_bitmaps_to_list(bs, bdrv_get_device_or_node_name(bs))) {
+goto fail;
 }
 }
 
-- 
2.21.0




Re: [PATCH v5 0/5] iotests: use python logging

2019-10-04 Thread Max Reitz
On 18.09.19 01:45, John Snow wrote:
> This series uses python logging to enable output conditionally on
> iotests.log(). We unify an initialization call (which also enables
> debugging output for those tests with -d) and then make the switch
> inside of iotests.
> 
> It will help alleviate the need to create logged/unlogged versions
> of all the various helpers we have made.
> 
> V5:
>  - Rebased again
>  - Allow Python tests to run on any platform
> 
> V4:
>  - Rebased on top of kwolf/block at the behest of mreitz
> 
> V3:
>  - Rebased for 4.1+; now based on main branch.
> 
> V2:
>  - Added all of the other python tests I missed to use script_initialize
>  - Refactored the common setup as per Ehabkost's suggestion
>  - Added protocol arguments to common initialization,
>but this isn't strictly required.

I’m OK to take the series as-is (it doesn’t affect any auto tests, so we
can decide what to do about non-Linux platforms in make check at a later
point), but there seems to be something you wanted to fix up in patch 5.

(And there’s also Kevin’s pending pull request that changes a bit of
iotests.py.)

Max



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 3/4] block/mirror: support unaligned write in active mirror

2019-10-04 Thread Vladimir Sementsov-Ogievskiy
04.10.2019 18:27, Max Reitz wrote:
> On 04.10.19 17:04, Vladimir Sementsov-Ogievskiy wrote:
>> 04.10.2019 17:48, Max Reitz wrote:
>>> On 04.10.19 15:22, Vladimir Sementsov-Ogievskiy wrote:
 04.10.2019 15:59, Max Reitz wrote:
> On 03.10.19 11:34, Vladimir Sementsov-Ogievskiy wrote:
>> 02.10.2019 18:52, Max Reitz wrote:
>>> On 02.10.19 17:06, Vladimir Sementsov-Ogievskiy wrote:
 02.10.2019 18:03, Vladimir Sementsov-Ogievskiy wrote:
> 02.10.2019 17:57, Max Reitz wrote:
>> On 12.09.19 17:13, Vladimir Sementsov-Ogievskiy wrote:
>>> Prior 9adc1cb49af8d do_sync_target_write had a bug: it reset 
>>> aligned-up
>>> region in the dirty bitmap, which means that we may not copy some 
>>> bytes
>>> and assume them copied, which actually leads to producing corrupted
>>> target.
>>>
>>> So 9adc1cb49af8d forced dirty bitmap granularity to be
>>> request_alignment for mirror-top filter, so we are not working with
>>> unaligned requests. However forcing large alignment obviously 
>>> decreases
>>> performance of unaligned requests.
>>>
>>> This commit provides another solution for the problem: if unaligned
>>> padding is already dirty, we can safely ignore it, as
>>> 1. It's dirty, it will be copied by mirror_iteration anyway
>>> 2. It's dirty, so skipping it now we don't increase dirtiness of the
>>>     bitmap and therefore don't damage "synchronicity" of the
>>>     write-blocking mirror.
>>
>> But that’s not what active mirror is for.  The point of active 
>> mirror is
>> that it must converge because every guest write will contribute 
>> towards
>> that goal.
>>
>> If you skip active mirroring for unaligned guest writes, they will 
>> not
>> contribute towards converging, but in fact lead to the opposite.
>>
>
> The will not contribute only if region is already dirty. Actually, 
> after
> first iteration of mirroring (copying the whole disk), all following 
> writes
> will contribute, so the whole process must converge. It is a bit 
> similar with
> running one mirror loop in normal mode, and then enable 
> write-blocking.
>


 In other words, we don't need "all guest writes contribute" to 
 converge,
 "all guest writes don't create new dirty bits" is enough, as we have 
 parallel
 mirror iteration which contiguously handles dirty bits.
>>>
>>> Hm, in a sense.  But it does mean that guest writes will not contribute
>>> to convergence.
>>>
>>> And that’s against the current definition of write-blocking, which does
>>> state that “when data is written to the source, write it (synchronously)
>>> to the target as well”.
>>>
>>
>> Hmm, understand. But IMHO our proposed behavior is better in general.
>> Do you think it's a problem to change spec now?
>> If yes, I'll resend with an option
>
> Well, the thing is that I’d find it weird if write-blocking wasn’t
> blocking in all cases.  And in my opinion, it makes more sense for
> active mirror if all writes actively contributed to convergence.
>

 Why? What is the benefit in it?
 What is "all writes actively contributed to convergence" for user?
>>>
>>> One thing I wonder about is whether it’s really guaranteed that the
>>> background job will run under full I/O load, and how often it runs.
>>>
>>> I fear that with your model, the background job might starve and the
>>> mirror may take a very long time.
>>
>> Hmmm. I think mirror job is in same context as guest writes, and goes
>> through same IO api.. Why it will starve? (I understand that my words
>> are not an evidence...).
> 
> I thought that maybe if the disk is read to write and write all the
> time, there’d be no time for the mirror coroutine.
> 
>>>   It won’t diverge, but it also won’t
>>> really converge.
>>
>> But same will be with current behavior: guest is not guaranteed to write
>> to all parts of disk. And in most scenarios it doesn't. So, if mirror job
>> starve because of huge guest IO load, we will not converge anyway.
>>
>> So, background process is necessary thing for converge anyway.
> 
> Good point.  That convinces me.
> 
>>> The advantage of letting all writes block is that even under full I/O
>>> load, the mirror job will progress at a steady pace.
>>>
 I think for user there may be the following criteria:

 1. guaranteed converge, with any guest write load.
 Both current and my proposed variants are OK.

 2. Less impact on guest.
 Obviously my proposed variant is better

 3. Total time of mirroring
 Seems, current may be a bit better, but I don't think that unaligned

Re: [PATCH v2] s390x/tcg: MVCL: Exit to main loop if requested

2019-10-04 Thread Richard Henderson
On 10/4/19 5:34 AM, David Hildenbrand wrote:
> Or, as alternative, something like "cpu_shall_exit()" which only
> wraps the single check.

I would prefer this to the full cpu_loop_exit_restore.
It's harder to imagine what else might need doing for
some other user of the interface.


r~



Re: [PATCH 3/4] block/mirror: support unaligned write in active mirror

2019-10-04 Thread Max Reitz
On 04.10.19 17:04, Vladimir Sementsov-Ogievskiy wrote:
> 04.10.2019 17:48, Max Reitz wrote:
>> On 04.10.19 15:22, Vladimir Sementsov-Ogievskiy wrote:
>>> 04.10.2019 15:59, Max Reitz wrote:
 On 03.10.19 11:34, Vladimir Sementsov-Ogievskiy wrote:
> 02.10.2019 18:52, Max Reitz wrote:
>> On 02.10.19 17:06, Vladimir Sementsov-Ogievskiy wrote:
>>> 02.10.2019 18:03, Vladimir Sementsov-Ogievskiy wrote:
 02.10.2019 17:57, Max Reitz wrote:
> On 12.09.19 17:13, Vladimir Sementsov-Ogievskiy wrote:
>> Prior 9adc1cb49af8d do_sync_target_write had a bug: it reset 
>> aligned-up
>> region in the dirty bitmap, which means that we may not copy some 
>> bytes
>> and assume them copied, which actually leads to producing corrupted
>> target.
>>
>> So 9adc1cb49af8d forced dirty bitmap granularity to be
>> request_alignment for mirror-top filter, so we are not working with
>> unaligned requests. However forcing large alignment obviously 
>> decreases
>> performance of unaligned requests.
>>
>> This commit provides another solution for the problem: if unaligned
>> padding is already dirty, we can safely ignore it, as
>> 1. It's dirty, it will be copied by mirror_iteration anyway
>> 2. It's dirty, so skipping it now we don't increase dirtiness of the
>>    bitmap and therefore don't damage "synchronicity" of the
>>    write-blocking mirror.
>
> But that’s not what active mirror is for.  The point of active mirror 
> is
> that it must converge because every guest write will contribute 
> towards
> that goal.
>
> If you skip active mirroring for unaligned guest writes, they will not
> contribute towards converging, but in fact lead to the opposite.
>

 The will not contribute only if region is already dirty. Actually, 
 after
 first iteration of mirroring (copying the whole disk), all following 
 writes
 will contribute, so the whole process must converge. It is a bit 
 similar with
 running one mirror loop in normal mode, and then enable write-blocking.

>>>
>>>
>>> In other words, we don't need "all guest writes contribute" to converge,
>>> "all guest writes don't create new dirty bits" is enough, as we have 
>>> parallel
>>> mirror iteration which contiguously handles dirty bits.
>>
>> Hm, in a sense.  But it does mean that guest writes will not contribute
>> to convergence.
>>
>> And that’s against the current definition of write-blocking, which does
>> state that “when data is written to the source, write it (synchronously)
>> to the target as well”.
>>
>
> Hmm, understand. But IMHO our proposed behavior is better in general.
> Do you think it's a problem to change spec now?
> If yes, I'll resend with an option

 Well, the thing is that I’d find it weird if write-blocking wasn’t
 blocking in all cases.  And in my opinion, it makes more sense for
 active mirror if all writes actively contributed to convergence.

>>>
>>> Why? What is the benefit in it?
>>> What is "all writes actively contributed to convergence" for user?
>>
>> One thing I wonder about is whether it’s really guaranteed that the
>> background job will run under full I/O load, and how often it runs.
>>
>> I fear that with your model, the background job might starve and the
>> mirror may take a very long time.
> 
> Hmmm. I think mirror job is in same context as guest writes, and goes
> through same IO api.. Why it will starve? (I understand that my words
> are not an evidence...).

I thought that maybe if the disk is read to write and write all the
time, there’d be no time for the mirror coroutine.

>>  It won’t diverge, but it also won’t
>> really converge.
> 
> But same will be with current behavior: guest is not guaranteed to write
> to all parts of disk. And in most scenarios it doesn't. So, if mirror job
> starve because of huge guest IO load, we will not converge anyway.
> 
> So, background process is necessary thing for converge anyway.

Good point.  That convinces me.

>> The advantage of letting all writes block is that even under full I/O
>> load, the mirror job will progress at a steady pace.
>>
>>> I think for user there may be the following criteria:
>>>
>>> 1. guaranteed converge, with any guest write load.
>>> Both current and my proposed variants are OK.
>>>
>>> 2. Less impact on guest.
>>> Obviously my proposed variant is better
>>>
>>> 3. Total time of mirroring
>>> Seems, current may be a bit better, but I don't think that unaligned
>>> tails gives significant impact.
>>>
>>> ===
>>>
>>> So, assume I want [1]+[2]. And possibly
>>> 2.2: Even less impact on guest: ignore not only unaligned tails if they are
>>> already 

[PATCH v2] target/riscv: Expose "priv" register for GDB

2019-10-04 Thread Jonathan Behrens
This patch enables a debugger to read and write the current privilege level via
a special "priv" register. When compiled with CONFIG_USER_ONLY the register is
still visible but is hardwired to zero.

Signed-off-by: Jonathan Behrens 
---
 gdb-xml/riscv-32bit-cpu.xml |  1 +
 gdb-xml/riscv-64bit-cpu.xml |  1 +
 target/riscv/cpu.c  |  2 +-
 target/riscv/gdbstub.c  | 14 ++
 4 files changed, 17 insertions(+), 1 deletion(-)
---
Changelog V2:
- Use PRV_H and PRV_S instead of integer literals

diff --git a/gdb-xml/riscv-32bit-cpu.xml b/gdb-xml/riscv-32bit-cpu.xml
index 0d07aaec85..d6d76aafd8 100644
--- a/gdb-xml/riscv-32bit-cpu.xml
+++ b/gdb-xml/riscv-32bit-cpu.xml
@@ -44,4 +44,5 @@
   
   
   
+  
 
diff --git a/gdb-xml/riscv-64bit-cpu.xml b/gdb-xml/riscv-64bit-cpu.xml
index b8aa424ae4..0758d1b5fe 100644
--- a/gdb-xml/riscv-64bit-cpu.xml
+++ b/gdb-xml/riscv-64bit-cpu.xml
@@ -44,4 +44,5 @@
   
   
   
+  
 
diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index f13e298a36..347989858f 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -475,7 +475,7 @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
 cc->synchronize_from_tb = riscv_cpu_synchronize_from_tb;
 cc->gdb_read_register = riscv_cpu_gdb_read_register;
 cc->gdb_write_register = riscv_cpu_gdb_write_register;
-cc->gdb_num_core_regs = 33;
+cc->gdb_num_core_regs = 34;
 #if defined(TARGET_RISCV32)
 cc->gdb_core_xml_file = "riscv-32bit-cpu.xml";
 #elif defined(TARGET_RISCV64)
diff --git a/target/riscv/gdbstub.c b/target/riscv/gdbstub.c
index ded140e8d8..7e0822145d 100644
--- a/target/riscv/gdbstub.c
+++ b/target/riscv/gdbstub.c
@@ -278,6 +278,12 @@ int riscv_cpu_gdb_read_register(CPUState *cs, uint8_t 
*mem_buf, int n)
 return gdb_get_regl(mem_buf, env->gpr[n]);
 } else if (n == 32) {
 return gdb_get_regl(mem_buf, env->pc);
+} else if (n == 33) {
+#ifdef CONFIG_USER_ONLY
+return gdb_get_regl(mem_buf, 0);
+#else
+return gdb_get_regl(mem_buf, env->priv);
+#endif
 }
 return 0;
 }
@@ -296,6 +302,14 @@ int riscv_cpu_gdb_write_register(CPUState *cs, uint8_t 
*mem_buf, int n)
 } else if (n == 32) {
 env->pc = ldtul_p(mem_buf);
 return sizeof(target_ulong);
+} else if (n == 33) {
+#ifndef CONFIG_USER_ONLY
+env->priv = ldtul_p(mem_buf) & 0x3;
+if (env->priv == PRV_H) {
+env->priv = PRV_S;
+}
+#endif
+return sizeof(target_ulong);
 }
 return 0;
 }
-- 
2.23.0



Re: [Qemu-devel] [PATCH v9 07/17] blockdev: adds bdrv_parse_aio to use io_uring

2019-10-04 Thread Stefan Hajnoczi
On Wed, Aug 07, 2019 at 02:49:51PM +0200, Julia Suvorova via Qemu-devel wrote:
> On Wed, Aug 7, 2019 at 2:06 PM Aarushi Mehta  wrote:
> >
> >
> >
> > On Wed, 7 Aug, 2019, 17:15 Julia Suvorova,  wrote:
> >>
> >> On Fri, Aug 2, 2019 at 1:41 AM Aarushi Mehta  
> >> wrote:
> >> > +int bdrv_parse_aio(const char *mode, int *flags)
> >> > +{
> >> > +if (!strcmp(mode, "threads")) {
> >> > +/* do nothing, default */
> >> > +} else if (!strcmp(mode, "native")) {
> >> > +*flags |= BDRV_O_NATIVE_AIO;
> >>
> >> This 'if' should be covered with CONFIG_LINUX_AIO.
> >
> >
> > The aio=native definition is shared with Windows hosts' native aio and will 
> > break if it was covered.
> >
> > file-posix handles the case.
> 
> Fair enough. Then you can remove all ifdefs for io_uring from
> raw_open_common in file-posix.c as this case was already checked here.

This is not possible since the BLOCKDEV_AIO_OPTIONS_IO_URING enum
constant is conditional in the QAPI schema:

  { 'enum': 'BlockdevAioOptions',
'data': [ 'threads', 'native',
  { 'name': 'io_uring', 'if': 'defined(CONFIG_LINUX_IO_URING)' } ] }

The code can only use BLOCKDEV_AIO_OPTIONS_IO_URING if
CONFIG_LINUX_IO_URING was defined, so we cannot drop the #ifdefs in
raw_open_common().

Stefan


signature.asc
Description: PGP signature


Re: [PATCH 3/4] block/mirror: support unaligned write in active mirror

2019-10-04 Thread Vladimir Sementsov-Ogievskiy
04.10.2019 17:48, Max Reitz wrote:
> On 04.10.19 15:22, Vladimir Sementsov-Ogievskiy wrote:
>> 04.10.2019 15:59, Max Reitz wrote:
>>> On 03.10.19 11:34, Vladimir Sementsov-Ogievskiy wrote:
 02.10.2019 18:52, Max Reitz wrote:
> On 02.10.19 17:06, Vladimir Sementsov-Ogievskiy wrote:
>> 02.10.2019 18:03, Vladimir Sementsov-Ogievskiy wrote:
>>> 02.10.2019 17:57, Max Reitz wrote:
 On 12.09.19 17:13, Vladimir Sementsov-Ogievskiy wrote:
> Prior 9adc1cb49af8d do_sync_target_write had a bug: it reset 
> aligned-up
> region in the dirty bitmap, which means that we may not copy some 
> bytes
> and assume them copied, which actually leads to producing corrupted
> target.
>
> So 9adc1cb49af8d forced dirty bitmap granularity to be
> request_alignment for mirror-top filter, so we are not working with
> unaligned requests. However forcing large alignment obviously 
> decreases
> performance of unaligned requests.
>
> This commit provides another solution for the problem: if unaligned
> padding is already dirty, we can safely ignore it, as
> 1. It's dirty, it will be copied by mirror_iteration anyway
> 2. It's dirty, so skipping it now we don't increase dirtiness of the
>    bitmap and therefore don't damage "synchronicity" of the
>    write-blocking mirror.

 But that’s not what active mirror is for.  The point of active mirror 
 is
 that it must converge because every guest write will contribute towards
 that goal.

 If you skip active mirroring for unaligned guest writes, they will not
 contribute towards converging, but in fact lead to the opposite.

>>>
>>> The will not contribute only if region is already dirty. Actually, after
>>> first iteration of mirroring (copying the whole disk), all following 
>>> writes
>>> will contribute, so the whole process must converge. It is a bit 
>>> similar with
>>> running one mirror loop in normal mode, and then enable write-blocking.
>>>
>>
>>
>> In other words, we don't need "all guest writes contribute" to converge,
>> "all guest writes don't create new dirty bits" is enough, as we have 
>> parallel
>> mirror iteration which contiguously handles dirty bits.
>
> Hm, in a sense.  But it does mean that guest writes will not contribute
> to convergence.
>
> And that’s against the current definition of write-blocking, which does
> state that “when data is written to the source, write it (synchronously)
> to the target as well”.
>

 Hmm, understand. But IMHO our proposed behavior is better in general.
 Do you think it's a problem to change spec now?
 If yes, I'll resend with an option
>>>
>>> Well, the thing is that I’d find it weird if write-blocking wasn’t
>>> blocking in all cases.  And in my opinion, it makes more sense for
>>> active mirror if all writes actively contributed to convergence.
>>>
>>
>> Why? What is the benefit in it?
>> What is "all writes actively contributed to convergence" for user?
> 
> One thing I wonder about is whether it’s really guaranteed that the
> background job will run under full I/O load, and how often it runs.
> 
> I fear that with your model, the background job might starve and the
> mirror may take a very long time.

Hmmm. I think mirror job is in same context as guest writes, and goes
through same IO api.. Why it will starve? (I understand that my words
are not an evidence...).

>  It won’t diverge, but it also won’t
> really converge.

But same will be with current behavior: guest is not guaranteed to write
to all parts of disk. And in most scenarios it doesn't. So, if mirror job
starve because of huge guest IO load, we will not converge anyway.

So, background process is necessary thing for converge anyway.

> 
> The advantage of letting all writes block is that even under full I/O
> load, the mirror job will progress at a steady pace.
> 
>> I think for user there may be the following criteria:
>>
>> 1. guaranteed converge, with any guest write load.
>> Both current and my proposed variants are OK.
>>
>> 2. Less impact on guest.
>> Obviously my proposed variant is better
>>
>> 3. Total time of mirroring
>> Seems, current may be a bit better, but I don't think that unaligned
>> tails gives significant impact.
>>
>> ===
>>
>> So, assume I want [1]+[2]. And possibly
>> 2.2: Even less impact on guest: ignore not only unaligned tails if they are
>> already dirty, but full synchronous mirror operation if area is already 
>> dirty.
>>
>> How should I call this? Should it be separate mode, or option for 
>> write-blocking?
> 
> I don’t know whether it makes sense to add a separate mode or a separate
> option just for this difference.  I don’t think anyone would choose the
> non-default option.

At 

[PATCH] netmap: support git-submodule build otption

2019-10-04 Thread Giuseppe Lettieri
From: Giuseppe Lettieri 

With this patch, netmap support can be enabled with
the following options to the configure script:

  --enable-netmap[=system]

Use the host system netmap installation.
Fail if not found.

  --enable-netmap=git

clone the official netmap repository on
github (mostly useful for CI)

Signed-off-by: Giuseppe Lettieri 
---
 .gitmodules |  3 +++
 configure   | 64 +
 2 files changed, 58 insertions(+), 9 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index c5c474169d..bf75dbc5e3 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -58,3 +58,6 @@
 [submodule "roms/opensbi"]
path = roms/opensbi
url =   https://git.qemu.org/git/opensbi.git
+[submodule "netmap"]
+   path = netmap
+   url = https://github.com/luigirizzo/netmap.git
diff --git a/configure b/configure
index 8f8446f52b..cb2c6c70d6 100755
--- a/configure
+++ b/configure
@@ -1132,6 +1132,10 @@ for opt do
   ;;
   --enable-netmap) netmap="yes"
   ;;
+  --enable-netmap=git) netmap="git"
+  ;;
+  --enable-netmap=system) netmap="system"
+  ;;
   --disable-xen) xen="no"
   ;;
   --enable-xen) xen="yes"
@@ -3318,8 +3322,9 @@ fi
 # a minor/major version number. Minor new features will be marked with values 
up
 # to 15, and if something happens that requires a change to the backend we will
 # move above 15, submit the backend fixes and modify this two bounds.
-if test "$netmap" != "no" ; then
-  cat > $TMPC << EOF
+case "$netmap" in
+"" | yes | system)
+  cat > $TMPC << EOF
 #include 
 #include 
 #include 
@@ -3330,14 +3335,55 @@ if test "$netmap" != "no" ; then
 int main(void) { return 0; }
 EOF
   if compile_prog "" "" ; then
-netmap=yes
+netmap_system=yes
   else
-if test "$netmap" = "yes" ; then
-  feature_not_found "netmap"
-fi
-netmap=no
+netmap_system=no
   fi
-fi
+  ;;
+esac
+
+case "$netmap" in
+  "" | yes)
+if test "$netmap_system" = "yes"; then
+  netmap=system
+elif test -e "${source_path}/.git" && test $git_update = 'yes' ; then
+  netmap=git
+elif test -e "${source_path}/netmap/configure" ; then
+  netmap=internal
+elif test -z "$netmap" ; then
+  netmap=no
+else
+  feature_not_found "netmap" "Install netmap or git submodule"
+fi
+;;
+
+  system)
+if test "$netmap_system" = "no"; then
+  feature_not_found "netmap" "Install netmap"
+fi
+;;
+esac
+
+case "$netmap" in
+  git | internal)
+if test "$netmap" = git; then
+  git_submodules="${git_submodules} netmap"
+fi
+mkdir -p netmap
+QEMU_CFLAGS="$QEMU_CFLAGS -I\$(SRC_PATH)/netmap/sys"
+;;
+
+  system)
+;;
+
+  no)
+;;
+  *)
+error_exit "Unknown state for netmap: $netmap"
+;;
+esac
+
+##
 
 ##
 # libcap-ng library probe
@@ -6585,7 +6631,7 @@ if test "$vde" = "yes" ; then
   echo "CONFIG_VDE=y" >> $config_host_mak
   echo "VDE_LIBS=$vde_libs" >> $config_host_mak
 fi
-if test "$netmap" = "yes" ; then
+if test "$netmap" != "no" ; then
   echo "CONFIG_NETMAP=y" >> $config_host_mak
 fi
 if test "$l2tpv3" = "yes" ; then
-- 
2.21.0




Re: [PATCH 3/4] block/mirror: support unaligned write in active mirror

2019-10-04 Thread Max Reitz
On 04.10.19 15:22, Vladimir Sementsov-Ogievskiy wrote:
> 04.10.2019 15:59, Max Reitz wrote:
>> On 03.10.19 11:34, Vladimir Sementsov-Ogievskiy wrote:
>>> 02.10.2019 18:52, Max Reitz wrote:
 On 02.10.19 17:06, Vladimir Sementsov-Ogievskiy wrote:
> 02.10.2019 18:03, Vladimir Sementsov-Ogievskiy wrote:
>> 02.10.2019 17:57, Max Reitz wrote:
>>> On 12.09.19 17:13, Vladimir Sementsov-Ogievskiy wrote:
 Prior 9adc1cb49af8d do_sync_target_write had a bug: it reset aligned-up
 region in the dirty bitmap, which means that we may not copy some bytes
 and assume them copied, which actually leads to producing corrupted
 target.

 So 9adc1cb49af8d forced dirty bitmap granularity to be
 request_alignment for mirror-top filter, so we are not working with
 unaligned requests. However forcing large alignment obviously decreases
 performance of unaligned requests.

 This commit provides another solution for the problem: if unaligned
 padding is already dirty, we can safely ignore it, as
 1. It's dirty, it will be copied by mirror_iteration anyway
 2. It's dirty, so skipping it now we don't increase dirtiness of the
       bitmap and therefore don't damage "synchronicity" of the
       write-blocking mirror.
>>>
>>> But that’s not what active mirror is for.  The point of active mirror is
>>> that it must converge because every guest write will contribute towards
>>> that goal.
>>>
>>> If you skip active mirroring for unaligned guest writes, they will not
>>> contribute towards converging, but in fact lead to the opposite.
>>>
>>
>> The will not contribute only if region is already dirty. Actually, after
>> first iteration of mirroring (copying the whole disk), all following 
>> writes
>> will contribute, so the whole process must converge. It is a bit similar 
>> with
>> running one mirror loop in normal mode, and then enable write-blocking.
>>
>
>
> In other words, we don't need "all guest writes contribute" to converge,
> "all guest writes don't create new dirty bits" is enough, as we have 
> parallel
> mirror iteration which contiguously handles dirty bits.

 Hm, in a sense.  But it does mean that guest writes will not contribute
 to convergence.

 And that’s against the current definition of write-blocking, which does
 state that “when data is written to the source, write it (synchronously)
 to the target as well”.

>>>
>>> Hmm, understand. But IMHO our proposed behavior is better in general.
>>> Do you think it's a problem to change spec now?
>>> If yes, I'll resend with an option
>>
>> Well, the thing is that I’d find it weird if write-blocking wasn’t
>> blocking in all cases.  And in my opinion, it makes more sense for
>> active mirror if all writes actively contributed to convergence.
>>
> 
> Why? What is the benefit in it?
> What is "all writes actively contributed to convergence" for user?

One thing I wonder about is whether it’s really guaranteed that the
background job will run under full I/O load, and how often it runs.

I fear that with your model, the background job might starve and the
mirror may take a very long time.  It won’t diverge, but it also won’t
really converge.

The advantage of letting all writes block is that even under full I/O
load, the mirror job will progress at a steady pace.

> I think for user there may be the following criteria:
> 
> 1. guaranteed converge, with any guest write load.
> Both current and my proposed variants are OK.
> 
> 2. Less impact on guest.
> Obviously my proposed variant is better
> 
> 3. Total time of mirroring
> Seems, current may be a bit better, but I don't think that unaligned
> tails gives significant impact.
> 
> ===
> 
> So, assume I want [1]+[2]. And possibly
> 2.2: Even less impact on guest: ignore not only unaligned tails if they are
> already dirty, but full synchronous mirror operation if area is already dirty.
> 
> How should I call this? Should it be separate mode, or option for 
> write-blocking?

I don’t know whether it makes sense to add a separate mode or a separate
option just for this difference.  I don’t think anyone would choose the
non-default option.

But I do think there’s quite a bit of difference in how the job behaves
still...  I don’t know. :-/

Max



signature.asc
Description: OpenPGP digital signature


Re: [PATCH v5 1/5] iotests: remove 'linux' from default supported platforms

2019-10-04 Thread Max Reitz
On 04.10.19 16:05, Peter Maydell wrote:
> On Fri, 4 Oct 2019 at 14:50, Max Reitz  wrote:
>> On 04.10.19 15:16, Peter Maydell wrote:
>>> 'make check' does have the restriction
>>> that we don't want the tests to take too long to run, but in
>>> general the block layer should be running some reasonable subset
>>> of tests in the project's standard "run the tests please" command.
>>> (I have no opinion on exactly what that subset ought to be, beyond
>>> that it would be good if that subset doesn't have known intermittent
>>> failures in it.)
>>
>> Deciding the subset is difficult.  My stance was that it’s better to not
>> choose an arbitrary subset here but ensure that really everything gets
>> run as part of CI, that is asynchronously so it doesn’t block anyone and
>> can take as long as it wants.
> 
> I wonder if we have a terminology difference confusion here.
> To me "CI" means "we have tests which we automatically run
> before pushing commits to master, and if they fail then we
> don't push". So (a) they have to run synchronously and (b) there
> is a need for them to run in a reasonable period of time because
> otherwise it takes too long to run the tests before pushing
> to master and we get a backlog of unprocessed pullrequests
> and annoying delays.

Hm, yes.  In that case, there’s of course no point in moving it.

I meant moving the tests to somewhere where they run asynchronously.

>> If we decide to get pull requests based on that, then that’d bring even
>> more pain, but at least it’d be honest.  But just running half of the
>> qcow2 tests to me seems only like we want to pretend we ran the iotests.
>>  So for me, iotests still break, and we need to deal with make check
>> failures on top.  I’d at least prefer one or the other.
> 
> I think the ideal we're aiming for here is:
>  (1) people doing active work in the block subsystem are probably
> often going to want to run all the iotests, and certainly the
> subsystem maintainers will want to do that as they put together
> pull requests.
>  (2) but people who *don't* do active work in the block subsystem
> still sometimes touch it in passing as part of things like global
> refactorings or other changes that touch big parts of the tree,
> or accidentally break it with a change to some other bit of QEMU
> entirely. These people won't run the full iotests, but it is
> reasonable to expect them to run 'make check', and it would be good
> if that caught "whoops you broke the block subsystem".
> 
> Similarly, "make check" has incredibly spotty coverage of
> various arm boards and devices, but it does do some basic
> checks which do catch accidental breakage.

Yes, it has its use, but at the same time it means that intermittent
failure can appear in make check because some iotest is broken.  (The
past has shown that it’s hard to predict which tests will at some point
start to exhibit such behavior.)

This then requires immediate attention, and I simply want help with that
from someone who really cares about having the test be part of make
check, as I wrote in my reply to Thomas’s patch “iotests: Do not run the
iotests during "make check" anymore”.

The issues are just half as pressing when it isn’t make check that’s
intermittently failing.  (I hope nobody takes this as “He doesn’t want
to fix intermittent failures”, because I do really try to do that.  So I
don’t consider this a good kind of pressure.)


I suppose my main problem is that I’m just lucky that I can’t remember a
case where the block subsystem was really broken and iotests in make
check would have caught it; and that I’m naïve enough to expect this to
continue into the future.

Max



signature.asc
Description: OpenPGP digital signature


Re: [PATCH v15 0/5] backup-top filter driver for backup

2019-10-04 Thread Vladimir Sementsov-Ogievskiy
04.10.2019 17:21, Max Reitz wrote:
> On 01.10.19 15:14, Vladimir Sementsov-Ogievskiy wrote:
>> Hi all!
>>
>> These series introduce backup-top driver. It's a filter-node, which
>> do copy-before-write operation. Mirror uses filter-node for handling
>> guest writes, let's move to filter-node (from write-notifiers) for
>> backup too.
>>
>> Preparation patches are queued in Max's block branch:
>>
>> Based-on: https://github.com/XanClic/qemu.git block
>>
>> v15: use BdrvChildren in block-copy
>> 01-03: new
>> 04-05: a lot of changes, such as
>> 04:
>>  - add new parameters for creation
>>  - prepare bcs creation
>>  - add target child
>>  - refactor bdrv_backup_top_append
>>  - drop r-b
>> 05:
>>  - move block-copy to use BdrvChildren
>>  - drop extra style fixing hunks
>>  - iotest 141 output changed
>>
>> v14: Drop range locks, keep old good in-flight requests waiting
>>
>> 12: new patch
>> 14: use old request-waiting scheme instead of range locks
>>
>> Vladimir Sementsov-Ogievskiy (5):
>>block/backup: move in-flight requests handling from backup to
>>  block-copy
>>block/backup: move write_flags calculation inside backup_job_create
>>block/block-copy: split block_copy_set_callbacks function
>>block: introduce backup-top filter driver
>>block/backup: use backup-top instead of write notifiers
>>
>>   qapi/block-core.json   |   8 +-
>>   block/backup-top.h |  41 +
>>   include/block/block-copy.h |  31 +++-
>>   include/block/block_int.h  |   1 +
>>   block/backup-top.c | 276 +
>>   block/backup.c | 147 +-
>>   block/block-copy.c | 140 +
>>   block/replication.c|   2 +-
>>   blockdev.c |   1 +
>>   block/Makefile.objs|   1 +
>>   tests/qemu-iotests/056 |   8 +-
>>   tests/qemu-iotests/141.out |   2 +-
>>   tests/qemu-iotests/257 |   7 +-
>>   tests/qemu-iotests/257.out | 306 ++---
>>   14 files changed, 628 insertions(+), 343 deletions(-)
>>   create mode 100644 block/backup-top.h
>>   create mode 100644 block/backup-top.c
> 
> Thanks, applied to my block branch:
> 
> https://git.xanclic.moe/XanClic/qemu/commits/branch/block
> 
> Max
> 

Yay!! Thank you!

-- 
Best regards,
Vladimir


Re: [Qemu-devel] [PATCH v2 2/3] tests/migration: fix a typo in comment

2019-10-04 Thread Philippe Mathieu-Daudé

On 9/11/19 5:31 AM, Mao Zhongyi wrote:

Cc: arm...@redhat.com
Cc: laur...@vivier.eu
Cc: tony.ngu...@bt.com

Signed-off-by: Mao Zhongyi 
Reviewed-by: Alex Bennée 
---
  tests/migration/stress.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/migration/stress.c b/tests/migration/stress.c
index 6cbb2d49d3..19a6eff5fd 100644
--- a/tests/migration/stress.c
+++ b/tests/migration/stress.c
@@ -191,7 +191,7 @@ static int stressone(unsigned long long ramsizeMB)
  
  /* We don't care about initial state, but we do want

   * to fault it all into RAM, otherwise the first iter
- * of the loop below will be quite slow. We cna't use
+ * of the loop below will be quite slow. We can't use
   * 0x0 as the byte as gcc optimizes that away into a
   * calloc instead :-) */
  memset(ram, 0xfe, ramsizeMB * 1024 * 1024);



Reviewed-by: Philippe Mathieu-Daudé 



Re: [PATCH v15 0/5] backup-top filter driver for backup

2019-10-04 Thread Max Reitz
On 01.10.19 15:14, Vladimir Sementsov-Ogievskiy wrote:
> Hi all!
> 
> These series introduce backup-top driver. It's a filter-node, which
> do copy-before-write operation. Mirror uses filter-node for handling
> guest writes, let's move to filter-node (from write-notifiers) for
> backup too.
> 
> Preparation patches are queued in Max's block branch:
> 
> Based-on: https://github.com/XanClic/qemu.git block
> 
> v15: use BdrvChildren in block-copy
> 01-03: new
> 04-05: a lot of changes, such as
> 04:
> - add new parameters for creation
> - prepare bcs creation
> - add target child
> - refactor bdrv_backup_top_append
> - drop r-b
> 05:
> - move block-copy to use BdrvChildren
> - drop extra style fixing hunks
> - iotest 141 output changed
> 
> v14: Drop range locks, keep old good in-flight requests waiting
> 
> 12: new patch
> 14: use old request-waiting scheme instead of range locks
> 
> Vladimir Sementsov-Ogievskiy (5):
>   block/backup: move in-flight requests handling from backup to
> block-copy
>   block/backup: move write_flags calculation inside backup_job_create
>   block/block-copy: split block_copy_set_callbacks function
>   block: introduce backup-top filter driver
>   block/backup: use backup-top instead of write notifiers
> 
>  qapi/block-core.json   |   8 +-
>  block/backup-top.h |  41 +
>  include/block/block-copy.h |  31 +++-
>  include/block/block_int.h  |   1 +
>  block/backup-top.c | 276 +
>  block/backup.c | 147 +-
>  block/block-copy.c | 140 +
>  block/replication.c|   2 +-
>  blockdev.c |   1 +
>  block/Makefile.objs|   1 +
>  tests/qemu-iotests/056 |   8 +-
>  tests/qemu-iotests/141.out |   2 +-
>  tests/qemu-iotests/257 |   7 +-
>  tests/qemu-iotests/257.out | 306 ++---
>  14 files changed, 628 insertions(+), 343 deletions(-)
>  create mode 100644 block/backup-top.h
>  create mode 100644 block/backup-top.c

Thanks, applied to my block branch:

https://git.xanclic.moe/XanClic/qemu/commits/branch/block

Max



signature.asc
Description: OpenPGP digital signature


Re: [PATCH v15 5/5] block/backup: use backup-top instead of write notifiers

2019-10-04 Thread Max Reitz
On 01.10.19 15:14, Vladimir Sementsov-Ogievskiy wrote:
> Drop write notifiers and use filter node instead.
> 
> = Changes =
> 
> 1. Add filter-node-name argument for backup qmp api. We have to do it
> in this commit, as 257 needs to be fixed.
> 
> 2. There are no more write notifiers here, so is_write_notifier
> parameter is dropped from block-copy paths.
> 
> 3. To sync with in-flight requests at job finish we now have drained
> removing of the filter, we don't need rw-lock.
> 
> 4. Block-copy is now using BdrvChildren instead of BlockBackends
> 
> 5. As backup-top owns these children, we also move block-copy state
> into backup-top's ownership.
> 
> = Iotest changes =
> 
> 56: op-blocker doesn't shoot now, as we set it on source, but then
> check on filter, when trying to start second backup.
> To keep the test we instead can catch another collision: both jobs will
> get 'drive0' job-id, as job-id parameter is unspecified. To prevent
> interleaving with file-posix locks (as they are dependent on config)
> let's use another target for second backup.
> 
> Also, it's obvious now that we'd like to drop this op-blocker at all
> and add a test-case for two backups from one node (to different
> destinations) actually works. But not in these series.
> 
> 141: Output changed: prepatch, "Node is in use" comes from bdrv_has_blk
> check inside qmp_blockdev_del. But we've dropped block-copy blk
> objects, so no more blk objects on source bs (job blk is on backup-top
> filter bs). New message is from op-blocker, which is the next check in
> qmp_blockdev_add.
> 
> 257: The test wants to emulate guest write during backup. They should
> go to filter node, not to original source node, of course. Therefore we
> need to specify filter node name and use it.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  qapi/block-core.json   |   8 +-
>  include/block/block-copy.h |  14 +-
>  include/block/block_int.h  |   1 +
>  block/backup-top.c |  21 +--
>  block/backup.c |  73 +++--
>  block/block-copy.c |  81 +++---
>  block/replication.c|   2 +-
>  blockdev.c |   1 +
>  tests/qemu-iotests/056 |   8 +-
>  tests/qemu-iotests/141.out |   2 +-
>  tests/qemu-iotests/257 |   7 +-
>  tests/qemu-iotests/257.out | 306 ++---
>  12 files changed, 237 insertions(+), 287 deletions(-)

[...]

> diff --git a/block/block-copy.c b/block/block-copy.c
> index fcb112da14..5404bc921d 100644
> --- a/block/block-copy.c
> +++ b/block/block-copy.c

[...]

> @@ -218,8 +183,8 @@ static int coroutine_fn 
> block_copy_with_offload(BlockCopyState *s,
>  nr_clusters = DIV_ROUND_UP(nbytes, s->cluster_size);
>  bdrv_reset_dirty_bitmap(s->copy_bitmap, start,
>  s->cluster_size * nr_clusters);
> -ret = blk_co_copy_range(s->source, start, s->target, start, nbytes,
> -read_flags, s->write_flags);
> +ret = bdrv_co_copy_range(s->source, start, s->target, start, nbytes,
> +0, s->write_flags);

The indentation’s off here.  I’ll fix it.

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH v5 1/5] iotests: remove 'linux' from default supported platforms

2019-10-04 Thread Peter Maydell
On Fri, 4 Oct 2019 at 14:50, Max Reitz  wrote:
> On 04.10.19 15:16, Peter Maydell wrote:
> > 'make check' does have the restriction
> > that we don't want the tests to take too long to run, but in
> > general the block layer should be running some reasonable subset
> > of tests in the project's standard "run the tests please" command.
> > (I have no opinion on exactly what that subset ought to be, beyond
> > that it would be good if that subset doesn't have known intermittent
> > failures in it.)
>
> Deciding the subset is difficult.  My stance was that it’s better to not
> choose an arbitrary subset here but ensure that really everything gets
> run as part of CI, that is asynchronously so it doesn’t block anyone and
> can take as long as it wants.

I wonder if we have a terminology difference confusion here.
To me "CI" means "we have tests which we automatically run
before pushing commits to master, and if they fail then we
don't push". So (a) they have to run synchronously and (b) there
is a need for them to run in a reasonable period of time because
otherwise it takes too long to run the tests before pushing
to master and we get a backlog of unprocessed pullrequests
and annoying delays.

> If we decide to get pull requests based on that, then that’d bring even
> more pain, but at least it’d be honest.  But just running half of the
> qcow2 tests to me seems only like we want to pretend we ran the iotests.
>  So for me, iotests still break, and we need to deal with make check
> failures on top.  I’d at least prefer one or the other.

I think the ideal we're aiming for here is:
 (1) people doing active work in the block subsystem are probably
often going to want to run all the iotests, and certainly the
subsystem maintainers will want to do that as they put together
pull requests.
 (2) but people who *don't* do active work in the block subsystem
still sometimes touch it in passing as part of things like global
refactorings or other changes that touch big parts of the tree,
or accidentally break it with a change to some other bit of QEMU
entirely. These people won't run the full iotests, but it is
reasonable to expect them to run 'make check', and it would be good
if that caught "whoops you broke the block subsystem".

Similarly, "make check" has incredibly spotty coverage of
various arm boards and devices, but it does do some basic
checks which do catch accidental breakage.

thanks
-- PMM



Re: [PATCH v6 00/10] Introduce the microvm machine type

2019-10-04 Thread Michael S. Tsirkin
On Fri, Oct 04, 2019 at 11:37:42AM +0200, Sergio Lopez wrote:
> Microvm is a machine type inspired by Firecracker and constructed
> after the its machine model.
> 
> It's a minimalist machine type without PCI nor ACPI support, designed
> for short-lived guests. Microvm also establishes a baseline for
> benchmarking and optimizing both QEMU and guest operating systems,
> since it is optimized for both boot time and footprint.
> 
> ---
> 
> Changelog
> v6:
>  - Some style fixes (Philippe Mathieu-Daudé)
>  - Fix a documentation bug stating that LAPIC was in userspace (Paolo
>Bonzini)
>  - Update Xen HVM code after X86MachineState introduction (Philippe
>Mathieu-Daudé)
>  - Rename header guard from QEMU_VIRTIO_MMIO_H to HW_VIRTIO_MMIO_H
>(Philippe Mathieu-Daudé)
> 
> v5:
>  - Drop unneeded "[PATCH v4 2/8] hw/i386: Factorize e820 related
>functions" (Philippe Mathieu-Daudé)
>  - Drop unneeded "[PATCH v4 1/8] hw/i386: Factorize PVH related
>functions" (Stefano Garzarella)
>  - Split X86MachineState introduction into smaller patches (Philippe
>Mathieu-Daudé)
>  - Change option-roms to x-option-roms and kernel-cmdline to
>auto-kernel-cmdline (Paolo Bonzini)
>  - Make i8259 PIT and i8254 PIC optional (Paolo Bonzini)
>  - Some fixes to the documentation (Paolo Bonzini)
>  - Switch documentation format from txt to rst (Peter Maydell)
>  - Move NMI interface to X86_MACHINE (Philippe Mathieu-Daudé, Paolo
>Bonzini)
> 
> v4:
>  - This is a complete rewrite of the whole patchset, with a focus on
>reusing as much existing code as possible to ease the maintenance burden
>and making the machine type as compatible as possible by default. As
>a result, the number of lines dedicated specifically to microvm is
>383 (code lines measured by "cloc") and, with the default
>configuration, it's now able to boot both PVH ELF images and
>bzImages with either SeaBIOS or qboot.
> 
> v3:
>   - Add initrd support (thanks Stefano).
> 
> v2:
>   - Drop "[PATCH 1/4] hw/i386: Factorize CPU routine".
>   - Simplify machine definition (thanks Eduardo).
>   - Remove use of unneeded NUMA-related callbacks (thanks Eduardo).
>   - Add a patch to factorize PVH-related functions.
>   - Replace use of Linux's Zero Page with PVH (thanks Maran and Paolo).
> 
> ---
> Sergio Lopez (10):
>   hw/virtio: Factorize virtio-mmio headers
>   hw/i386/pc: rename functions shared with non-PC machines
>   hw/i386/pc: move shared x86 functions to x86.c and export them
>   hw/i386: split PCMachineState deriving X86MachineState from it
>   hw/i386: make x86.c independent from PCMachineState
>   fw_cfg: add "modify" functions for all types
>   hw/intc/apic: reject pic ints if isa_pic == NULL
>   roms: add microvm-bios (qboot) as binary and git submodule
>   docs/microvm.rst: document the new microvm machine type
>   hw/i386: Introduce the microvm machine type
> 
>  docs/microvm.rst |  98 
>  default-configs/i386-softmmu.mak |   1 +
>  include/hw/i386/microvm.h|  83 
>  include/hw/i386/pc.h |  28 +-
>  include/hw/i386/x86.h|  94 
>  include/hw/nvram/fw_cfg.h|  42 ++
>  include/hw/virtio/virtio-mmio.h  |  73 +++
>  hw/acpi/cpu_hotplug.c|  10 +-
>  hw/i386/acpi-build.c |  29 +-
>  hw/i386/amd_iommu.c  |   3 +-
>  hw/i386/intel_iommu.c|   3 +-
>  hw/i386/microvm.c| 574 ++
>  hw/i386/pc.c | 780 +++---
>  hw/i386/pc_piix.c|  46 +-
>  hw/i386/pc_q35.c |  38 +-
>  hw/i386/pc_sysfw.c   |  58 +--
>  hw/i386/x86.c| 790 +++
>  hw/i386/xen/xen-hvm.c|  23 +-
>  hw/intc/apic.c   |   2 +-
>  hw/intc/ioapic.c |   2 +-
>  hw/nvram/fw_cfg.c|  29 ++
>  hw/virtio/virtio-mmio.c  |  48 +-
>  .gitmodules  |   3 +
>  hw/i386/Kconfig  |   4 +
>  hw/i386/Makefile.objs|   2 +
>  pc-bios/bios-microvm.bin | Bin 0 -> 65536 bytes
>  roms/Makefile|   6 +
>  roms/qboot   |   1 +

So I guess we want to add new files for x86 machines MAINTAINERS entry.

Otherwise looks good.
Which tree is this going to be merged through? Mine?


>  28 files changed, 1963 insertions(+), 907 deletions(-)
>  create mode 100644 docs/microvm.rst
>  create mode 100644 include/hw/i386/microvm.h
>  create mode 100644 include/hw/i386/x86.h
>  create mode 100644 include/hw/virtio/virtio-mmio.h
>  create mode 100644 hw/i386/microvm.c
>  create mode 100644 hw/i386/x86.c
>  create mode 100755 pc-bios/bios-microvm.bin
>  create mode 16 roms/qboot
> 
> -- 
> 2.21.0



Re: [PATCH v5 1/5] iotests: remove 'linux' from default supported platforms

2019-10-04 Thread Max Reitz
On 04.10.19 15:16, Peter Maydell wrote:
> On Fri, 4 Oct 2019 at 13:45, Max Reitz  wrote:
>>> In the end, I don't care what code these patches touched. I do an
>>> innocent git pull, and when I finally see that it's master that breaks
>>> iotests and not my patches on top of it, I'm annoyed.
>>
>> Hm.  Part of my point was that this still happens all the time.
>>
>> Which is why I’d prefer more tests to run in CI than a handful of not
>> very useful ones in make check.
>>
>> (Of course, running it in a CI won’t prevent iotest failures on your
>> machine, but in practice neither does the current model.)
> 
> If you want the tree to be defended against problems, then
> you need to run tests in 'make check'. Nothing else gets consistently
> run and has failures spotted either before code goes into the
> tree or quickly afterwards.

Yes, but part of the problem is that what does get run as part of make
check currently does not help much.

So this doesn’t really defend the iotests from problems.

> 'make check' does have the restriction
> that we don't want the tests to take too long to run, but in
> general the block layer should be running some reasonable subset
> of tests in the project's standard "run the tests please" command.
> (I have no opinion on exactly what that subset ought to be, beyond
> that it would be good if that subset doesn't have known intermittent
> failures in it.)

Deciding the subset is difficult.  My stance was that it’s better to not
choose an arbitrary subset here but ensure that really everything gets
run as part of CI, that is asynchronously so it doesn’t block anyone and
can take as long as it wants.

If we decide to get pull requests based on that, then that’d bring even
more pain, but at least it’d be honest.  But just running half of the
qcow2 tests to me seems only like we want to pretend we ran the iotests.
 So for me, iotests still break, and we need to deal with make check
failures on top.  I’d at least prefer one or the other.


I voted for dropping make check, because running all iotests doesn’t
seem feasible to me.

>> I still think that I personally would prefer the iotests to not run as
>> part of make check, but just as CI.
> 
> 'make check' *is* our CI gate, to a first approximation. Most of
> the various travis and other setups are simply a front-end for
> "build QEMU in various configurations and on various hosts
> and then run 'make check'". The travis setup at the moment is
> a bit odd, because it runs tests but it's not a gate on our
> merging changes. Ideally I would like to fix this so that
> rather than me personally running "make check" via a bunch
> of scripts we have one CI setup that we trust (gitlab seems
> the current favoured contender) and that gates changes flowing
> into master rather than me doing it manually. We might then turn
> travis off if it's not providing anything for us that the gitlab
> setup doesn't.

Hm, but make check also serves other purposes.  I would suppose e.g.
distributions generally require make check to pass when building packages.

I assumed our CI included more things than just make check, so we could
move the iotests from out of make check but keep them as part of CI.

Max



signature.asc
Description: OpenPGP digital signature


[PATCH v2] target/ppc: Fix for optimized vsl/vsr instructions

2019-10-04 Thread Stefan Brankovic
This patch fixes bug in optimized vsl/vsr instructions reported by Mark
Cave-Ayland and Paul Clarke. Sorry for not responding earlier, I was absent
last couple of days. I also integrated some suggestions made by Aleksandar
Markovic. New soultion is tested and still has noticable performance
improvement compared to old helper implementation.

V1 of this patch was not sent to qemu-devel and I am now sending V2 to
appropriate email adresses.

Stefan Brankovic (1):
  target/ppc: Fix for optimized vsl/vsr instructions

 target/ppc/translate/vmx-impl.inc.c | 84 ++---
 1 file changed, 40 insertions(+), 44 deletions(-)

-- 
2.7.4




[PATCH v2] target/ppc: Fix for optimized vsl/vsr instructions

2019-10-04 Thread Stefan Brankovic
In previous implementation, invocation of TCG shift function could request
shift of TCG variable by 64 bits when variable 'sh' is 0, which is not
supported in TCG (values can be shifted by 0 to 63 bits). This patch fixes
this by using two separate invocation of TCG shift functions, with maximum
shift amount of 32.

Name of variable 'shifted' is changed to 'carry' so variable naming
is similar to old helper implementation.

Variables 'avrA' and 'avrB' are replaced with variable 'avr'.

Fixes: 4e6d0920e7547e6af4bbac5ffe9adfe6ea621822
Reported-by: Paul Clark 
Reported-by: Mark Cave-Ayland 
Suggested-by: Aleksandar Markovic 
Signed-off-by: Stefan Brankovic 
---
 target/ppc/translate/vmx-impl.inc.c | 84 ++---
 1 file changed, 40 insertions(+), 44 deletions(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 2472a52..81d5a7a 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -590,40 +590,38 @@ static void trans_vsl(DisasContext *ctx)
 int VT = rD(ctx->opcode);
 int VA = rA(ctx->opcode);
 int VB = rB(ctx->opcode);
-TCGv_i64 avrA = tcg_temp_new_i64();
-TCGv_i64 avrB = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
 TCGv_i64 sh = tcg_temp_new_i64();
-TCGv_i64 shifted = tcg_temp_new_i64();
+TCGv_i64 carry = tcg_temp_new_i64();
 TCGv_i64 tmp = tcg_temp_new_i64();
 
-/* Place bits 125-127 of vB in sh. */
-get_avr64(avrB, VB, false);
-tcg_gen_andi_i64(sh, avrB, 0x07ULL);
+/* Place bits 125-127 of vB in 'sh'. */
+get_avr64(avr, VB, false);
+tcg_gen_andi_i64(sh, avr, 0x07ULL);
 
 /*
- * Save highest sh bits of lower doubleword element of vA in variable
- * shifted and perform shift on lower doubleword.
+ * Save highest 'sh' bits of lower doubleword element of vA in variable
+ * 'carry' and perform shift on lower doubleword.
  */
-get_avr64(avrA, VA, false);
-tcg_gen_subfi_i64(tmp, 64, sh);
-tcg_gen_shr_i64(shifted, avrA, tmp);
-tcg_gen_andi_i64(shifted, shifted, 0x7fULL);
-tcg_gen_shl_i64(avrA, avrA, sh);
-set_avr64(VT, avrA, false);
+get_avr64(avr, VA, false);
+tcg_gen_subfi_i64(tmp, 32, sh);
+tcg_gen_shri_i64(carry, avr, 32);
+tcg_gen_shr_i64(carry, carry, tmp);
+tcg_gen_shl_i64(avr, avr, sh);
+set_avr64(VT, avr, false);
 
 /*
  * Perform shift on higher doubleword element of vA and replace lowest
- * sh bits with shifted.
+ * 'sh' bits with 'carry'.
  */
-get_avr64(avrA, VA, true);
-tcg_gen_shl_i64(avrA, avrA, sh);
-tcg_gen_or_i64(avrA, avrA, shifted);
-set_avr64(VT, avrA, true);
+get_avr64(avr, VA, true);
+tcg_gen_shl_i64(avr, avr, sh);
+tcg_gen_or_i64(avr, avr, carry);
+set_avr64(VT, avr, true);
 
-tcg_temp_free_i64(avrA);
-tcg_temp_free_i64(avrB);
+tcg_temp_free_i64(avr);
 tcg_temp_free_i64(sh);
-tcg_temp_free_i64(shifted);
+tcg_temp_free_i64(carry);
 tcg_temp_free_i64(tmp);
 }
 
@@ -639,39 +637,37 @@ static void trans_vsr(DisasContext *ctx)
 int VT = rD(ctx->opcode);
 int VA = rA(ctx->opcode);
 int VB = rB(ctx->opcode);
-TCGv_i64 avrA = tcg_temp_new_i64();
-TCGv_i64 avrB = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
 TCGv_i64 sh = tcg_temp_new_i64();
-TCGv_i64 shifted = tcg_temp_new_i64();
+TCGv_i64 carry = tcg_temp_new_i64();
 TCGv_i64 tmp = tcg_temp_new_i64();
 
-/* Place bits 125-127 of vB in sh. */
-get_avr64(avrB, VB, false);
-tcg_gen_andi_i64(sh, avrB, 0x07ULL);
+/* Place bits 125-127 of vB in 'sh'. */
+get_avr64(avr, VB, false);
+tcg_gen_andi_i64(sh, avr, 0x07ULL);
 
 /*
- * Save lowest sh bits of higher doubleword element of vA in variable
- * shifted and perform shift on higher doubleword.
+ * Save lowest 'sh' bits of higher doubleword element of vA in variable
+ * 'carry' and perform shift on higher doubleword.
  */
-get_avr64(avrA, VA, true);
-tcg_gen_subfi_i64(tmp, 64, sh);
-tcg_gen_shl_i64(shifted, avrA, tmp);
-tcg_gen_andi_i64(shifted, shifted, 0xfe00ULL);
-tcg_gen_shr_i64(avrA, avrA, sh);
-set_avr64(VT, avrA, true);
+get_avr64(avr, VA, true);
+tcg_gen_subfi_i64(tmp, 32, sh);
+tcg_gen_shli_i64(carry, avr, 32);
+tcg_gen_shl_i64(carry, carry, tmp);
+tcg_gen_shr_i64(avr, avr, sh);
+set_avr64(VT, avr, true);
 /*
  * Perform shift on lower doubleword element of vA and replace highest
- * sh bits with shifted.
+ * 'sh' bits with 'carry'.
  */
-get_avr64(avrA, VA, false);
-tcg_gen_shr_i64(avrA, avrA, sh);
-tcg_gen_or_i64(avrA, avrA, shifted);
-set_avr64(VT, avrA, false);
+get_avr64(avr, VA, false);
+tcg_gen_shr_i64(avr, avr, sh);
+tcg_gen_or_i64(avr, avr, carry);
+set_avr64(VT, avr, false);
 
-tcg_temp_free_i64(avrA);
-tcg_temp_free_i64(avrB);
+

Re: [PATCH v15 4/5] block: introduce backup-top filter driver

2019-10-04 Thread Max Reitz
On 01.10.19 15:14, Vladimir Sementsov-Ogievskiy wrote:
> Backup-top filter caches write operations and does copy-before-write
> operations.
> 
> The driver will be used in backup instead of write-notifiers.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/backup-top.h  |  41 +++
>  block/backup-top.c  | 281 
>  block/Makefile.objs |   1 +
>  3 files changed, 323 insertions(+)
>  create mode 100644 block/backup-top.h
>  create mode 100644 block/backup-top.c

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH v1 0/5] s390x/mmu: Implement more facilities

2019-10-04 Thread David Hildenbrand
On 26.09.19 12:16, David Hildenbrand wrote:
> This is the follow up of:
> [PATCH-for-4.2 v1 0/9] s390x: MMU changes and extensions
> Without the general MMU rework. It's based on:
> [PATCH v2 0/7] s390x/mmu: DAT translation rewrite
> 
> This series adds adds EDAT2 MMU support, and implements/indicates related
> facilities (ESOP-1, ESOP-2, IEP, ...) for TCG. The QEMU CPU model is
> updated.
> 
> IEP under QEMU TCG seems to work just fine, when eabling it via the "max"
> CPU model - via kvm unit tests:
> t460s: ~/git/kvm-unit-tests master $ ./s390x-run s390x/iep.elf -cpu max
> [...]
> PASS: iep: iep protection: Program interrupt: expected(4) == received(4)
> SUMMARY: 1 tests
> 
> EXIT: STATUS=1
> 
> Changes since "[PATCH-for-4.2 v1 0/9] s390x: MMU changes and extensions":
> - "s390x/mmu: Add EDAT2 translation support"
> -- Fix vaddr offset within 2GB page
> - "s390x/mmu: Implement ESOP-2 and ..."
> -- Squashed two patches as requested.
> -- Also set ilen to "2" in case of MMU_INST_FETCH on mmu_translate_real
> - "s390x/mmu: Implement Instruction-Execution-Protection Facility"
> -- Make sure s390_cpu_get_phys_page_debug() doesn't choke on IEP
> - "s390x/cpumodel: Add new TCG features to QEMU cpu model"
> -- Add comment "features introduced after the z13"
> 
> Cc: Ilya Leoshkevich 
> 
> David Hildenbrand (5):
>   s390x/mmu: Add EDAT2 translation support
>   s390x/mmu: Implement ESOP-2 and
> access-exception-fetch/store-indication facility
>   s390x/mmu: Implement Instruction-Execution-Protection Facility
>   s390x/cpumodel: Prepare for changes of QEMU model
>   s390x/cpumodel: Add new TCG features to QEMU cpu model
> 
>  hw/s390x/s390-virtio-ccw.c  |  2 ++
>  target/s390x/cpu.h  |  1 +
>  target/s390x/gen-features.c | 11 +-
>  target/s390x/helper.c   |  6 +-
>  target/s390x/mmu_helper.c   | 42 +++--
>  5 files changed, 58 insertions(+), 4 deletions(-)
> 

@Christian (@Conny) if I can get an ACK on the last patch, I can send
this directly upstream.

-- 

Thanks,

David / dhildenb



Re: [PATCH 3/4] block/mirror: support unaligned write in active mirror

2019-10-04 Thread Vladimir Sementsov-Ogievskiy
04.10.2019 15:59, Max Reitz wrote:
> On 03.10.19 11:34, Vladimir Sementsov-Ogievskiy wrote:
>> 02.10.2019 18:52, Max Reitz wrote:
>>> On 02.10.19 17:06, Vladimir Sementsov-Ogievskiy wrote:
 02.10.2019 18:03, Vladimir Sementsov-Ogievskiy wrote:
> 02.10.2019 17:57, Max Reitz wrote:
>> On 12.09.19 17:13, Vladimir Sementsov-Ogievskiy wrote:
>>> Prior 9adc1cb49af8d do_sync_target_write had a bug: it reset aligned-up
>>> region in the dirty bitmap, which means that we may not copy some bytes
>>> and assume them copied, which actually leads to producing corrupted
>>> target.
>>>
>>> So 9adc1cb49af8d forced dirty bitmap granularity to be
>>> request_alignment for mirror-top filter, so we are not working with
>>> unaligned requests. However forcing large alignment obviously decreases
>>> performance of unaligned requests.
>>>
>>> This commit provides another solution for the problem: if unaligned
>>> padding is already dirty, we can safely ignore it, as
>>> 1. It's dirty, it will be copied by mirror_iteration anyway
>>> 2. It's dirty, so skipping it now we don't increase dirtiness of the
>>>       bitmap and therefore don't damage "synchronicity" of the
>>>       write-blocking mirror.
>>
>> But that’s not what active mirror is for.  The point of active mirror is
>> that it must converge because every guest write will contribute towards
>> that goal.
>>
>> If you skip active mirroring for unaligned guest writes, they will not
>> contribute towards converging, but in fact lead to the opposite.
>>
>
> The will not contribute only if region is already dirty. Actually, after
> first iteration of mirroring (copying the whole disk), all following 
> writes
> will contribute, so the whole process must converge. It is a bit similar 
> with
> running one mirror loop in normal mode, and then enable write-blocking.
>


 In other words, we don't need "all guest writes contribute" to converge,
 "all guest writes don't create new dirty bits" is enough, as we have 
 parallel
 mirror iteration which contiguously handles dirty bits.
>>>
>>> Hm, in a sense.  But it does mean that guest writes will not contribute
>>> to convergence.
>>>
>>> And that’s against the current definition of write-blocking, which does
>>> state that “when data is written to the source, write it (synchronously)
>>> to the target as well”.
>>>
>>
>> Hmm, understand. But IMHO our proposed behavior is better in general.
>> Do you think it's a problem to change spec now?
>> If yes, I'll resend with an option
> 
> Well, the thing is that I’d find it weird if write-blocking wasn’t
> blocking in all cases.  And in my opinion, it makes more sense for
> active mirror if all writes actively contributed to convergence.
> 

Why? What is the benefit in it?
What is "all writes actively contributed to convergence" for user?

I think for user there may be the following criteria:

1. guaranteed converge, with any guest write load.
Both current and my proposed variants are OK.

2. Less impact on guest.
Obviously my proposed variant is better

3. Total time of mirroring
Seems, current may be a bit better, but I don't think that unaligned
tails gives significant impact.

===

So, assume I want [1]+[2]. And possibly
2.2: Even less impact on guest: ignore not only unaligned tails if they are
already dirty, but full synchronous mirror operation if area is already dirty.

How should I call this? Should it be separate mode, or option for 
write-blocking?

-- 
Best regards,
Vladimir


Re: [PATCH v5 1/5] iotests: remove 'linux' from default supported platforms

2019-10-04 Thread Peter Maydell
On Fri, 4 Oct 2019 at 13:45, Max Reitz  wrote:
> > In the end, I don't care what code these patches touched. I do an
> > innocent git pull, and when I finally see that it's master that breaks
> > iotests and not my patches on top of it, I'm annoyed.
>
> Hm.  Part of my point was that this still happens all the time.
>
> Which is why I’d prefer more tests to run in CI than a handful of not
> very useful ones in make check.
>
> (Of course, running it in a CI won’t prevent iotest failures on your
> machine, but in practice neither does the current model.)

If you want the tree to be defended against problems, then
you need to run tests in 'make check'. Nothing else gets consistently
run and has failures spotted either before code goes into the
tree or quickly afterwards. 'make check' does have the restriction
that we don't want the tests to take too long to run, but in
general the block layer should be running some reasonable subset
of tests in the project's standard "run the tests please" command.
(I have no opinion on exactly what that subset ought to be, beyond
that it would be good if that subset doesn't have known intermittent
failures in it.)

> I still think that I personally would prefer the iotests to not run as
> part of make check, but just as CI.

'make check' *is* our CI gate, to a first approximation. Most of
the various travis and other setups are simply a front-end for
"build QEMU in various configurations and on various hosts
and then run 'make check'". The travis setup at the moment is
a bit odd, because it runs tests but it's not a gate on our
merging changes. Ideally I would like to fix this so that
rather than me personally running "make check" via a bunch
of scripts we have one CI setup that we trust (gitlab seems
the current favoured contender) and that gates changes flowing
into master rather than me doing it manually. We might then turn
travis off if it's not providing anything for us that the gitlab
setup doesn't.

thanks
-- PMM



Re: [PATCH v2] s390x/tcg: MVCL: Exit to main loop if requested

2019-10-04 Thread Alex Bennée


David Hildenbrand  writes:

> On 04.10.19 14:11, Peter Maydell wrote:
>> On Fri, 4 Oct 2019 at 09:02, David Hildenbrand  wrote:
>>> So shall we leave this patch as-is (adding a summary of what you
>>> explained to the description) or shall we somehow factor out the
>>> TCG-internal-thingy check?
>>
>> Nothing else in target code touches the icount data structures,
>> so if this s390 insn needs to make this check I think it ought
>> to do it by calling a function implemented by the tcg code;
>> that can then have a good name that describes what it's doing
>> and a doc comment explaining the reason we need to have it.
>>
>> thanks
>> -- PMM
>>
>
> I can offer something like this:
>
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index 49db07ba0b..d370ac0134 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -72,6 +72,26 @@ void QEMU_NORETURN cpu_loop_exit(CPUState *cpu);
>  void QEMU_NORETURN cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc);
>  void QEMU_NORETURN cpu_loop_exit_atomic(CPUState *cpu, uintptr_t pc);
>
> +/**
> + * cpu_cond_loop_exit_restore:
> + * @cpu: the vCPU state to be restored
> + * @pc: the host PC
> + *
> + * Trigger a cpu_loop_exit_restore() in case somebody asked for a return
> + * to the main loop (e.g. cpu_exit() or cpu_interrupt()).
> + *
> + * This is helpful for architectures that support interruptible
> + * instructions. After writing back all state to registers/memory, this
> + * call can be used to conditionally return back to the main loop or to
> + * continue executing the interruptible instruction.
> + */
> +static inline void cpu_cond_loop_exit_restore(CPUState *cpu, uintptr_t pc)
> +{
> +if (unlikely((int32_t)atomic_read(_neg(cs)->icount_decr.u32) < 0)) {
> +cpu_loop_exit_restore(cs, ra);
> +}
> +}
> +

cpu_loop_exit_restore_cond_irq
cpu_loop_exit_restore_ifirq
cpu_loop_exit_restore_conditional
cpu_loop_exit_restore_maybe

meh, naming stuff is hard:

Acked-by: Alex Bennée 

>  #if !defined(CONFIG_USER_ONLY)
>  void cpu_reloading_memory_map(void);
>  /**
>
>
> Or, as alternative, something like "cpu_shall_exit()" which only
> wraps the single check.


--
Alex Bennée



Re: [PATCH] netmap: support git-submodule build otption

2019-10-04 Thread Peter Maydell
On Fri, 4 Oct 2019 at 14:03, Giuseppe Lettieri  wrote:
>
> From: Giuseppe Lettieri 
>
> With this patch, netmap support can be enabled with
> the following options to the configure script:
>
>   --enable-netmap[=system]
>
> Use the host system netmap installation.
> Fail if not found.
>
>   --enable-netmap=git
>
> clone the official netmap repository on
> github (mostly useful for CI)
>
> Signed-off-by: Giuseppe Lettieri 
> ---
>  .gitmodules |  3 +++
>  configure   | 64 +
>  2 files changed, 58 insertions(+), 9 deletions(-)
>
> diff --git a/.gitmodules b/.gitmodules
> index c5c474169d..bf75dbc5e3 100644
> --- a/.gitmodules
> +++ b/.gitmodules
> @@ -58,3 +58,6 @@
>  [submodule "roms/opensbi"]
> path = roms/opensbi
> url =   https://git.qemu.org/git/opensbi.git
> +[submodule "netmap"]
> +   path = netmap
> +   url = https://github.com/luigirizzo/netmap.git

Hi; this patch seems to be missing the rationale for why
we want to do this. New submodules:
 * should always be on git.qemu.org (we need to mirror them
in case the original upstream vanishes)
 * need a strong justification for why they're required
(ie why we can't just use whatever the system-provided
version of the library is, or fall back to not providing
the feature if the library isn't present)

Basically new submodules are a pain so we seek to minimize
the use of them.

thanks
-- PMM



[PATCH v3] s390x/tcg: MVCL: Exit to main loop if requested

2019-10-04 Thread David Hildenbrand
MVCL is interruptible and we should check for interrupts and process
them after writing back the variables to the registers. Let's check
for any exit requests and exit to the main loop. Introduce a new helper
function for that: cpu_cond_loop_exit_restore().

When booting Fedora 30, I can see a handful of these exits and it seems
to work reliable. Also, Richard explained why this works correctly even
when MVCL is called via EXECUTE:

(1) TB with EXECUTE runs, at address Ae
- env->psw_addr stored with Ae.
- helper_ex() runs, memory address Am computed
  from D2a(X2a,B2a) or from psw.addr+RI2.
- env->ex_value stored with memory value modified by R1a

(2) TB of executee runs,
- env->ex_value stored with 0.
- helper_mvcl() runs, using and updating R1b, R1b+1, R2b, R2b+1.

(3a) helper_mvcl() completes,
 - TB of executee continues, psw.addr += ilen.
 - Next instruction is the one following EXECUTE.

(3b) helper_mvcl() exits to main loop,
 - cpu_loop_exit_restore() unwinds psw.addr = Ae.
 - Next instruction is the EXECUTE itself...
 - goto 1.

As the PoP mentiones that an interruptible instruction called via EXECUTE
should avoid modifying storage/registers that are used by EXECUTE itself,
it is fine to retrigger EXECUTE.

Cc: Alex Bennée 
Cc: Peter Maydell 
Cc: Paolo Bonzini 
Suggested-by: Richard Henderson 
Signed-off-by: David Hildenbrand 
---

v2 -> v3:
- Add TCG helper function
- Add details about EXECUTE to description
- Return to main loop only if there is work left to do

v1 -> v2:
- Check only if icount_decr.u32 < 0
- Drop should_interrupt_instruction() and perform the check inline
- Rephrase comment, subject, and description

---
 include/exec/exec-all.h   | 20 
 target/s390x/mem_helper.c | 11 ++-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 49db07ba0b..d6beeddecf 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -72,6 +72,26 @@ void QEMU_NORETURN cpu_loop_exit(CPUState *cpu);
 void QEMU_NORETURN cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc);
 void QEMU_NORETURN cpu_loop_exit_atomic(CPUState *cpu, uintptr_t pc);
 
+/**
+ * cpu_cond_loop_exit_restore:
+ * @cpu: the vCPU state to be restored
+ * @pc: the host PC
+ *
+ * Trigger a cpu_loop_exit_restore() in case somebody asked for a return
+ * to the main loop (e.g., cpu_exit() or cpu_interrupt()).
+ *
+ * This is helpful for architectures that support interruptible
+ * instructions. After writing back all state to registers/memory, this
+ * call can be used to conditionally return back to the main loop or to
+ * continue executing the interruptible instruction.
+ */
+static inline void cpu_cond_loop_exit_restore(CPUState *cpu, uintptr_t pc)
+{
+if (unlikely((int32_t)atomic_read(_neg(cpu)->icount_decr.u32) < 0)) {
+cpu_loop_exit_restore(cpu, pc);
+}
+}
+
 #if !defined(CONFIG_USER_ONLY)
 void cpu_reloading_memory_map(void);
 /**
diff --git a/target/s390x/mem_helper.c b/target/s390x/mem_helper.c
index 08c5cc6a99..c6ccb73d51 100644
--- a/target/s390x/mem_helper.c
+++ b/target/s390x/mem_helper.c
@@ -1015,6 +1015,7 @@ uint32_t HELPER(mvcl)(CPUS390XState *env, uint32_t r1, 
uint32_t r2)
 uint64_t srclen = env->regs[r2 + 1] & 0xff;
 uint64_t src = get_address(env, r2);
 uint8_t pad = env->regs[r2 + 1] >> 24;
+CPUState *cs = env_cpu(env);
 S390Access srca, desta;
 uint32_t cc, cur_len;
 
@@ -1065,7 +1066,15 @@ uint32_t HELPER(mvcl)(CPUS390XState *env, uint32_t r1, 
uint32_t r2)
 env->regs[r1 + 1] = deposit64(env->regs[r1 + 1], 0, 24, destlen);
 set_address_zero(env, r1, dest);
 
-/* TODO: Deliver interrupts. */
+/*
+ * MVCL is interruptible. Return to the main loop if requested after
+ * writing back all state to registers. If no interrupt will get
+ * injected, we'll end up back in this handler and continue processing
+ * the remaining parts.
+ */
+if (destlen) {
+cpu_cond_loop_exit_restore(cs, ra);
+}
 }
 return cc;
 }
-- 
2.21.0




Re: bitmap migration bug with -drive while block mirror runs

2019-10-04 Thread Eric Blake

On 10/4/19 4:24 AM, Vladimir Sementsov-Ogievskiy wrote:


The way I see it, we know an auto-generated node name will never be
correct, but an explicitly specified one represents an explicit user
configuration.

It's wrong to use generated names for migration details, but it's never
wrong to use explicit configuration.

So I believe they are /already/ distinct in nature. We even already have
code that can tell them apart.


Is it restricted to create user node-names formatted like automated ones?


Yes. Explicit node names cannot begin with '#', while all generated node 
names do.




There are four cases here:

- The bitmap is loaded to a root node with an explicit name
- The bitmap is loaded to a non-root node with an explicit name

The blockdev case with persistence. The name represents explicit user
configuration and can be relied upon in the destination.

- The bitmap is loaded to a root node with an implicit name, with a named BB

The -drive case. The named BB represents the explicit user configuration
and can be relied upon.

- The bitmap is loaded to a non-root node with an implicit name.


So, do you suggest to save information of haw bitmap was loaded or created in
BdrvDirtyBitmap structure, to distinguish, how to identify it, by blk name or
by node-name? And how this information would be updated on bitmap merge? And
what about creating bitmaps?

So if one bitmap created in node N by blk name B, and another bitmap created in
same node N by node-name N, will we migrated these bitmaps in different ways?


In the -drive case (historical libvirt), the block device is named, and 
node names are generated (it may be possible to use -drive and still 
create explicit node names, but libvirt will never do that).  You can 
create a bitmap using either ('drive-name','bitmap-name'), or 
('generated-node-name','bitmap-name'), but for the purposes of 
migration, only the 'drive-name' variant is migrateable.


In the -blockdev case (upcoming libvirt), the block device is anonymous, 
and all node names are given by libvirt.  Thus, you can only create a 
bitmap using ('node-name','bitmap-name'), but it is also obvious that 
migration will use the 'node-name' variant.





If it's a problem for libvirt to keep same node-names, why should we insist?




Is it really a problem? If libvirt requests migration of bitmaps, those
bitmaps need to go somewhere. Even if it constructs a different graph
for valid reasons, it should still understand which qcow2 nodes
correlate to which other qcow2 nodes and name them accordingly.

I don't see why this is actually a terrible constraint. Even in our
mapping proposal we're still using node-names.




The obvious case I see is that if we have a source:

Backing.qcow2 (contains bitmap1) <- Active.qcow2 (contains bitmap2)

and we want to migrate AND flatten at the same time, but still preserve 
the bitmaps as a record of changes between the points in time, then 
libvirt needs a way to specify migration to:


Flattened.qcow2 (contains bitmap1 and bitmap2)

No matter which node name libvirt assigns to Flattened.qcow2, at least 
one of the two bitmaps on the source will be migrated to a different 
node name on the destination, while still giving the net result of a 
bitmap logically associated with the drive (and not any particular node).




Ok, I'm not completely against node-name matching, keeping in mind that it is
current default behavior anyway. And I see Peter not completely against too.

But I'd prefer to select default name from current moment, not involving 
information
of "how bitmap was created or loaded", as it may lead to migrating bitmaps from 
one
node in different ways which seems inconsistent.


As long as a bitmap never has both names non-generated, we should be 
fine (it either has an explicit drive name and generated node name, or 
it has no drive name and an explicit node name).




Current default is blk name. And node-name if blk name is not available. So I 
think
the only thing to fix right now is skipping filters. We possibly may 
additionally
restrict migrating bitmaps without blk name and with generated node-name.



--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org



Re: [PATCH 3/4] block/mirror: support unaligned write in active mirror

2019-10-04 Thread Max Reitz
On 03.10.19 11:34, Vladimir Sementsov-Ogievskiy wrote:
> 02.10.2019 18:52, Max Reitz wrote:
>> On 02.10.19 17:06, Vladimir Sementsov-Ogievskiy wrote:
>>> 02.10.2019 18:03, Vladimir Sementsov-Ogievskiy wrote:
 02.10.2019 17:57, Max Reitz wrote:
> On 12.09.19 17:13, Vladimir Sementsov-Ogievskiy wrote:
>> Prior 9adc1cb49af8d do_sync_target_write had a bug: it reset aligned-up
>> region in the dirty bitmap, which means that we may not copy some bytes
>> and assume them copied, which actually leads to producing corrupted
>> target.
>>
>> So 9adc1cb49af8d forced dirty bitmap granularity to be
>> request_alignment for mirror-top filter, so we are not working with
>> unaligned requests. However forcing large alignment obviously decreases
>> performance of unaligned requests.
>>
>> This commit provides another solution for the problem: if unaligned
>> padding is already dirty, we can safely ignore it, as
>> 1. It's dirty, it will be copied by mirror_iteration anyway
>> 2. It's dirty, so skipping it now we don't increase dirtiness of the
>>      bitmap and therefore don't damage "synchronicity" of the
>>      write-blocking mirror.
>
> But that’s not what active mirror is for.  The point of active mirror is
> that it must converge because every guest write will contribute towards
> that goal.
>
> If you skip active mirroring for unaligned guest writes, they will not
> contribute towards converging, but in fact lead to the opposite.
>

 The will not contribute only if region is already dirty. Actually, after
 first iteration of mirroring (copying the whole disk), all following writes
 will contribute, so the whole process must converge. It is a bit similar 
 with
 running one mirror loop in normal mode, and then enable write-blocking.

>>>
>>>
>>> In other words, we don't need "all guest writes contribute" to converge,
>>> "all guest writes don't create new dirty bits" is enough, as we have 
>>> parallel
>>> mirror iteration which contiguously handles dirty bits.
>>
>> Hm, in a sense.  But it does mean that guest writes will not contribute
>> to convergence.
>>
>> And that’s against the current definition of write-blocking, which does
>> state that “when data is written to the source, write it (synchronously)
>> to the target as well”.
>>
> 
> Hmm, understand. But IMHO our proposed behavior is better in general.
> Do you think it's a problem to change spec now?
> If yes, I'll resend with an option

Well, the thing is that I’d find it weird if write-blocking wasn’t
blocking in all cases.  And in my opinion, it makes more sense for
active mirror if all writes actively contributed to convergence.

Max



signature.asc
Description: OpenPGP digital signature


Re: [PATCH] audio: add -audiodev pa,in|out.latency= to documentation

2019-10-04 Thread Marc-André Lureau
Hi

On Fri, Oct 4, 2019 at 4:57 PM Stefan Hajnoczi  wrote:
>
> The "latency" parameter wasn't covered by the documentation.
>
> Signed-off-by: Stefan Hajnoczi 
> ---
> Hi Gerd,
> You asked me to resend this patch because there was a conflict.  I have
> rebased it onto qemu.git/master (4f59102571fc).
>
>  qemu-options.hx | 5 +
>  1 file changed, 5 insertions(+)
>
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 2a04ca6ac5..5c27c57273 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -470,6 +470,7 @@ DEF("audiodev", HAS_ARG, QEMU_OPTION_audiodev,
>  "-audiodev pa,id=id[,prop[=value][,...]]\n"
>  "server= PulseAudio server address\n"
>  "in|out.name= source/sink device name\n"
> +"in|out.latency= desired latency in microseconds\n"
>  #endif
>  #ifdef CONFIG_AUDIO_SDL
>  "-audiodev sdl,id=id[,prop[=value][,...]]\n"
> @@ -630,6 +631,10 @@ Sets the PulseAudio @var{server} to connect to.
>  @item in|out.name=@var{sink}
>  Use the specified source/sink for recording/playback.
>
> +@item in|out.latency=@var{usecs}
> +Desired latency in microseconds.  The PulseAudio server will try to honor 
> this
> +value but actual latencies may be lower or higher.

default is 15ms according to f6142777659f2e7ad143f2850f1f036f899f475f,
might be worth to document.

otherwise, looks good:
Reviewed-by: Marc-André Lureau 

> +
>  @end table
>
>  @item -audiodev sdl,id=@var{id}[,@var{prop}[=@var{value}][,...]]
> --
> 2.21.0
>
>


-- 
Marc-André Lureau



Re: [RFC 3/4] ptimer: Provide new transaction-based API

2019-10-04 Thread Peter Maydell
On Fri, 4 Oct 2019 at 12:48, Peter Maydell  wrote:
>
> Provide the new transaction-based API.


> +void ptimer_transaction_begin(ptimer_state *s)
> +{
> +assert(!s->in_transaction && s->callback);
> +s->in_transaction = true;

As I was working on converting the ptimer tests to the new
API I noticed a silly bug here -- we should set
   s->need_reload = false;
in ptimer_transaction_begin(). Otherwise one transaction has
decided to do a reload, we'll do a reload in commit every time,
including when that's bogus (eg when the timer is disabled)...

thanks
-- PMM



[PATCH] audio: add -audiodev pa,in|out.latency= to documentation

2019-10-04 Thread Stefan Hajnoczi
The "latency" parameter wasn't covered by the documentation.

Signed-off-by: Stefan Hajnoczi 
---
Hi Gerd,
You asked me to resend this patch because there was a conflict.  I have
rebased it onto qemu.git/master (4f59102571fc).

 qemu-options.hx | 5 +
 1 file changed, 5 insertions(+)

diff --git a/qemu-options.hx b/qemu-options.hx
index 2a04ca6ac5..5c27c57273 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -470,6 +470,7 @@ DEF("audiodev", HAS_ARG, QEMU_OPTION_audiodev,
 "-audiodev pa,id=id[,prop[=value][,...]]\n"
 "server= PulseAudio server address\n"
 "in|out.name= source/sink device name\n"
+"in|out.latency= desired latency in microseconds\n"
 #endif
 #ifdef CONFIG_AUDIO_SDL
 "-audiodev sdl,id=id[,prop[=value][,...]]\n"
@@ -630,6 +631,10 @@ Sets the PulseAudio @var{server} to connect to.
 @item in|out.name=@var{sink}
 Use the specified source/sink for recording/playback.
 
+@item in|out.latency=@var{usecs}
+Desired latency in microseconds.  The PulseAudio server will try to honor this
+value but actual latencies may be lower or higher.
+
 @end table
 
 @item -audiodev sdl,id=@var{id}[,@var{prop}[=@var{value}][,...]]
-- 
2.21.0




Re: [PATCH 02/67] iotests.py: Add @skip_for_imgopts()

2019-10-04 Thread Max Reitz
On 03.10.19 17:19, Vladimir Sementsov-Ogievskiy wrote:
> 01.10.2019 22:46, Max Reitz wrote:
>> Signed-off-by: Max Reitz 
>> ---
>>   tests/qemu-iotests/iotests.py | 13 +
>>   1 file changed, 13 insertions(+)
>>
>> diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
>> index 7030900807..cdcb62c4ac 100644
>> --- a/tests/qemu-iotests/iotests.py
>> +++ b/tests/qemu-iotests/iotests.py
>> @@ -950,6 +950,19 @@ def skip_if_unsupported(required_formats=[], 
>> read_only=False):
>>   return func_wrapper
>>   return skip_test_decorator
>>   
>> +def skip_for_imgopts(unsupported_opts=[]):
>> +'''Skip Test Decorator
>> +   Skips the test if imgopts contains any of the given options'''
>> +def skip_test_decorator(func):
>> +def func_wrapper(test_case: QMPTestCase, *args, **kwargs):
> 
> how about
> 
> unsup = set(imgopts) & set(unsupported_opts)
> if unsup:
> test_case.case_skip('... Options {} are ...', format(..., ', 
> '.join(map(str, unsup)))

Sure.

Max



signature.asc
Description: OpenPGP digital signature


[Virtio-fs] [PATCH] virtiofsd: Add pread64() to seccomp list for posix_fallocate()

2019-10-04 Thread Misono Tomohiro
I test virtiofs with NFS 4.0 as backend and notice that fallocate
causes system hang (kernel: 5.4-rc1, qemu: virtio-fs-dev branch):
 $ mount -t virtiofs myfs /mnt
 $ dd if=/dev/urandom bs=1000 seek=1 count=1 of=/mnt/file
 $ fallocate -l 2000 /mnt/file # system hang

This is because:
 1. virtiofs supports fallocate syscall while NFS 4.0 does not.
 2. virtiofsd uses posix_fallocate() and it fallbacks to pread64()/
pwrite64() sequence to reserve blocks if fallocate syscall is
not available.
 3. pread64() syscall is prohibited by seccomp and virtiofsd thread
is killed by SIGSYS.

So, just add pread64() to seccomp white list to fix this problem.

Signed-off-by: Misono Tomohiro 
---
 contrib/virtiofsd/seccomp.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/contrib/virtiofsd/seccomp.c b/contrib/virtiofsd/seccomp.c
index 93b679271d..88b61bca42 100644
--- a/contrib/virtiofsd/seccomp.c
+++ b/contrib/virtiofsd/seccomp.c
@@ -61,6 +61,7 @@ static const int syscall_whitelist[] = {
SCMP_SYS(ppoll),
SCMP_SYS(prctl), /* TODO restrict to just PR_SET_NAME? */
SCMP_SYS(preadv),
+   SCMP_SYS(pread64),
SCMP_SYS(pwritev),
SCMP_SYS(pwrite64),
SCMP_SYS(read),
-- 
2.21.0




Re: libvirt? qemu change that mmaps ELF files breaks libvirt svirt handling for os.kernel

2019-10-04 Thread Christian Borntraeger



On 04.10.19 14:36, Daniel P. Berrangé wrote:
> On Fri, Oct 04, 2019 at 02:18:49PM +0200, Christian Borntraeger wrote:
>>
>>
>> On 04.10.19 14:13, Paolo Bonzini wrote:
>>> On 04/10/19 14:03, Christian Borntraeger wrote:
 Stefano, Paolo,

 I have an interesting fail in QEMU 

 2019-10-04T12:00:32.675188Z qemu-system-s390x: GLib: g_mapped_file_unref: 
 assertion 'file != NULL' failed
 that bisected to 
 commit 816b9fe450220e19acb91a0ce4a8ade7000648d1 (refs/bisect/bad)
 elf-ops.h: Map into memory the ELF to load

 strace tells that I can read the ELF file, but not mmap
 strace:
 214365 openat(AT_FDCWD, "/var/lib/libvirt/images/test_cpu_timer.elf", 
 O_RDONLY) = 36
 214365 read(46, "\177ELF\2\2\1\0\0\0\0\0\0\0\0\0", 16) = 16
 214365 lseek(46, 0, SEEK_SET)   = 0
 [...]
 214365 fstat(46, {st_mode=S_IFREG|0755, st_size=168176, ...}) = 0
 214365 mmap(NULL, 168176, PROT_READ|PROT_WRITE, MAP_PRIVATE, 46, 0) = -1 
 EACCES (Permission denied)

 So reading from /var/lib/libvirt/images/test_cpu_timer.elf does work, 
 mmaping does not.
 setenforce 0 makes the problem go away. 

 This might be more of an issue in libvirt, setting the svirt context too
 restrictive, but I am not too deep into the svirt part of libvirt.
 Reverting the qemu commit makes the problem go away.
>>>
>>> Yes, the policy is too restrictive in my opinion.
>>>
>>> Can you include the output of "audit2allow" and/or "audit2allow -R"?
>>>
>>> Thanks,
>>>
>>> Paolo
>>>
>>
>> require {
>>  type unconfined_t;
>>  type virt_content_t;
>>  type svirt_t;
>>  type systemd_tmpfiles_t;
>>  type user_home_t;
>>  type NetworkManager_t;
>>  class file { entrypoint execute ioctl lock map open read write };
>>  class bpf prog_run;
>> }
>>
>> #= svirt_t ==
>> allow svirt_t user_home_t:file { entrypoint execute ioctl lock open read 
>> write };
>>
>> # This avc can be allowed using the boolean 'domain_can_mmap_files'
> 
> This is an unrelated boolean and we don't want to use that so ignore
> this suggestion !
> 
>> allow svirt_t virt_content_t:file map;
> 
> This matches your strace.  virt_content_t is the label we use on
> files that are exposed to QEMU read-only.
> 
> The svirt policy only allows mmap on files that re exposed read-write
> currently.
> 
> I believe we can safely allow this mmap on read-only files too
> because  the read-only restriction is enforced at time of open,
> and any writes to the mapped file  will result in a private
> copy on write.
> 
> Please file a bz entry against the selinux-policy component for
> whatever distro you're using, and/or Fedora rawhide, and CC me
> on it too.

Done. This was on Fedora 30.
https://bugzilla.redhat.com/show_bug.cgi?id=1758525

 Now sure about others like RHEL. RHV.




Re: [PATCH v5 1/5] iotests: remove 'linux' from default supported platforms

2019-10-04 Thread Max Reitz
On 04.10.19 12:19, Kevin Wolf wrote:
> Am 02.10.2019 um 19:47 hat Max Reitz geschrieben:
>> On 02.10.19 18:44, Kevin Wolf wrote:
>>> Am 02.10.2019 um 13:57 hat Max Reitz geschrieben:
 It usually worked fine for me because it’s rather rare that non-block
 patches broke the iotests.
>>>
>>> I disagree. It happened all the time that someone else broke the iotests
>>> in master and we needed to fix it up.
>>
>> OK.
>>
>> (In my experience, it’s still mostly block patches, only that they tend
>> to go through non-block trees.)
> 
> Fair enough, it's usually code that touches block code. I assumed "block
> patches" to mean patches that go through one of the block trees and for
> which iotests are run before sending a pull request.
> 
> In the end, I don't care what code these patches touched. I do an
> innocent git pull, and when I finally see that it's master that breaks
> iotests and not my patches on top of it, I'm annoyed.

Hm.  Part of my point was that this still happens all the time.

Which is why I’d prefer more tests to run in CI than a handful of not
very useful ones in make check.

(Of course, running it in a CI won’t prevent iotest failures on your
machine, but in practice neither does the current model.)

 Maybe my main problem is that I feel like now I have to deal with all of
 the fallout, even though adding the iotests to make check wasn’t my idea
 and neither me nor Kevin ever consented.  (I don’t know whether Kevin
 consented implicitly, but I don’t see his name in bdd95e47844f2d8b.)

 You can’t just give me more responsibility without my consent and then
 call me egoistic for not liking it.
>>>
>>> I think I may have implicitly consented by helping Alex with the changes
>>> to 'check' that made its output fit better in 'make check'.
>>>
>>> Basically, my stance is that I share your dislike for the effects on me
>>> personally (also, I can't run 'make check' any more before a pull
>>> request without testing half of the iotests twice because I still need a
>>> full iotests run),

-x auto for the qcow2 pass, by the way.

>>> but I also think that having iotests in 'make check'
>>> is really the right thing for the project despite the inconvenience it
>>> means for me.
>>
>> Then I expect you to NACK Thomas’s patch, and I take it to mean that I
>> can rely on you to handle basically all make check iotests failures that
>> are not caused by my own pull requests.
>>
>> Works for me.
> 
> Ah, we can also play this game the other way round.
> 
> So if you merge that revert, when iotests break in master, I take it I
> can rely on you to fix it up before it impacts my working branch?

This is not a game, I’m talking purely from my standpoint:
(I talked wrongly, but more on that below)

Whenever make check fails, it’s urgent.  Without iotests running in make
check, we had some time to sort the situation out.  That’s no longer the
case.

That means we need to take care of everything immediately.  And I purely
want help with that.  I just did not have the impression that such help
was there in the past months.
(This impression may or may not have a correlation to reality.)

I thought that was just because nobody really cared about the iotests in
make check.  Now you say you do, so that’s why I said you should NACK
the revert and then I know that I can count on your help.


Now where I was wrong: I didn’t say “your help” but “you handle it”.
That was wrong.  I’m sorry.  That would mean I reap all the benefits and
you have to pay the price.


What I’m not so sure of is why you didn’t just say that’s unfair and
instead reply with an impossible suggestion.  That makes it a bit
difficult for me to find out exactly what you plan to do.

I take your point as “Exactly, this suggestion would be impossible.
With the tests in make check, the worst that happens is CI breakage and
rejected pull requests, and dealing with those is very much possible.”[1]

As I’ve said, my POV is different, because I find CI breakage, make
check breakage, and rejected pull requests to be worse because those are
failures on other people’s machines.  That puts me personally under much
more stress than failures on my own machine.  But it seems that you feel
differently.

[1] Or maybe you wanted to say that you find a rare intermittent failure
that slips into make check less worse than 100 % failure of some test
for everyone who runs the iotests.  I don’t know.
I know that the 100 % failures are annoying but generally easily fixed;
whereas the intermittent ones are the ones that stick and hard to fix.
And those intermittent ones are unlikely to cause pull requests to be
rejected.

 I know you’ll say that we just need to ensure we can add more tests,
 then.  But for one thing, the most important tests are the ones that
 take the longest, like 041.  And the other of course is that adding any
 more tests to make check just brings me more pain, so I won’t do it.
>>>
>>> That's something that 

[Bug 1777777] Re: arm9 clock pending (SP804)

2019-10-04 Thread Peter Maydell
I sent out an initial RFC patchset which fixes this (it's just an RFC because 
it only converts this one device to the new ptimer API, and we should do a 
proper conversion of all devices); it seems to make the test case in this bug 
work correctly:
https://patchew.org/QEMU/20191004114848.16831-1-peter.mayd...@linaro.org/

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/177

Title:
  arm9 clock pending (SP804)

Status in QEMU:
  Confirmed

Bug description:
  Hello all,

  I'm using the versatilepb board and the timer Interrupt Mask Status
  register (offset 0x14 of the SP804) does not seem to be working
  properly on the latest qemu-2.12. I tried on the 2.5 (i believe this
  is the mainstream version that comes with Linux) and it works
  perfectly.

  What happens is that the pending bit does not seem to be set in some
  scenarios. In my case, I see the timer value decreasing to 0 and then
  being reset to the reload value and neither does the interrupt is
  triggered nor the pending bit is set.

  I believe this is a matter of timing since in the "long" run the
  system eventually catches up (after a few microseconds).

  Thank you

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/177/+subscriptions



Re: libvirt? qemu change that mmaps ELF files breaks libvirt svirt handling for os.kernel

2019-10-04 Thread Daniel P . Berrangé
On Fri, Oct 04, 2019 at 02:18:49PM +0200, Christian Borntraeger wrote:
> 
> 
> On 04.10.19 14:13, Paolo Bonzini wrote:
> > On 04/10/19 14:03, Christian Borntraeger wrote:
> >> Stefano, Paolo,
> >>
> >> I have an interesting fail in QEMU 
> >>
> >> 2019-10-04T12:00:32.675188Z qemu-system-s390x: GLib: g_mapped_file_unref: 
> >> assertion 'file != NULL' failed
> >> that bisected to 
> >> commit 816b9fe450220e19acb91a0ce4a8ade7000648d1 (refs/bisect/bad)
> >> elf-ops.h: Map into memory the ELF to load
> >>
> >> strace tells that I can read the ELF file, but not mmap
> >> strace:
> >> 214365 openat(AT_FDCWD, "/var/lib/libvirt/images/test_cpu_timer.elf", 
> >> O_RDONLY) = 36
> >> 214365 read(46, "\177ELF\2\2\1\0\0\0\0\0\0\0\0\0", 16) = 16
> >> 214365 lseek(46, 0, SEEK_SET)   = 0
> >> [...]
> >> 214365 fstat(46, {st_mode=S_IFREG|0755, st_size=168176, ...}) = 0
> >> 214365 mmap(NULL, 168176, PROT_READ|PROT_WRITE, MAP_PRIVATE, 46, 0) = -1 
> >> EACCES (Permission denied)
> >>
> >> So reading from /var/lib/libvirt/images/test_cpu_timer.elf does work, 
> >> mmaping does not.
> >> setenforce 0 makes the problem go away. 
> >>
> >> This might be more of an issue in libvirt, setting the svirt context too
> >> restrictive, but I am not too deep into the svirt part of libvirt.
> >> Reverting the qemu commit makes the problem go away.
> > 
> > Yes, the policy is too restrictive in my opinion.
> > 
> > Can you include the output of "audit2allow" and/or "audit2allow -R"?
> > 
> > Thanks,
> > 
> > Paolo
> > 
> 
> require {
>   type unconfined_t;
>   type virt_content_t;
>   type svirt_t;
>   type systemd_tmpfiles_t;
>   type user_home_t;
>   type NetworkManager_t;
>   class file { entrypoint execute ioctl lock map open read write };
>   class bpf prog_run;
> }
> 
> #= svirt_t ==
> allow svirt_t user_home_t:file { entrypoint execute ioctl lock open read 
> write };
> 
> # This avc can be allowed using the boolean 'domain_can_mmap_files'

This is an unrelated boolean and we don't want to use that so ignore
this suggestion !

> allow svirt_t virt_content_t:file map;

This matches your strace.  virt_content_t is the label we use on
files that are exposed to QEMU read-only.

The svirt policy only allows mmap on files that re exposed read-write
currently.

I believe we can safely allow this mmap on read-only files too
because  the read-only restriction is enforced at time of open,
and any writes to the mapped file  will result in a private
copy on write.

Please file a bz entry against the selinux-policy component for
whatever distro you're using, and/or Fedora rawhide, and CC me
on it too.

> corecmd_bin_entry_type(svirt_t)
> userdom_manage_user_home_content_dirs(svirt_t)
> userdom_map_user_home_files(svirt_t)
> virt_rw_svirt_image(svirt_t)

These look unrelated to the problem above.

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|



Re: [PATCH v2] s390x/tcg: MVCL: Exit to main loop if requested

2019-10-04 Thread David Hildenbrand
On 04.10.19 14:11, Peter Maydell wrote:
> On Fri, 4 Oct 2019 at 09:02, David Hildenbrand  wrote:
>> So shall we leave this patch as-is (adding a summary of what you
>> explained to the description) or shall we somehow factor out the
>> TCG-internal-thingy check?
> 
> Nothing else in target code touches the icount data structures,
> so if this s390 insn needs to make this check I think it ought
> to do it by calling a function implemented by the tcg code;
> that can then have a good name that describes what it's doing
> and a doc comment explaining the reason we need to have it.
> 
> thanks
> -- PMM
> 

I can offer something like this:

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 49db07ba0b..d370ac0134 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -72,6 +72,26 @@ void QEMU_NORETURN cpu_loop_exit(CPUState *cpu);
 void QEMU_NORETURN cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc);
 void QEMU_NORETURN cpu_loop_exit_atomic(CPUState *cpu, uintptr_t pc);
 
+/**
+ * cpu_cond_loop_exit_restore:
+ * @cpu: the vCPU state to be restored
+ * @pc: the host PC
+ *
+ * Trigger a cpu_loop_exit_restore() in case somebody asked for a return
+ * to the main loop (e.g. cpu_exit() or cpu_interrupt()).
+ *
+ * This is helpful for architectures that support interruptible
+ * instructions. After writing back all state to registers/memory, this
+ * call can be used to conditionally return back to the main loop or to
+ * continue executing the interruptible instruction.
+ */
+static inline void cpu_cond_loop_exit_restore(CPUState *cpu, uintptr_t pc)
+{
+if (unlikely((int32_t)atomic_read(_neg(cs)->icount_decr.u32) < 0)) {
+cpu_loop_exit_restore(cs, ra);
+}
+}
+
 #if !defined(CONFIG_USER_ONLY)
 void cpu_reloading_memory_map(void);
 /**


Or, as alternative, something like "cpu_shall_exit()" which only
wraps the single check.

-- 

Thanks,

David / dhildenb



Re: [PATCH] target/arm/arch_dump: Add SVE notes

2019-10-04 Thread no-reply
Patchew URL: https://patchew.org/QEMU/20191004094609.32714-1-drjo...@redhat.com/



Hi,

This series seems to have some coding style problems. See output below for
more information:

Type: series
Message-id: 20191004094609.32714-1-drjo...@redhat.com
Subject: [PATCH] target/arm/arch_dump: Add SVE notes

=== TEST SCRIPT BEGIN ===
#!/bin/bash
git rev-parse base > /dev/null || exit 0
git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram
./scripts/checkpatch.pl --mailback base..
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
From https://github.com/patchew-project/qemu
 * [new tag] patchew/20191004120313.5347-1-drjo...@redhat.com -> 
patchew/20191004120313.5347-1-drjo...@redhat.com
Switched to a new branch 'test'
d041497 target/arm/arch_dump: Add SVE notes

=== OUTPUT BEGIN ===
ERROR: code indent should never use tabs
#21: FILE: include/elf.h:1653:
+#define NT_ARM_SVE^I0x405^I^I/* ARM Scalable Vector Extension$

WARNING: Block comments use a leading /* on a separate line
#21: FILE: include/elf.h:1653:
+#define NT_ARM_SVE 0x405   /* ARM Scalable Vector Extension

ERROR: code indent should never use tabs
#22: FILE: include/elf.h:1654:
+^I^I^I^I^I   registers */$

WARNING: Block comments use * on subsequent lines
#22: FILE: include/elf.h:1654:
+#define NT_ARM_SVE 0x405   /* ARM Scalable Vector Extension
+  registers */

WARNING: Block comments use a trailing */ on a separate line
#22: FILE: include/elf.h:1654:
+  registers */

total: 2 errors, 3 warnings, 181 lines checked

Commit d0414974dab6 (target/arm/arch_dump: Add SVE notes) has style problems, 
please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.
=== OUTPUT END ===

Test command exited with code: 1


The full log is available at
http://patchew.org/logs/20191004094609.32714-1-drjo...@redhat.com/testing.checkpatch/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

Re: [PULL v2 00/22] Misc patches for 2010-10-02

2019-10-04 Thread Peter Maydell
On Thu, 3 Oct 2019 at 12:05, Paolo Bonzini  wrote:
>
> The following changes since commit 7f21573c822805a8e6be379d9bcf3ad9effef3dc:
>
>   Merge remote-tracking branch 
> 'remotes/huth-gitlab/tags/pull-request-2019-10-01' into staging (2019-10-01 
> 13:13:38 +0100)
>
> are available in the git repository at:
>
>
>   git://github.com/bonzini/qemu.git tags/for-upstream
>
> for you to fetch changes up to 8cd5aa6899689e6f9f3e72bf43474a436148a341:
>
>   tests/docker: only enable ubsan for test-clang (2019-10-03 12:58:05 +0200)
>
> 
> * Compilation fix for KVM (Alex)
> * SMM fix (Dmitry)
> * VFIO error reporting (Eric)
> * win32 fixes and workarounds (Marc-André)
> * qemu-pr-helper crash bugfix (Maxim)
> * Memory leak fixes (myself)
> * Record-replay deadlock (Pavel)
> * i386 CPUID bits (Sebastian)
> * kconfig tweak (Thomas)
> * Valgrind fix (Thomas)
> * distclean improvement (Thomas)
> * Autoconverge test (Yury)

> Thomas Huth (2):
>   Makefile: Remove generated files when doing 'distclean'

This change breaks the 'build from clean' part of my merge
build tests -- I've followed up to the patch email with the
description of why.

thanks
-- PMM



libvirt? qemu change that mmaps ELF files breaks libvirt svirt handling for os.kernel

2019-10-04 Thread Christian Borntraeger
Stefano, Paolo,

I have an interesting fail in QEMU 

2019-10-04T12:00:32.675188Z qemu-system-s390x: GLib: g_mapped_file_unref: 
assertion 'file != NULL' failed
that bisected to 
commit 816b9fe450220e19acb91a0ce4a8ade7000648d1 (refs/bisect/bad)
elf-ops.h: Map into memory the ELF to load

strace tells that I can read the ELF file, but not mmap
strace:
214365 openat(AT_FDCWD, "/var/lib/libvirt/images/test_cpu_timer.elf", O_RDONLY) 
= 36
214365 read(46, "\177ELF\2\2\1\0\0\0\0\0\0\0\0\0", 16) = 16
214365 lseek(46, 0, SEEK_SET)   = 0
[...]
214365 fstat(46, {st_mode=S_IFREG|0755, st_size=168176, ...}) = 0
214365 mmap(NULL, 168176, PROT_READ|PROT_WRITE, MAP_PRIVATE, 46, 0) = -1 EACCES 
(Permission denied)

So reading from /var/lib/libvirt/images/test_cpu_timer.elf does work, mmaping 
does not.
setenforce 0 makes the problem go away. 

This might be more of an issue in libvirt, setting the svirt context too
restrictive, but I am not too deep into the svirt part of libvirt.
Reverting the qemu commit makes the problem go away.




Re: [RFC 0/4] transaction-based ptimer API

2019-10-04 Thread Paolo Bonzini
On 04/10/19 14:00, Peter Maydell wrote:
> No, because stop/run causes the ptimer to "lose time"
> (we stop the underlying timer and restart it). It's
> very common for a device to want to change the ptimer
> properties without a stop/restart -- "set the ptimer
> count value when the guest writes to the device's counter
> register" is the common one. Of the three begin/commit
> blocks in the arm_timer.c conversion, only one of those
> involves calls to stop/run, and even there we only
> call stop/run if the write to the control register
> modified the enable bit.

Ok, thanks.

Paolo



[PATCH v2] target/arm/arch_dump: Add SVE notes

2019-10-04 Thread Andrew Jones
When dumping a guest with dump-guest-memory also dump the SVE
registers if they are in use.

Signed-off-by: Andrew Jones 
---
 include/elf.h  |   2 +
 target/arm/arch_dump.c | 135 -
 2 files changed, 135 insertions(+), 2 deletions(-)

diff --git a/include/elf.h b/include/elf.h
index 3501e0c8d03a..a7c357af74ca 100644
--- a/include/elf.h
+++ b/include/elf.h
@@ -1650,6 +1650,8 @@ typedef struct elf64_shdr {
 #define NT_ARM_HW_BREAK 0x402   /* ARM hardware breakpoint registers */
 #define NT_ARM_HW_WATCH 0x403   /* ARM hardware watchpoint registers */
 #define NT_ARM_SYSTEM_CALL  0x404   /* ARM system call number */
+#define NT_ARM_SVE 0x405   /* ARM Scalable Vector Extension
+  registers */
 
 /*
  * Physical entry point into the kernel.
diff --git a/target/arm/arch_dump.c b/target/arm/arch_dump.c
index 26a2c098687c..b02e398430b9 100644
--- a/target/arm/arch_dump.c
+++ b/target/arm/arch_dump.c
@@ -62,12 +62,23 @@ struct aarch64_user_vfp_state {
 
 QEMU_BUILD_BUG_ON(sizeof(struct aarch64_user_vfp_state) != 528);
 
+/* struct user_sve_header from arch/arm64/include/uapi/asm/ptrace.h */
+struct aarch64_user_sve_header {
+uint32_t size;
+uint32_t max_size;
+uint16_t vl;
+uint16_t max_vl;
+uint16_t flags;
+uint16_t reserved;
+} QEMU_PACKED;
+
 struct aarch64_note {
 Elf64_Nhdr hdr;
 char name[8]; /* align_up(sizeof("CORE"), 4) */
 union {
 struct aarch64_elf_prstatus prstatus;
 struct aarch64_user_vfp_state vfp;
+struct aarch64_user_sve_header sve;
 };
 } QEMU_PACKED;
 
@@ -76,6 +87,8 @@ struct aarch64_note {
 (AARCH64_NOTE_HEADER_SIZE + sizeof(struct aarch64_elf_prstatus))
 #define AARCH64_PRFPREG_NOTE_SIZE \
 (AARCH64_NOTE_HEADER_SIZE + sizeof(struct aarch64_user_vfp_state))
+#define AARCH64_SVE_NOTE_SIZE(env) \
+(AARCH64_NOTE_HEADER_SIZE + sve_size(env))
 
 static void aarch64_note_init(struct aarch64_note *note, DumpState *s,
   const char *name, Elf64_Word namesz,
@@ -128,11 +141,113 @@ static int 
aarch64_write_elf64_prfpreg(WriteCoreDumpFunction f,
 return 0;
 }
 
+#ifdef TARGET_AARCH64
+static off_t sve_zreg_offset(uint32_t vq, int n)
+{
+off_t off = sizeof(struct aarch64_user_sve_header);
+return ROUND_UP(off, 16) + vq * 16 * n;
+}
+static off_t sve_preg_offset(uint32_t vq, int n)
+{
+return sve_zreg_offset(vq, 32) + vq * 16 / 8 * n;
+}
+static off_t sve_fpsr_offset(uint32_t vq)
+{
+off_t off = sve_preg_offset(vq, 17) + offsetof(struct aarch64_note, sve);
+return ROUND_UP(off, 16) - offsetof(struct aarch64_note, sve);
+}
+static off_t sve_fpcr_offset(uint32_t vq)
+{
+return sve_fpsr_offset(vq) + sizeof(uint32_t);
+}
+static uint32_t sve_current_vq(CPUARMState *env)
+{
+return sve_zcr_len_for_el(env, arm_current_el(env)) + 1;
+}
+static size_t sve_size_vq(uint32_t vq)
+{
+off_t off = sve_fpcr_offset(vq) + sizeof(uint32_t) +
+offsetof(struct aarch64_note, sve);
+return ROUND_UP(off, 16) - offsetof(struct aarch64_note, sve);
+}
+static size_t sve_size(CPUARMState *env)
+{
+return sve_size_vq(sve_current_vq(env));
+}
+
+static int aarch64_write_elf64_sve(WriteCoreDumpFunction f,
+   CPUARMState *env, int cpuid,
+   DumpState *s)
+{
+struct aarch64_note *note;
+ARMCPU *cpu = env_archcpu(env);
+uint32_t vq = sve_current_vq(env);
+uint32_t fpr;
+uint8_t *buf;
+int ret, i;
+
+note = g_malloc0(AARCH64_SVE_NOTE_SIZE(env));
+buf = (uint8_t *)>sve;
+
+aarch64_note_init(note, s, "LINUX", 6, NT_ARM_SVE, sve_size_vq(vq));
+
+note->sve.size = cpu_to_dump32(s, sve_size_vq(vq));
+note->sve.max_size = cpu_to_dump32(s, sve_size_vq(cpu->sve_max_vq));
+note->sve.vl = cpu_to_dump16(s, vq * 16);
+note->sve.max_vl = cpu_to_dump16(s, cpu->sve_max_vq * 16);
+note->sve.flags = cpu_to_dump16(s, 1);
+
+for (i = 0; i < 32; ++i) {
+#ifdef HOST_WORDS_BIGENDIAN
+uint64_t d[vq * 2];
+int j;
+
+for (j = 0; j < vq * 2; ++j) {
+d[j] = bswap64(env->vfp.zregs[i].d[j]);
+}
+#else
+uint64_t *d = >vfp.zregs[i].d[0];
+#endif
+memcpy([sve_zreg_offset(vq, i)], [0], vq * 16);
+}
+
+for (i = 0; i < 17; ++i) {
+#ifdef HOST_WORDS_BIGENDIAN
+uint64_t d[DIV_ROUND_UP(vq * 2, 8)];
+int j;
+
+for (j = 0; j < DIV_ROUND_UP(vq * 2, 8); ++j) {
+d[j] = bswap64(env->vfp.pregs[i].p[j]);
+}
+#else
+uint64_t *d = >vfp.pregs[i].p[0];
+#endif
+memcpy([sve_preg_offset(vq, i)], [0], vq * 16 / 8);
+}
+
+fpr = cpu_to_dump32(s, vfp_get_fpsr(env));
+memcpy([sve_fpsr_offset(vq)], , sizeof(uint32_t));
+
+fpr = cpu_to_dump32(s, vfp_get_fpcr(env));
+memcpy([sve_fpcr_offset(vq)], , sizeof(uint32_t));
+
+ret 

  1   2   3   >