date:20190422

Re: [Qemu-devel] [PATCH v8 1/6] ppc: spapr: Handle "ibm, nmi-register" and "ibm, nmi-interlock" RTAS calls

2019-04-22 Thread David Gibson

On Mon, Apr 22, 2019 at 12:32:58PM +0530, Aravinda Prasad wrote:
> This patch adds support in QEMU to handle "ibm,nmi-register"
> and "ibm,nmi-interlock" RTAS calls.
> 
> The machine check notification address is saved when the
> OS issues "ibm,nmi-register" RTAS call.
> 
> This patch also handles the case when multiple processors
> experience machine check at or about the same time by
> handling "ibm,nmi-interlock" call. In such cases, as per
> PAPR, subsequent processors serialize waiting for the first
> processor to issue the "ibm,nmi-interlock" call. The second
> processor that also received a machine check error waits
> till the first processor is done reading the error log.
> The first processor issues "ibm,nmi-interlock" call
> when the error log is consumed. This patch implements the
> releasing part of the error-log while subsequent patch
> (which builds error log) handles the locking part.
> 
> Signed-off-by: Aravinda Prasad 

Reviewed-by: David Gibson 

Although I wonder if it needs to be moved later in the series to avoid
advertising the availability of the RTAS calls to the guest before all
the prereq patches are in place to make them work properly.

> ---
>  hw/ppc/spapr.c |   18 ++
>  hw/ppc/spapr_rtas.c|   61 
> 
>  include/hw/ppc/spapr.h |9 ++-
>  3 files changed, 87 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index c56939a..6642cb5 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1805,6 +1805,11 @@ static void spapr_machine_reset(void)
>  first_ppc_cpu->env.gpr[5] = 0;
>  
>  spapr->cas_reboot = false;
> +
> +spapr->guest_machine_check_addr = -1;
> +
> +/* Signal all vCPUs waiting on this condition */
> +qemu_cond_broadcast(&spapr->mc_delivery_cond);
>  }
>  
>  static void spapr_create_nvram(SpaprMachineState *spapr)
> @@ -2095,6 +2100,16 @@ static const VMStateDescription vmstate_spapr_dtb = {
>  },
>  };
>  
> +static const VMStateDescription vmstate_spapr_machine_check = {
> +.name = "spapr_machine_check",
> +.version_id = 1,
> +.minimum_version_id = 1,
> +.fields = (VMStateField[]) {
> +VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
> +VMSTATE_END_OF_LIST()
> +},
> +};
> +
>  static const VMStateDescription vmstate_spapr = {
>  .name = "spapr",
>  .version_id = 3,
> @@ -2127,6 +2142,7 @@ static const VMStateDescription vmstate_spapr = {
>  &vmstate_spapr_dtb,
>  &vmstate_spapr_cap_large_decr,
>  &vmstate_spapr_cap_ccf_assist,
> +&vmstate_spapr_machine_check,
>  NULL
>  }
>  };
> @@ -3068,6 +3084,8 @@ static void spapr_machine_init(MachineState *machine)
>  
>  kvmppc_spapr_enable_inkernel_multitce();
>  }
> +
> +qemu_cond_init(&spapr->mc_delivery_cond);
>  }
>  
>  static int spapr_kvm_type(MachineState *machine, const char *vm_type)
> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> index ee24212..c2f3991 100644
> --- a/hw/ppc/spapr_rtas.c
> +++ b/hw/ppc/spapr_rtas.c
> @@ -348,6 +348,39 @@ static void rtas_get_power_level(PowerPCCPU *cpu, 
> SpaprMachineState *spapr,
>  rtas_st(rets, 1, 100);
>  }
>  
> +static void rtas_ibm_nmi_register(PowerPCCPU *cpu,
> +  SpaprMachineState *spapr,
> +  uint32_t token, uint32_t nargs,
> +  target_ulong args,
> +  uint32_t nret, target_ulong rets)
> +{
> +uint64_t rtas_addr = spapr_get_rtas_addr();
> +
> +if (!rtas_addr) {
> +rtas_st(rets, 0, RTAS_OUT_NOT_SUPPORTED);
> +return;
> +}
> +
> +spapr->guest_machine_check_addr = rtas_ld(args, 1);
> +rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> +}
> +
> +static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
> +   SpaprMachineState *spapr,
> +   uint32_t token, uint32_t nargs,
> +   target_ulong args,
> +   uint32_t nret, target_ulong rets)
> +{
> +if (!spapr->guest_machine_check_addr) {
> +/* NMI register not called */
> +rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
> +} else {
> +qemu_cond_signal(&spapr->mc_delivery_cond);
> +rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> +}
> +}
> +
> +
>  static struct rtas_call {
>  const char *name;
>  spapr_rtas_fn fn;
> @@ -466,6 +499,30 @@ void spapr_load_rtas(SpaprMachineState *spapr, void 
> *fdt, hwaddr addr)
>  }
>  }
>  
> +uint64_t spapr_get_rtas_addr(void)
> +{
> +SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +int rtas_node;
> +const struct fdt_property *rtas_addr_prop;
> +void *fdt = spapr->fdt_blob;
> +uint32_t rtas_addr;
> +
> +/* fetch rtas addr from fdt */
> +rtas_node = fdt_path_offset(fdt, "/rtas");
> +if

[Qemu-devel] [PATCH v3 2/2] Always return EXCP_DMAR for protection id trap as EXCP_DMP is considered legacy.

2019-04-22 Thread Nick Hudson

From: Nick Hudson 

"In PA-RISC 1.1 (Second Edition) and later revisions, processors must use
traps 26, 27,and 28 which provide equivalent functionality"

Signed-off-by: Nick Hudson 
---
 target/hppa/mem_helper.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/target/hppa/mem_helper.c b/target/hppa/mem_helper.c
index c9b57d07c3..77fb544838 100644
--- a/target/hppa/mem_helper.c
+++ b/target/hppa/mem_helper.c
@@ -154,8 +154,7 @@ int hppa_get_physical_address(CPUHPPAState *env, vaddr 
addr, int mmu_idx,

 if (unlikely(!(prot & type))) {
 /* The access isn't allowed -- Inst/Data Memory Protection Fault.  */
-ret = (type & PAGE_EXEC ? EXCP_IMP :
-   prot & PAGE_READ ? EXCP_DMP : EXCP_DMAR);
+ret = (type & PAGE_EXEC) ? EXCP_IMP : EXCP_DMAR;
 goto egress;
 }

--
2.17.1

[Qemu-devel] [PATCH v3 1/2] Implement the pcxl and pcxl2 Fast TLB Insert instructions as used by NetBSD (and OpenBSD)

2019-04-22 Thread Nick Hudson

From: Nick Hudson 

See
 https://parisc.wiki.kernel.org/images-parisc/a/a9/Pcxl2_ers.pdf
 page 13-9 (195/206)

Signed-off-by: Nick Hudson 
---
 target/hppa/insns.decode |  3 +++
 target/hppa/translate.c  | 52 
 2 files changed, 55 insertions(+)

diff --git a/target/hppa/insns.decode b/target/hppa/insns.decode
index 098370c2f0..f0dd71dd08 100644
--- a/target/hppa/insns.decode
+++ b/target/hppa/insns.decode
@@ -133,6 +133,9 @@ ixtlbx  01 b:5 r:5 sp:2 010 addr:1 0 0  
data=1
 ixtlbx  01 b:5 r:5 ... 00 addr:1 0 0\
 sp=%assemble_sr3x data=0

+# pcxl and pcxl2 Fast TLB Insert instructions
+ixtlbxf 01 0 r:5 00 0 data:1 01000 addr:1 0 0
+
 pxtlbx  01 b:5 x:5 sp:2 0100100 local:1 m:1 -   data=1
 pxtlbx  01 b:5 x:5 ... 000100 local:1 m:1 - \
 sp=%assemble_sr3x data=0
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 43b74367ea..8d59990cfe 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -2518,6 +2518,58 @@ static bool trans_pxtlbx(DisasContext *ctx, arg_pxtlbx 
*a)
 #endif
 }

+/* Implement the pcxl and pcxl2 Fast TLB Insert instructions.
+ * See
+ * https://parisc.wiki.kernel.org/images-parisc/a/a9/Pcxl2_ers.pdf
+ * page 13-9 (195/206) */
+static bool trans_ixtlbxf(DisasContext *ctx, arg_ixtlbxf *a)
+{
+CHECK_MOST_PRIVILEGED(EXCP_PRIV_OPR);
+#ifndef CONFIG_USER_ONLY
+TCGv_tl addr;
+TCGv_reg reg;
+TCGv_reg ar, sr;
+TCGv_tl atl, stl;
+
+nullify_over(ctx);
+
+/*if (not (pcxl or pcxl2))
+  return gen_illegal(ctx); */
+
+ar = get_temp(ctx);
+sr = get_temp(ctx);
+atl = get_temp_tl(ctx);
+stl = get_temp_tl(ctx);
+addr = get_temp_tl(ctx);
+
+
+if (a->data) {
+tcg_gen_ld_reg(sr, cpu_env, offsetof(CPUHPPAState, cr[CR_ISR]));
+tcg_gen_ld_reg(ar, cpu_env, offsetof(CPUHPPAState, cr[CR_IOR]));
+} else {
+tcg_gen_ld_reg(sr, cpu_env, offsetof(CPUHPPAState, cr[CR_IIASQ]));
+tcg_gen_ld_reg(ar, cpu_env, offsetof(CPUHPPAState, cr[CR_IIAOQ]));
+}
+
+tcg_gen_extu_reg_tl(atl, ar);
+tcg_gen_extu_reg_tl(stl, sr);
+tcg_gen_shli_i64(stl, stl, 32);
+tcg_gen_or_tl(addr, atl, stl);
+reg = load_gpr(ctx, a->r);
+if (a->addr) {
+gen_helper_itlba(cpu_env, addr, reg);
+} else {
+gen_helper_itlbp(cpu_env, addr, reg);
+}
+
+/* Exit TB for TLB change if mmu is enabled.  */
+if (ctx->tb_flags & PSW_C) {
+ctx->base.is_jmp = DISAS_IAQ_N_STALE;
+}
+return nullify_end(ctx);
+#endif
+}
+
 static bool trans_lpa(DisasContext *ctx, arg_ldst *a)
 {
 CHECK_MOST_PRIVILEGED(EXCP_PRIV_OPR);
--
2.17.1

[Qemu-devel] [PATCH v3 0/2] HPPA fixes for NetBSD/hppa emulation

2019-04-22 Thread Nick Hudson

v3 changes:
 - Don't use C99 comments and fix a typo in the comment

v2 changes:
 - remove old debug code

*** BLURB HERE ***

Nick Hudson (2):
  Implement the pcxl and pcxl2 Fast TLB Insert instructions as used by
NetBSD (and OpenBSD)
  Always return EXCP_DMAR for protection id trap as EXCP_DMP is
considered legacy.

 target/hppa/insns.decode |  3 +++
 target/hppa/mem_helper.c |  3 +--
 target/hppa/translate.c  | 52 
 3 files changed, 56 insertions(+), 2 deletions(-)

--
2.17.1

Re: [Qemu-devel] Fwd: How live migration work for vhost-user

2019-04-22 Thread fengyd

Hi,

I want to add some log to qemu-kvm-ev.
Do you know how to compile qemu-kvm-ev from source code?

Thanks

Yafeng

On Tue, 16 Apr 2019 at 16:47, Dr. David Alan Gilbert 
wrote:

> * fengyd (fengy...@gmail.com) wrote:
> > -- Forwarded message -
> > From: fengyd 
> > Date: Tue, 16 Apr 2019 at 09:17
> > Subject: Re: [Qemu-devel] How live migration work for vhost-user
> > To: Dr. David Alan Gilbert 
> >
> >
> > Hi,
> >
> > Any special feature needs to be supported on guest driver?
> > Because it's OK for standard Linux VM, but not OK for our VM where virtio
> > is  implemented by ourself.
>
> I'm not sure; you do have to support that 'log' mechanism but I don't
> know what else is needed.
>
> > And with qemu-kvm-ev-2.6, live migration can work with our VM where
> virtio
> > is  implemented by ourself.
>
> 2.6 is pretty old, so there's a lot of changes - not sure what's
> relevant.
>
> Dave
>
> > Thanks
> > Yafeng
> >
> > On Mon, 15 Apr 2019 at 22:54, Dr. David Alan Gilbert <
> dgilb...@redhat.com>
> > wrote:
> >
> > > * fengyd (fengy...@gmail.com) wrote:
> > > > Hi,
> > > >
> > > > During live migration,  the folloing log can see in nova-compute.log
> in
> > > my
> > > > environment:
> > > >  ERROR nova.virt.libvirt.driver
> [req-039a85e1-e7a1-4a63-bc6d-c4b9a044aab6
> > > > 0cdab20dc79f4bc6ae5790e7b4a898ac 3363c319773549178acc67f32c78310e -
> > > default
> > > > default] [instance: 5ec719f4-1865-4afe-a207-3d9fae22c410] Live
> Migration
> > > > failure: internal error: qemu unexpectedly closed the monitor:
> > > > 2019-04-15T02:58:22.213897Z qemu-kvm: VQ 0
> > > > size 0x100 < last_avail_idx 0x1e - used_idx 0x23
> > > >
> > > > It's OK for standard Linux VM, but not OK for our VM where virtio is
> > > > implemented by ourself.
> > > > KVM version as follow:
> > > > qemu-kvm-common-ev-2.12.0-18.el7_6.3.1.x86_64
> > > > qemu-kvm-ev-2.12.0-18.el7_6.3.1.x86_64
> > > > libvirt-daemon-kvm-3.9.0-14.2.el7.centos.ncir.8.x86_64
> > > >
> > > > Do you know what's the difference between virtio and vhost-user
> during
> > > > migration?
> > > > The function virtio_load in Qemu is called for virtio and vhost-user
> > > during
> > > > migration.
> > > > For virtio,  last_avail_idx  and used_idx are stored in Qemu, Qemu is
> > > > responsible for updating their values accordingly
> > > > For vhost-user, last_avail_idx  and used_idx are stored in vhost-user
> > > app,
> > > > eg. DPDK, not in Qemu?
> > > > How does migration work for vhost-user?
> > >
> > > I don't know the details, but my understanding is that vhost-user
> > > tells the vhost-user client about an area of 'log' memory, where the
> > > vhost-user client must mark pages as dirty.
> > >
> > > In the qemu source, see docs/interop/vhost-user.txt and see
> > > the VHOST_SET_LOG_BASE and VHOST_USER_SET_LOG_FD calls.
> > >
> > > If the client correctly marks the areas as dirty, then qemu
> > > should resend those pages across.
> > >
> > >
> > > Dave
> > >
> > > > Thanks in advance
> > > > Yafeng
> > > --
> > > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
> > >
> --
> Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
>

Re: [Qemu-devel] [PATCH 1/2] vfio/mdev: add version field as mandatory attribute for mdev device

2019-04-22 Thread Yan Zhao

On Tue, Apr 23, 2019 at 09:21:00AM +0800, Alex Williamson wrote:
> On Mon, 22 Apr 2019 21:01:52 -0400
> Yan Zhao  wrote:
> 
> > On Mon, Apr 22, 2019 at 10:39:50PM +0800, Alex Williamson wrote:
> > > On Fri, 19 Apr 2019 04:35:04 -0400
> > > Yan Zhao  wrote:
> > >   
> > > > device version attribute in mdev sysfs is used by user space software
> > > > (e.g. libvirt) to query device compatibility for live migration of VFIO
> > > > mdev devices. This attribute is mandatory if a mdev device supports live
> > > > migration.  
> > > 
> > > The Subject: doesn't quite match what's being proposed here.
> > >
> > > > It consists of two parts: common part and vendor proprietary part.
> > > > common part: 32 bit. lower 16 bits is vendor id and higher 16 bits
> > > >  identifies device type. e.g., for pci device, it is
> > > >  "pci vendor id" | (VFIO_DEVICE_FLAGS_PCI << 16).  
> > > 
> > > What purpose does this serve?  If it's intended as some sort of
> > > namespace feature, shouldn't we first assume that we can only support
> > > migration to devices of the same type?  Therefore each type would
> > > already have its own namespace.  Also that would make the trailing bit
> > > of the version string listed below in the example redundant.  A vendor
> > > is still welcome to include this in their version string if they wish,
> > > but I think the string should be entirely vendor defined.
> > >  
> > hi Alex,
> > This common part is a kind of namespace.
> > Because if version string is entirely defined by vendors, I'm worried about
> > if there is a case that one vendor's version string happens to deceive and
> > interfere with another vendor's version checking?
> > e.g.
> > vendor A has a version string like: vendor id + device id + mdev type
> > vendor B has a version string like: device id + vendor id + mdev type
> > but vendor A's vendor id is 0x8086, device id is 0x1217
> > vendor B's vendor id is 0x1217, device id is 0x8086.
> > 
> > In this corner case, the two vendors may regard the two device is
> > migratable but actually they are not.
> > 
> > That's the reason for this common part that serve as a kind of namespace
> > that all vendors will comply with to avoid overlap.
> 
> If we assume that migration can only occur between matching mdev types,
> this is redundant, each type already has their own namespace.
>
hi Alex,
do you mean user space software like libvirt needs to first check whether
mdev type is matching and then check whether version is matching?

if user space software only checks version for migration, it means vendor
driver has to include mdev type in their vendor proprietary part string,
right?

Another thing is that could there be any future mdev parent driver that
applies to all mdev devices, just like vfio-pci? like Yi's vfio-pci-mdev
driver (https://lkml.org/lkml/2019/3/13/114)?

> > > > vendor proprietary part: this part is varied in length. vendor driver 
> > > > can
> > > >  specify any string to identify a device.
> > > > 
> > > > When reading this attribute, it should show device version string of the
> > > > device of type . If a device does not support live migration, 
> > > > it
> > > > should return errno.
> > > > When writing a string to this attribute, it returns errno for
> > > > incompatibility or returns written string length in compatibility case.
> > > > If a device does not support live migration, it always returns errno.
> > > > 
> > > > For user space software to use:
> > > > 1.
> > > > Before starting live migration, user space software first reads source 
> > > > side
> > > > mdev device's version. e.g.
> > > > "#cat \
> > > > /sys/bus/pci/devices/\:00\:02.0/5ac1fb20-2bbf-4842-bb7e-36c58c3be9cd/mdev_type/version"
> > > > 00028086-193b-i915-GVTg_V5_4
> > > > 
> > > > 2.
> > > > Then, user space software writes the source side returned version string
> > > > to device version attribute in target side, and checks the return value.
> > > > If a negative errno is returned in the target side, then mdev devices in
> > > > source and target sides are not compatible;
> > > > If a positive number is returned and it equals to the length of written
> > > > string, then the two mdev devices in source and target side are 
> > > > compatible.
> > > > e.g.
> > > > (a) compatibility case
> > > > "# echo 00028086-193b-i915-GVTg_V5_4 >
> > > > /sys/bus/pci/devices/\:00\:02.0/882cc4da-dede-11e7-9180-078a62063ab1/mdev_type/version"
> > > > 
> > > > (b) incompatibility case
> > > > "#echo 00028086-193b-i915-GVTg_V5_1 >
> > > > /sys/bus/pci/devices/\:00\:02.0/882cc4da-dede-11e7-9180-078a62063ab1/mdev_type/version"
> > > > -bash: echo: write error: Invalid argument
> > > > 
> > > > 3. if two mdev devices are compatible, user space software can start
> > > > live migration, and vice versa.
> > > > 
> > > > Note: if a mdev device does not support live migration, it either does
> > > > not provide a version attribute, or always returns errno when

[Qemu-devel] [Bug 1793904] Re: files are randomly overwritten by Zero Bytes

2019-04-22 Thread Hans

I did some undocumented testing to circle the issue down: It seems to
affect Files that are stored as sparsed files either raw or qcow2.

There was a big event and I had the chance to find some people using
glusterfs with qemu, too. the biggest difference as far as I could see
was that they all where using xfs based backends while I am using ext4.

I have to fix a few more issues in Production this week, then I will run
more tests.

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1793904

Title:
  files are randomly overwritten by Zero Bytes

Status in QEMU:
  New

Bug description:
  Hello together,

  I am currently tracking down a "Hard to reproduce" bug on my systems
  that I first discovered during gitlab installation:

  
  Here is the Text from the Gitlab Bug 
https://gitlab.com/gitlab-org/gitlab-ce/issues/51023
  
--

  Steps to reproduce

  I still do not have all the steps together to reproduce, so far it is:
  apt install gitlab-ce and
  gitlab-rake backup:recovery
  Then it works for some time before it fails.

  What is the current bug behavior?

  I have a 12 hour old Installation of gitlab ce 11.2.3-ce.0 for debian
  stretch on a fresh debian stretch system together with our imported
  data. However it turns out that some gitlab related files contain Zero
  bytes instead of actual data.

  root@gitlab:~# xxd -l 16 /opt/gitlab/bin/gitlab-ctl
  :          

  This behaviour is somewhat strange because it was working for a few
  minutes/hours. I did write a shell script to find out which files are
  affected of this memory loss. It turns out that only files located
  under /opt/gitlab are affected, if I rule out files like
  /var/log/faillog and some postgresql table files.

  What I find even stranger is that it does not seem to affect
  Logfiles/databases/git_repositorys but application files, like .rb
  scripts. and not all of them. No non gitlab package is affected.

  What is the expected correct behavior?
  Binarys and .rb files should stay as they are.

  Possible fixes

  I am still investigating, I hope that it is not an infrastructure problem 
(libvirt/qemu/glusterfs) it can still be one but the point that files of 
/opt/gitlab are affected and not any logfile and that we to not have similar 
problems with any other system leads me to the application for now.
  If I would have used docker the same problem might have caused a reboot of 
the container.
  But for the Debian package it is a bit of work to recover. That is all a 
workaround, however.
  
-

  I do have found 2 more systems having the same problem with different
  software:

  root@erp:~# xxd -l 16 /usr/share/perl/5.26.2/constant.pm
  :          

  The Filesize itself is, compared with another machine 1660 Bytes
  for both the corrupted and the intact file. It looks to me from the
  outside that if some data in the qcow2 file is written too many bytes
  get written so it sometimes overwites data of existing files located
  right after the position in memory where the write goes to.

  I would like to rule out Linux+Ext4 filesystems because I find it
  highly unlikely that such an error keeps undiscovered in that part of
  the environment for long. I think the same might go for qemu.

  Which leaves qemu, gemu+gluster:// mount, qcow2 volumes, glusterfs,
  network. So I am now going to check if I can find any system which
  gets its volumes via fusermount instead of gluster:// path if the
  error is gone there. This may take a while.

  
  - some software versions---

  QEMU emulator version 2.12.0 (Debian 1:2.12+dfsg-3)
  Copyright (c) 2003-2017 Fabrice Bellard and the QEMU Project developers

  libvirt-daemon-driver-storage-gluster/testing,unstable,now 4.6.0-2
  amd64 [installed]

  ii  glusterfs-client   4.1.3-1amd64

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1793904/+subscriptions

[Qemu-devel] [Bug 1615079] Re: GTK+ UI virtual consoles scrolling broken

2019-04-22 Thread Thomas Huth

Are you talking about serial console? Or graphical (VGA) console? Which
QEMU target (x86? ... and which machine?)? How did you start QEMU? ...
this bug report clearly lacks some more information...

** Changed in: qemu
   Status: New => Incomplete

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1615079

Title:
  GTK+ UI virtual consoles scrolling broken

Status in QEMU:
  Incomplete

Bug description:
  "In the virtual consoles, you can use Ctrl-Up, Ctrl-Down, Ctrl-PageUp
  and Ctrl-PageDown to move in the back log."

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1615079/+subscriptions

Re: [Qemu-devel] [PATCH 3/3] accel: Remove unused AccelClass::available field

2019-04-22 Thread Thomas Huth

On 22/04/2019 23.04, Eduardo Habkost wrote:
> The field is not used anymore, we can remove it.
> 
> Signed-off-by: Eduardo Habkost 
> ---
>  include/sysemu/accel.h | 1 -
>  accel/accel.c  | 5 -
>  2 files changed, 6 deletions(-)

Reviewed-by: Thomas Huth

Re: [Qemu-devel] [PATCH 2/3] qtest: Don't compile qtest accel on non-POSIX systems

2019-04-22 Thread Thomas Huth

On 22/04/2019 23.04, Eduardo Habkost wrote:
> qtest_available() will always return 0 on non-POSIX systems.
> It's simpler to just not compile the accelerator code on those
> systems instead of relying on the AccelClass::available function.
> 
> Signed-off-by: Eduardo Habkost 
> ---
>  include/sysemu/qtest.h | 9 -
>  accel/qtest.c  | 1 -
>  accel/Makefile.objs| 2 +-
>  3 files changed, 1 insertion(+), 11 deletions(-)

Reviewed-by: Thomas Huth

Re: [Qemu-devel] [PATCH 1/3] qtest: Move accel code to accel/qtest.c

2019-04-22 Thread Thomas Huth

On 22/04/2019 23.04, Eduardo Habkost wrote:
> QTest has two parts: the server (-qtest) and the accelerator
> (-machine accel=qtest).  The accelerator depends on CONFIG_POSIX
> due to its usage of sigwait(), but the server doesn't.
> 
> Move the accel code to accel/qtest.c.  Later we will disable
> compilation of accel/qtest.c on non-POSIX systems.
> 
> Signed-off-by: Eduardo Habkost 
> ---
>  accel/qtest.c   | 55 +

New file ==> You should update MAINTAINERS here as well.

 Thomas

Re: [Qemu-devel] Following up questions related to QEMU and I/O Thread

2019-04-22 Thread Wei Li

Hi Stefan,

I did investigation per your advices, please see inline for the details and 
questions.

   1. Compare "iostat -dx 1" inside the guest and host.  Are the I/O
   patterns comparable?  blktrace(8) can give you even more detail on
   the exact I/O patterns.  If the guest and host have different I/O
   patterns (blocksize, IOPS, queue depth) then request merging or
   I/O scheduler effects could be responsible for the difference.

[wei]: That's good point, I compared 'iostate -dx1" between guest and host, but 
I have not find obvious difference between guest and host which could 
responsible for the difference.

2. kvm_stat or perf record -a -e kvm:\* counters for vmexits and
   interrupt injections.  If these counters vary greatly between queue
   sizes, then that is usually a clue.  It's possible to get higher
   performance by spending more CPU cycles although your system doesn't
   have many CPUs available, so I'm not sure if this is the case.

[wei]: vmexits looks like a reason. I am using FIO tool to read/write block 
storage via following sample command, interesting thing is that kvm:kvm_exit 
count decreased from 846K to 395K after I increased num_queues from 2 to 4 
while the vCPU count is 2.
   1). Does this mean using more queues than vCPU count may increase 
IOPS via spending more CPU cycle? 
   2). Could you please help me better understand how more queues is 
able to spend more CPU cycle? Thanks!
   FIO command: fio --filename=/dev/sdb --direct=1 --rw=randrw --bs=4k 
--ioengine=libaio --iodepth=64 --numjobs=4 --time_based --group_reporting 
--name=iops --runtime=60 --eta-newline=1

3. Power management and polling (kvm.ko halt_poll_ns, tuned profiles,
   and QEMU iothread poll-max-ns).  It's expensive to wake a CPU when it
   goes into a low power mode due to idle.  There are several features
   that can keep the CPU awake or even poll so that request latency is
   reduced.  The reason why the number of queues may matter is that
   kicking multiple queues may keep the CPU awake more than batching
   multiple requests onto a small number of queues.
[wei]: CPU awake could be another reason, I noticed that kvm:kvm_vcpu_wakeup 
count decreased from 151K to 47K after I increased num_queues from 2 to 4 while 
the vCPU count is 2.
   1). Does this mean more queues may keep CPU more busy and awake 
which reduced the vcpu wakeup time?
   2). If using more num_queues than vCPU count is able to get higher 
IOPS for this case, is it safe to use 4 queues while it only have 2 vCPU, or 
there is any concern or impact by using more queues than vCPU count which I 
should keep in mind?

In addition, does Virtio-scsi support Batch I/O Submission feature which may be 
able to increase the IOPS via reducing the number of system calls?

Thanks,
Wei

On 4/16/19, 6:42 PM, "Wei Li"  wrote:

Thanks Stefan and Dongli for your feedback and advices!

I will do the further investigation per your advices and get back to you 
later on.

Thanks, 
-Wei

On 4/16/19, 2:20 AM, "Stefan Hajnoczi"  wrote:

On Tue, Apr 16, 2019 at 07:23:38AM +0800, Dongli Zhang wrote:
> 
> 
> On 4/16/19 1:34 AM, Wei Li wrote:
> > Hi @Paolo Bonzini & @Stefan Hajnoczi,
> > 
> > Would you please help confirm whether @Paolo Bonzini's multiqueue 
feature change will benefit virtio-scsi or not? Thanks!
> > 
> > @Stefan Hajnoczi,
> > I also spent some time on exploring the virtio-scsi multi-queue 
features via num_queues parameter as below, here are what we found:
> > 
> > 1. Increase number of Queues from one to the same number as CPU 
will get better IOPS increase.
> > 2. Increase number of Queues to the number (e.g. 8) larger than the 
number of vCPU (e.g. 2) can get even better IOPS increase.
> 
> As mentioned in below link, when the number of hw queues is larger 
than
> nr_cpu_ids, the blk-mq layer would limit and only use at most 
nr_cpu_ids queues
> (e.g., /sys/block/sda/mq/).
> 
> That is, when the num_queus=4 while vcpus is 2, there should be only 
2 queues
> available /sys/block/sda/mq/
> 
> 
https://lore.kernel.org/lkml/1553682995-5682-1-git-send-email-dongli.zh...@oracle.com/
> 
> I am just curious how increasing the num_queues from 2 to 4 would 
double the
> iops, while there are only 2 vcpus available...

I don't know the answer.  It's especially hard to guess without seeing
the benchmark (fio?) configuration and QEMU command-line.

Common things to look at are:

1. Compare "iostat -dx 1" inside the guest and host.  Are the I/O
   patterns comparable?  blktrace(

Re: [Qemu-devel] [PATCH v5 1/6] libnvdimm: nd_region flush callback support

2019-04-22 Thread Pankaj Gupta



> 
> Dan Williams  writes:
> 
> > On Mon, Apr 22, 2019 at 8:59 AM Jeff Moyer  wrote:
> >>
> >> Dan Williams  writes:
> >>
> >> > On Thu, Apr 18, 2019 at 9:18 AM Christoph Hellwig 
> >> > wrote:
> >> >>
> >> >> On Thu, Apr 18, 2019 at 09:05:05AM -0700, Dan Williams wrote:
> >> >> > > > I'd either add a comment about avoiding retpoline overhead here
> >> >> > > > or just
> >> >> > > > make ->flush == NULL mean generic_nvdimm_flush(). Just so that
> >> >> > > > people don't
> >> >> > > > get confused by the code.
> >> >> > >
> >> >> > > Isn't this premature optimization?  I really don't like adding
> >> >> > > things
> >> >> > > like this without some numbers to show it's worth it.
> >> >> >
> >> >> > I don't think it's premature given this optimization technique is
> >> >> > already being deployed elsewhere, see:
> >> >> >
> >> >> > https://lwn.net/Articles/774347/
> >> >>
> >> >> For one this one was backed by numbers, and second after feedback
> >> >> from Linux we switched to the NULL pointer check instead.
> >> >
> >> > Ok I should have noticed the switch to NULL pointer check. However,
> >> > the question still stands do we want everyone to run numbers to
> >> > justify this optimization, or make it a new common kernel coding
> >> > practice to do:
> >> >
> >> > if (!object->op)
> >> > generic_op(object);
> >> > else
> >> > object->op(object);
> >> >
> >> > ...in hot paths?
> >>
> >> I don't think nvdimm_flush is a hot path.  Numbers of some
> >> representative workload would prove one of us right.
> >
> > I'd rather say that the if "if (!op) do_generic()" pattern is more
> > readable in the general case, saves grepping for who set the op in the
> > common case. The fact that it has the potential to be faster is gravy
> > at that point.
> 
> If the primary motivation is performance, then I'd expect performance
> numbers to back it up.  If that isn't the primary motivation, then
> choose whichever way you feel is appropriate.

Agree. This change enhances the code readability. Will add this change in
v6 with other changes.

Thank you! 

Pankaj

> 
> Cheers,
> Jeff
>

Re: [Qemu-devel] [PATCH v2 1/3] edu: mmio: set 'max_access_size' to 8

2019-04-22 Thread Li Qiang

Philippe Mathieu-Daudé  于2019年4月23日周二 上午12:28写道：

> On 4/22/19 3:17 AM, Li Qiang wrote:
> >
> >
> > Philippe Mathieu-Daudé mailto:phi...@redhat.com>> 于
> > 2019年4月21日周日 下午6:28写道：
> >
> > Hi Li,
> >
> > The patch title is not very descriptive, maybe "allow 64-bit access"
> >
> >
> > On 4/20/19 6:14 PM, Li Qiang wrote:
> > > The edu spec said, the MMIO area can be accessed by 8 bytes.
> >
> > or 64-bit...
> >
> > > However currently the 'max_access_size' is not so the MMIO
> > > access dispatch can only access 4 bytes one time. This patch
> >
> > 32-bit
> >
> > > fixes this to respect the spec.
> > >
> > > Notice: here the 'min_access_size' is not a must, I set this
> > > for completement.
> >
> > Which one? valid/impl? I think you can drop this comment from the
> commit
> > description.
> >
> >
> > Both needed. from memory_access_size, if we has no valid.max_access_size,
> > this function will set it to 4.
> >
> > static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
> > {
> > unsigned access_size_max = mr->ops->valid.max_access_size;
> >
> > /* Regions are assumed to support 1-4 byte accesses unless
> >otherwise specified.  */
> > if (access_size_max == 0) {
> > access_size_max = 4;
> > }
> >...
> > }
> >
> > From access_with_adjusted_size, if we has no impl.max_access_size,
> > this function will set it to 4.
> >
> > ps: I will appreciate if anyone can explain what's the meaning of valid
> > and impl's min/max_access_size
> > and how it affects the behavior.
>
> "valid" describes the valid access from the bus to the device.
>
> Indeed in the EDU case those are 4 and 8.
>
> "impl" describes the accesses implemented by the QEMU device model.
> The developper who writes the device is free to choose the accesses he
> will model.
>
> If valid/impl accesses don't match, the function
> access_with_adjusted_size() from memory.c will adjust the bus access to
> the device implementation.
>
> For example, if the device only implements 1 and 2 bytes accesses, with
> a 1-4 valid access, if the CPU executes a 32-bit access, this function
> will do 2x 16-bit access to the device (incrementing the address by 2)
> and returns a 32-bit result.
>
> Similarly, if the CPU does a 8-bit access on a 32-bit impl device,
> access_with_adjusted_size() will execute a single 32-bit access to the
> device, then mask/shift the returned value and returns a 8-bit result to
> the caller.
>
>
Thanks  Philippe,

I think I get the pointer.

Thanks,
Li Qiang



> > Thanks,
> > Li Qiang
> >
> > >
> > > Signed-off-by: Li Qiang mailto:liq...@163.com>>
> > > ---
> > >  hw/misc/edu.c | 9 +
> > >  1 file changed, 9 insertions(+)
> > >
> > > diff --git a/hw/misc/edu.c b/hw/misc/edu.c
> > > index 91af452c9e..65fc32b928 100644
> > > --- a/hw/misc/edu.c
> > > +++ b/hw/misc/edu.c
> > > @@ -289,6 +289,15 @@ static const MemoryRegionOps edu_mmio_ops = {
> > >  .read = edu_mmio_read,
> > >  .write = edu_mmio_write,
> > >  .endianness = DEVICE_NATIVE_ENDIAN,
> > > +.valid = {
> > > +.min_access_size = 4,
> >
> > Per the spec, this is correct.
> >
> > > +.max_access_size = 8,
> >
> > Correct.
> >
> > > +},
> > > +.impl = {
> > > +.min_access_size = 4,
> >
> > OK.
> >
> > > +.max_access_size = 8,
> >
> > Correct.
> >
> > > +},
> > > +
> > >  };
> > >
> > >  /*
> > >
> >
> > With title/description updated:
> > Reviewed-by: Philippe Mathieu-Daudé  > >
> >
>

[Qemu-devel] question： Does qemu-pr-helper have its own log files？

2019-04-22 Thread Jie Wang

 Does qemu-pr-helper have its own log files？

Re: [Qemu-devel] [PATCH 1/2] add VirtIONet vhost_stopped flag to prevent multiple stops

2019-04-22 Thread Jason Wang




On 2019/4/23 上午4:14, Dan Streetman wrote:

On Sun, Apr 21, 2019 at 10:50 PM Jason Wang  wrote:


On 2019/4/17 上午2:46, Dan Streetman wrote:

From: Dan Streetman 

Buglink: https://launchpad.net/bugs/1823458

There is a race condition when using the vhost-user driver, between a guest
shutdown and the vhost-user interface being closed.  This is explained in
more detail at the bug link above; the short explanation is the vhost-user
device can be closed while the main thread is in the middle of stopping
the vhost_net.  In this case, the main thread handling shutdown will
enter virtio_net_vhost_status() and move into the n->vhost_started (else)
block, and call vhost_net_stop(); while it is running that function,
another thread is notified that the vhost-user device has been closed,
and (indirectly) calls into virtio_net_vhost_status() also.


I think we need figure out why there are multiple vhost_net_stop() calls
simultaneously. E.g vhost-user register fd handlers like:

  qemu_chr_fe_set_handlers(&s->chr, NULL, NULL,
   net_vhost_user_event, NULL, nc0->name,
NULL,
   true);

which uses default main context, so it should only be called only in
main thread.

net_vhost_user_event() schedules chr_closed_bh() to do its bottom half
work; does aio_bh_schedule_oneshot() execute its events from the main
thread?



I think so if net_vhost_user_event() was called in main thread (it calls 
qemu_get_current_aio_context()).





For reference, the call chain is:

chr_closed_bh()
   qmp_set_link()
 nc->info->link_status_changed() -> virtio_net_set_link_status()
   virtio_net_set_status()
 virtio_net_vhost_status()



The code was added by Marc since:

commit e7c83a885f865128ae3cf1946f8cb538b63cbfba
Author: Marc-André Lureau 
Date:   Mon Feb 27 14:49:56 2017 +0400

    vhost-user: delay vhost_user_stop

Cc him for more thoughts.

Thanks



Thanks



   Since the
vhost_net status hasn't yet changed, the second thread also enters
the n->vhost_started block, and also calls vhost_net_stop().  This
causes problems for the second thread when it tries to stop the network
that's already been stopped.

This adds a flag to the struct that's atomically set to prevent more than
one thread from calling vhost_net_stop().  The atomic_fetch_inc() is likely
overkill and probably could be done with a simple check-and-set, but
since it's a race condition there would still be a (very, very) small
window without using an atomic to set it.

Signed-off-by: Dan Streetman 
---
   hw/net/virtio-net.c| 3 ++-
   include/hw/virtio/virtio-net.h | 1 +
   2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index ffe0872fff..d36f50d5dd 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -13,6 +13,7 @@

   #include "qemu/osdep.h"
   #include "qemu/iov.h"
+#include "qemu/atomic.h"
   #include "hw/virtio/virtio.h"
   #include "net/net.h"
   #include "net/checksum.h"
@@ -240,7 +241,7 @@ static void virtio_net_vhost_status(VirtIONet *n, uint8_t 
status)
"falling back on userspace virtio", -r);
   n->vhost_started = 0;
   }
-} else {
+} else if (atomic_fetch_inc(&n->vhost_stopped) == 0) {
   vhost_net_stop(vdev, n->nic->ncs, queues);
   n->vhost_started = 0;
   }
diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
index b96f0c643f..d03fd933d0 100644
--- a/include/hw/virtio/virtio-net.h
+++ b/include/hw/virtio/virtio-net.h
@@ -164,6 +164,7 @@ struct VirtIONet {
   uint8_t nouni;
   uint8_t nobcast;
   uint8_t vhost_started;
+int vhost_stopped;
   struct {
   uint32_t in_use;
   uint32_t first_multi;

Re: [Qemu-devel] [PATCH v14 0/2] support MAP_SYNC for memory-backend-file

2019-04-22 Thread Wei Yang

On Mon, Apr 22, 2019 at 03:22:55PM -0300, Eduardo Habkost wrote:
>On Mon, Apr 22, 2019 at 08:34:51AM -0400, Michael S. Tsirkin wrote:
>> On Mon, Apr 22, 2019 at 08:48:47AM +0800, Wei Yang wrote:
>> > Linux 4.15 introduces a new mmap flag MAP_SYNC, which can be used to
>> > guarantee the write persistence to mmap'ed files supporting DAX (e.g.,
>> > files on ext4/xfs file system mounted with '-o dax').
>> > 
>> > A description of MAP_SYNC and MAP_SHARED_VALIDATE can be found at
>> > https://patchwork.kernel.org/patch/10028151/
>> > 
>> > In order to make sure that the file metadata is in sync after a fault 
>> > while we are writing a shared DAX supporting backend files, this
>> > patch-set enables QEMU to use MAP_SYNC flag for memory-backend-dax-file.
>> > 
>> > As the DAX vs DMA truncated issue was solved, we refined the code and
>> > send out this feature for the v5 version.
>> > 
>> > We will pass MAP_SYNC to mmap(2); if MAP_SYNC is supported and
>> > 'share=on' & 'pmem=on'. 
>> > Or QEMU will not pass this flag to mmap(2)
>> 
>> OK this is in a good shape. As we are in freeze anyway,
>> there's still a bit more time to polish it. I have a couple of
>> suggestions:
>> 
>> - squash docs in same patch with code, no need for two patches
>> - mmap errors are not silently ignored as the doc says,
>>   a warning is produced
>
>Note that a warning is produced only if both share=on and pmem=on
>is specified.  If using pmem=on without share=on, no warning is
>printed at all.
>
>I agree we could squash the docs in the same patch, but I don't
>want to prevent the code from being merged and require v15 to be
>sent just because we are still polishing the documentation.
>
>If there are no objections, I plan to apply this version of the
>series including the following fixup (just removing the word
>"silently"), and I suggest further improvements to be sent as
>follow up patches.
>

If my understanding is correct, the following up patch is:

"send the warnings to an errp object and not stderr"

-- 
Wei Yang
Help you, Help me

Re: [Qemu-devel] [PATCH v3 5/7] hw/ppc: Implement fw_cfg_arch_key_name()

2019-04-22 Thread David Gibson

On Mon, Apr 22, 2019 at 09:50:18PM +0200, Philippe Mathieu-Daudé wrote:
> Implement fw_cfg_arch_key_name(), which returns the name of a
> ppc-specific key.
> 
> Signed-off-by: Philippe Mathieu-Daudé 

What ppc machines actually use fw_cfg?  I know pseries and powernv
don't.

> ---
>  hw/ppc/Makefile.objs |  2 +-
>  hw/ppc/fw_cfg.c  | 45 
>  2 files changed, 46 insertions(+), 1 deletion(-)
>  create mode 100644 hw/ppc/fw_cfg.c
> 
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index b218a04..ae940981553 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -1,5 +1,5 @@
>  # shared objects
> -obj-y += ppc.o ppc_booke.o fdt.o
> +obj-y += ppc.o ppc_booke.o fdt.o fw_cfg.o
>  # IBM pSeries (sPAPR)
>  obj-$(CONFIG_PSERIES) += spapr.o spapr_caps.o spapr_vio.o spapr_events.o
>  obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
> diff --git a/hw/ppc/fw_cfg.c b/hw/ppc/fw_cfg.c
> new file mode 100644
> index 000..a88b5c4bde2
> --- /dev/null
> +++ b/hw/ppc/fw_cfg.c
> @@ -0,0 +1,45 @@
> +/*
> + * fw_cfg helpers (PPC specific)
> + *
> + * Copyright (c) 2019 Red Hat, Inc.
> + *
> + * Author:
> + *   Philippe Mathieu-Daudé 
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "hw/ppc/ppc.h"
> +#include "hw/nvram/fw_cfg.h"
> +
> +const char *fw_cfg_arch_key_name(uint16_t key)
> +{
> +static const struct {
> +uint16_t key;
> +const char *name;
> +} fw_cfg_arch_wellknown_keys[] = {
> +{FW_CFG_PPC_WIDTH, "width"},
> +{FW_CFG_PPC_HEIGHT, "height"},
> +{FW_CFG_PPC_DEPTH, "depth"},
> +{FW_CFG_PPC_TBFREQ, "tbfreq"},
> +{FW_CFG_PPC_CLOCKFREQ, "clockfreq"},
> +{FW_CFG_PPC_IS_KVM, "is_kvm"},
> +{FW_CFG_PPC_KVM_HC, "kvm_hc"},
> +{FW_CFG_PPC_KVM_PID, "pid"},
> +{FW_CFG_PPC_NVRAM_ADDR, "nvram_addr"},
> +{FW_CFG_PPC_BUSFREQ, "busfreq"},
> +{FW_CFG_PPC_NVRAM_FLAT, "nvram_flat"},
> +{FW_CFG_PPC_VIACONFIG, "viaconfig"},
> +};
> +
> +for (size_t i = 0; i < ARRAY_SIZE(fw_cfg_arch_wellknown_keys); i++) {
> +if (fw_cfg_arch_wellknown_keys[i].key == key) {
> +return fw_cfg_arch_wellknown_keys[i].name;
> +}
> +}
> +return NULL;
> +}

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

Re: [Qemu-devel] [PATCH 1/2] vfio/mdev: add version field as mandatory attribute for mdev device

2019-04-22 Thread Alex Williamson

On Mon, 22 Apr 2019 21:01:52 -0400
Yan Zhao  wrote:

> On Mon, Apr 22, 2019 at 10:39:50PM +0800, Alex Williamson wrote:
> > On Fri, 19 Apr 2019 04:35:04 -0400
> > Yan Zhao  wrote:
> >   
> > > device version attribute in mdev sysfs is used by user space software
> > > (e.g. libvirt) to query device compatibility for live migration of VFIO
> > > mdev devices. This attribute is mandatory if a mdev device supports live
> > > migration.  
> > 
> > The Subject: doesn't quite match what's being proposed here.
> >
> > > It consists of two parts: common part and vendor proprietary part.
> > > common part: 32 bit. lower 16 bits is vendor id and higher 16 bits
> > >  identifies device type. e.g., for pci device, it is
> > >  "pci vendor id" | (VFIO_DEVICE_FLAGS_PCI << 16).  
> > 
> > What purpose does this serve?  If it's intended as some sort of
> > namespace feature, shouldn't we first assume that we can only support
> > migration to devices of the same type?  Therefore each type would
> > already have its own namespace.  Also that would make the trailing bit
> > of the version string listed below in the example redundant.  A vendor
> > is still welcome to include this in their version string if they wish,
> > but I think the string should be entirely vendor defined.
> >  
> hi Alex,
> This common part is a kind of namespace.
> Because if version string is entirely defined by vendors, I'm worried about
> if there is a case that one vendor's version string happens to deceive and
> interfere with another vendor's version checking?
> e.g.
> vendor A has a version string like: vendor id + device id + mdev type
> vendor B has a version string like: device id + vendor id + mdev type
> but vendor A's vendor id is 0x8086, device id is 0x1217
> vendor B's vendor id is 0x1217, device id is 0x8086.
> 
> In this corner case, the two vendors may regard the two device is
> migratable but actually they are not.
> 
> That's the reason for this common part that serve as a kind of namespace
> that all vendors will comply with to avoid overlap.

If we assume that migration can only occur between matching mdev types,
this is redundant, each type already has their own namespace.

> > > vendor proprietary part: this part is varied in length. vendor driver can
> > >  specify any string to identify a device.
> > > 
> > > When reading this attribute, it should show device version string of the
> > > device of type . If a device does not support live migration, it
> > > should return errno.
> > > When writing a string to this attribute, it returns errno for
> > > incompatibility or returns written string length in compatibility case.
> > > If a device does not support live migration, it always returns errno.
> > > 
> > > For user space software to use:
> > > 1.
> > > Before starting live migration, user space software first reads source 
> > > side
> > > mdev device's version. e.g.
> > > "#cat \
> > > /sys/bus/pci/devices/\:00\:02.0/5ac1fb20-2bbf-4842-bb7e-36c58c3be9cd/mdev_type/version"
> > > 00028086-193b-i915-GVTg_V5_4
> > > 
> > > 2.
> > > Then, user space software writes the source side returned version string
> > > to device version attribute in target side, and checks the return value.
> > > If a negative errno is returned in the target side, then mdev devices in
> > > source and target sides are not compatible;
> > > If a positive number is returned and it equals to the length of written
> > > string, then the two mdev devices in source and target side are 
> > > compatible.
> > > e.g.
> > > (a) compatibility case
> > > "# echo 00028086-193b-i915-GVTg_V5_4 >
> > > /sys/bus/pci/devices/\:00\:02.0/882cc4da-dede-11e7-9180-078a62063ab1/mdev_type/version"
> > > 
> > > (b) incompatibility case
> > > "#echo 00028086-193b-i915-GVTg_V5_1 >
> > > /sys/bus/pci/devices/\:00\:02.0/882cc4da-dede-11e7-9180-078a62063ab1/mdev_type/version"
> > > -bash: echo: write error: Invalid argument
> > > 
> > > 3. if two mdev devices are compatible, user space software can start
> > > live migration, and vice versa.
> > > 
> > > Note: if a mdev device does not support live migration, it either does
> > > not provide a version attribute, or always returns errno when its version
> > > attribute is read/written.  
> > 
> > I think it would be cleaner to do the former, not supply the
> > attribute.  This seems to do the latter in the sample drivers.  Thanks,  
> Ok. you are right!
> what about just keep one sample driver to show how to do the latter,
> and let the others do the former?

I'd rather that if a vendor driver doesn't support features requiring
the version attribute that they don't implement it.  It's confusing to
developers looking at the sample driver for guidance if we have
different implementations.  Of course if you'd like to add migration
support to one of the sample drivers, that'd be very welcome.  Thanks,

Alex

> > > Cc: Alex Williamson 
> > > Cc: Erik Skultety 
> > > Cc: "Dr. David Alan Gil

[Qemu-devel] [Bug 1805256] Re: qemu-img hangs on high core count ARM system

2019-04-22 Thread 贞贵李

I can reproduce this problem with qemu.git/matser. It still exists in 
qemu.git/matser. I found that when an IO return in worker threads and want to 
call aio_notify to wake up main_loop, but it found that ctx->notify_me is 
cleared to 0 by main_loop in aio_ctx_check by calling 
atomic_and(&ctx->notify_me, ~1) . So worker thread won't write enventfd to 
notify main_loop.If such a scene happens, the main_loop will hang:
main loopworker thread1  worker 
thread2
---
qemu_poll_nsaio_worker
qemu_bh_schedule(pool->completion_bh)
glib_pollfds_poll
g_main_context_check
aio_ctx_check
atomic_and(&ctx->notify_me, ~1)aio_worker
  
qemu_bh_schedule(pool->completion_bh)
/* do something for event */
qemu_poll_ns
/* hangs !!!*/

As we known, ctx->notify_me will be visited by worker thread and main
loop. I thank we should add a lock protection for ctx->notify_me to
avoid this happend.what do you thank so?

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1805256

Title:
  qemu-img hangs on high core count ARM system

Status in QEMU:
  Confirmed

Bug description:
  On the HiSilicon D06 system - a 96 core NUMA arm64 box - qemu-img
  frequently hangs (~50% of the time) with this command:

  qemu-img convert -f qcow2 -O qcow2 /tmp/cloudimg /tmp/cloudimg2

  Where "cloudimg" is a standard qcow2 Ubuntu cloud image. This
  qcow2->qcow2 conversion happens to be something uvtool does every time
  it fetches images.

  Once hung, attaching gdb gives the following backtrace:

  (gdb) bt
  #0  0xae4f8154 in __GI_ppoll (fds=0xe8a67dc0, 
nfds=187650274213760, 
  timeout=, timeout@entry=0x0, sigmask=0xc123b950)
  at ../sysdeps/unix/sysv/linux/ppoll.c:39
  #1  0xbbefaf00 in ppoll (__ss=0x0, __timeout=0x0, __nfds=, 
  __fds=) at /usr/include/aarch64-linux-gnu/bits/poll2.h:77
  #2  qemu_poll_ns (fds=, nfds=, 
  timeout=timeout@entry=-1) at util/qemu-timer.c:322
  #3  0xbbefbf80 in os_host_main_loop_wait (timeout=-1)
  at util/main-loop.c:233
  #4  main_loop_wait (nonblocking=) at util/main-loop.c:497
  #5  0xbbe2aa30 in convert_do_copy (s=0xc123bb58) at 
qemu-img.c:1980
  #6  img_convert (argc=, argv=) at 
qemu-img.c:2456
  #7  0xbbe2333c in main (argc=7, argv=) at 
qemu-img.c:4975

  Reproduced w/ latest QEMU git (@ 53744e0a182)

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1805256/+subscriptions

Re: [Qemu-devel] [PATCH 1/2] vfio/mdev: add version field as mandatory attribute for mdev device

2019-04-22 Thread Yan Zhao

On Mon, Apr 22, 2019 at 10:39:50PM +0800, Alex Williamson wrote:
> On Fri, 19 Apr 2019 04:35:04 -0400
> Yan Zhao  wrote:
> 
> > device version attribute in mdev sysfs is used by user space software
> > (e.g. libvirt) to query device compatibility for live migration of VFIO
> > mdev devices. This attribute is mandatory if a mdev device supports live
> > migration.
> 
> The Subject: doesn't quite match what's being proposed here.
>  
> > It consists of two parts: common part and vendor proprietary part.
> > common part: 32 bit. lower 16 bits is vendor id and higher 16 bits
> >  identifies device type. e.g., for pci device, it is
> >  "pci vendor id" | (VFIO_DEVICE_FLAGS_PCI << 16).
> 
> What purpose does this serve?  If it's intended as some sort of
> namespace feature, shouldn't we first assume that we can only support
> migration to devices of the same type?  Therefore each type would
> already have its own namespace.  Also that would make the trailing bit
> of the version string listed below in the example redundant.  A vendor
> is still welcome to include this in their version string if they wish,
> but I think the string should be entirely vendor defined.
>
hi Alex,
This common part is a kind of namespace.
Because if version string is entirely defined by vendors, I'm worried about
if there is a case that one vendor's version string happens to deceive and
interfere with another vendor's version checking?
e.g.
vendor A has a version string like: vendor id + device id + mdev type
vendor B has a version string like: device id + vendor id + mdev type
but vendor A's vendor id is 0x8086, device id is 0x1217
vendor B's vendor id is 0x1217, device id is 0x8086.

In this corner case, the two vendors may regard the two device is
migratable but actually they are not.

That's the reason for this common part that serve as a kind of namespace
that all vendors will comply with to avoid overlap.

> > vendor proprietary part: this part is varied in length. vendor driver can
> >  specify any string to identify a device.
> > 
> > When reading this attribute, it should show device version string of the
> > device of type . If a device does not support live migration, it
> > should return errno.
> > When writing a string to this attribute, it returns errno for
> > incompatibility or returns written string length in compatibility case.
> > If a device does not support live migration, it always returns errno.
> > 
> > For user space software to use:
> > 1.
> > Before starting live migration, user space software first reads source side
> > mdev device's version. e.g.
> > "#cat \
> > /sys/bus/pci/devices/\:00\:02.0/5ac1fb20-2bbf-4842-bb7e-36c58c3be9cd/mdev_type/version"
> > 00028086-193b-i915-GVTg_V5_4
> > 
> > 2.
> > Then, user space software writes the source side returned version string
> > to device version attribute in target side, and checks the return value.
> > If a negative errno is returned in the target side, then mdev devices in
> > source and target sides are not compatible;
> > If a positive number is returned and it equals to the length of written
> > string, then the two mdev devices in source and target side are compatible.
> > e.g.
> > (a) compatibility case
> > "# echo 00028086-193b-i915-GVTg_V5_4 >
> > /sys/bus/pci/devices/\:00\:02.0/882cc4da-dede-11e7-9180-078a62063ab1/mdev_type/version"
> > 
> > (b) incompatibility case
> > "#echo 00028086-193b-i915-GVTg_V5_1 >
> > /sys/bus/pci/devices/\:00\:02.0/882cc4da-dede-11e7-9180-078a62063ab1/mdev_type/version"
> > -bash: echo: write error: Invalid argument
> > 
> > 3. if two mdev devices are compatible, user space software can start
> > live migration, and vice versa.
> > 
> > Note: if a mdev device does not support live migration, it either does
> > not provide a version attribute, or always returns errno when its version
> > attribute is read/written.
> 
> I think it would be cleaner to do the former, not supply the
> attribute.  This seems to do the latter in the sample drivers.  Thanks,
Ok. you are right!
what about just keep one sample driver to show how to do the latter,
and let the others do the former?

Thanks!
Yan


> > Cc: Alex Williamson 
> > Cc: Erik Skultety 
> > Cc: "Dr. David Alan Gilbert" 
> > Cc: Cornelia Huck 
> > Cc: "Tian, Kevin" 
> > Cc: Zhenyu Wang 
> > Cc: "Wang, Zhi A" 
> > Cc: Neo Jia 
> > Cc: Kirti Wankhede 
> > 
> > Signed-off-by: Yan Zhao 
> > ---
> >  Documentation/vfio-mediated-device.txt | 36 ++
> >  samples/vfio-mdev/mbochs.c | 17 
> >  samples/vfio-mdev/mdpy.c   | 16 
> >  samples/vfio-mdev/mtty.c   | 16 
> >  4 files changed, 85 insertions(+)
> > 
> > diff --git a/Documentation/vfio-mediated-device.txt 
> > b/Documentation/vfio-mediated-device.txt
> > index c3f69bcaf96e..bc28471c0667 100644
> > --- a/Documentation/vfio-mediated-device.txt
> > +++ b/Documentation/vfio-mediated-device.txt
> >

Re: [Qemu-devel] [PATCH 2/2] drm/i915/gvt: export mdev device version to sysfs for Intel vGPU

2019-04-22 Thread Yan Zhao

On Mon, Apr 22, 2019 at 04:37:39PM +0800, Zhenyu Wang wrote:
> On 2019.04.19 04:35:59 -0400, Yan Zhao wrote:
> > This feature implements the version attribute for Intel's vGPU mdev
> > devices.
> > 
> > version attribute is rw. It is queried by userspace software like libvirt
> > to check whether two vGPUs are compatible for live migration.
> > 
> > It consists of two parts: common part and vendor proprietary part.
> > common part: 32 bit. lower 16 bits is vendor id and higher 16 bits
> >  identifies device type. e.g., for pci device, it is
> >  "pci vendor id" | (VFIO_DEVICE_FLAGS_PCI << 16).
> > vendor proprietary part: this part is varied in length. vendor driver can
> >  specify any string to identify a device.
> > 
> > For Intel vGPU of gen8 and gen9, the vendor proprietary part currently
> > consists of 2 fields: "device id" + "mdev type".
> > 
> > Reading from a vGPU's version attribute, a string is returned in below
> > format: 00028086--. e.g.
> > 00028086-193b-i915-GVTg_V5_2.
> > 
> > Writing a string to a vGPU's version attribute will trigger GVT to check
> > whether a vGPU identified by the written string is compatible with
> > current vGPU owning this version attribute. errno is returned if the two
> > vGPUs are incompatible. The length of written string is returned in
> > compatible case.
> > 
> > For other platforms, and for GVT not supporting vGPU live migration
> > feature, errnos are returned when read/write of mdev devices' version
> > attributes.
> > 
> > For old GVT versions where no version attributes exposed in sysfs, it is
> > regarded as not supporting vGPU live migration.
> > 
> > For future platforms, besides the current 2 fields in vendor proprietary
> > part, more fields may be added to identify Intel vGPU well for live
> > migration purpose.
> > 
> > Cc: Alex Williamson 
> > Cc: Erik Skultety 
> > Cc: "Dr. David Alan Gilbert" 
> > Cc: Cornelia Huck 
> > Cc: "Tian, Kevin" 
> > Cc: Zhenyu Wang 
> > Cc: "Wang, Zhi A" 
> > c: Neo Jia 
> > Cc: Kirti Wankhede 
> > 
> > Signed-off-by: Yan Zhao 
> > ---
> >  drivers/gpu/drm/i915/gvt/Makefile |  2 +-
> >  drivers/gpu/drm/i915/gvt/device_version.c | 94 +++
> >  drivers/gpu/drm/i915/gvt/gvt.c| 55 +
> >  drivers/gpu/drm/i915/gvt/gvt.h|  6 ++
> >  4 files changed, 156 insertions(+), 1 deletion(-)
> >  create mode 100644 drivers/gpu/drm/i915/gvt/device_version.c
> > 
> > diff --git a/drivers/gpu/drm/i915/gvt/Makefile 
> > b/drivers/gpu/drm/i915/gvt/Makefile
> > index 271fb46d4dd0..54e209a23899 100644
> > --- a/drivers/gpu/drm/i915/gvt/Makefile
> > +++ b/drivers/gpu/drm/i915/gvt/Makefile
> > @@ -3,7 +3,7 @@ GVT_DIR := gvt
> >  GVT_SOURCE := gvt.o aperture_gm.o handlers.o vgpu.o trace_points.o 
> > firmware.o \
> > interrupt.o gtt.o cfg_space.o opregion.o mmio.o display.o edid.o \
> > execlist.o scheduler.o sched_policy.o mmio_context.o cmd_parser.o 
> > debugfs.o \
> > -   fb_decoder.o dmabuf.o page_track.o
> > +   fb_decoder.o dmabuf.o page_track.o device_version.o
> >  
> >  ccflags-y  += -I$(src) -I$(src)/$(GVT_DIR)
> >  i915-y += $(addprefix $(GVT_DIR)/, 
> > $(GVT_SOURCE))
> > diff --git a/drivers/gpu/drm/i915/gvt/device_version.c 
> > b/drivers/gpu/drm/i915/gvt/device_version.c
> > new file mode 100644
> > index ..c64010d2bc54
> > --- /dev/null
> > +++ b/drivers/gpu/drm/i915/gvt/device_version.c
> > @@ -0,0 +1,94 @@
> > +/*
> > + * Copyright(c) 2011-2017 Intel Corporation. All rights reserved.
> > + *
> > + * Permission is hereby granted, free of charge, to any person obtaining a
> > + * copy of this software and associated documentation files (the 
> > "Software"),
> > + * to deal in the Software without restriction, including without 
> > limitation
> > + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> > + * and/or sell copies of the Software, and to permit persons to whom the
> > + * Software is furnished to do so, subject to the following conditions:
> > + *
> > + * The above copyright notice and this permission notice (including the 
> > next
> > + * paragraph) shall be included in all copies or substantial portions of 
> > the
> > + * Software.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 
> > OR
> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 
> > OTHER
> > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
> > FROM,
> > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
> > IN THE
> > + * SOFTWARE.
> > + */
> > +#include 
> > +#include "i915_drv.h"
> > +
> > +#define GVT_VFIO_DEVICE_VENDOR_ID ((0x8086) |  \
> > +   ((VFIO_DEVICE_FL

[Qemu-devel] [PATCH 0/2] i386: "unavailable-features" QOM property

2019-04-22 Thread Eduardo Habkost

Currently, libvirt uses the "filtered-features" QOM property at
runtime to ensure no feature was accidentally disabled on VCPUs
because it's not available on the host.

However, the code for "feature-words" assumes that all missing
features have a corresponding CPUID bit, which is not true for
MSR-based features like the ones at FEAT_ARCH_CAPABILITIES.

We could extend X86CPUFeatureWordInfo to include information
about MSR features, but it's impossible to do that while keeping
compatibility with clients that (reasonably) expect all elements
of "filtered-features" to have the cpuid-* fields.

We have a field in "query-cpu-definitions" that already describes
all features that are missing on a CPU, including MSR features:
CpuDefinitionInfo.unavailable-features.  The existing code for
building the unavailable-features array even uses
X86CPU::filtered_features to build the feature list.

This series adds a "unavailable-features" QOM property to X86CPU
objects that have the same semantics of "unavailable-features" on
query-cpu-definitions.  The new property has the same goal of
"filtered-features", but is generic enough to let any kind of CPU
feature to be listed there without relying on low level details
like CPUID leaves or MSR numbers.

Eduardo Habkost (2):
  i386: x86_cpu_list_feature_names() function
  i386: "unavailable-features" QOM property

 target/i386/cpu.c | 55 ---
 1 file changed, 42 insertions(+), 13 deletions(-)

-- 
2.18.0.rc1.1.g3f1ff2140

[Qemu-devel] [PATCH 1/2] i386: x86_cpu_list_feature_names() function

2019-04-22 Thread Eduardo Habkost

Extract feature name listing code from
x86_cpu_class_check_missing_features().  It will be reused to
return information about CPU filtered features at runtime.

Signed-off-by: Eduardo Habkost 
---
 target/i386/cpu.c | 35 ++-
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index d6bb57d210..23e55a9ef2 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -3619,6 +3619,27 @@ static void x86_cpu_parse_featurestr(const char 
*typename, char *features,
 static void x86_cpu_expand_features(X86CPU *cpu, Error **errp);
 static int x86_cpu_filter_features(X86CPU *cpu);
 
+/* Build a list with the name of all features on a feature word array */
+static void x86_cpu_list_feature_names(FeatureWordArray features,
+   strList **feat_names)
+{
+FeatureWord w;
+strList **next = feat_names;
+
+for (w = 0; w < FEATURE_WORDS; w++) {
+uint32_t filtered = features[w];
+int i;
+for (i = 0; i < 32; i++) {
+if (filtered & (1UL << i)) {
+strList *new = g_new0(strList, 1);
+new->value = g_strdup(x86_cpu_feature_name(w, i));
+*next = new;
+next = &new->next;
+}
+}
+}
+}
+
 /* Check for missing features that may prevent the CPU class from
  * running using the current machine and accelerator.
  */
@@ -3626,7 +3647,6 @@ static void 
x86_cpu_class_check_missing_features(X86CPUClass *xcc,
  strList **missing_feats)
 {
 X86CPU *xc;
-FeatureWord w;
 Error *err = NULL;
 strList **next = missing_feats;
 
@@ -3653,18 +3673,7 @@ static void 
x86_cpu_class_check_missing_features(X86CPUClass *xcc,
 
 x86_cpu_filter_features(xc);
 
-for (w = 0; w < FEATURE_WORDS; w++) {
-uint32_t filtered = xc->filtered_features[w];
-int i;
-for (i = 0; i < 32; i++) {
-if (filtered & (1UL << i)) {
-strList *new = g_new0(strList, 1);
-new->value = g_strdup(x86_cpu_feature_name(w, i));
-*next = new;
-next = &new->next;
-}
-}
-}
+x86_cpu_list_feature_names(xc->filtered_features, next);
 
 object_unref(OBJECT(xc));
 }
-- 
2.18.0.rc1.1.g3f1ff2140

[Qemu-devel] [PATCH 2/2] i386: "unavailable-features" QOM property

2019-04-22 Thread Eduardo Habkost

Add a "unavailable-features" QOM property to X86CPU objects that
have the same semantics of "unavailable-features" on
query-cpu-definitions.  The new property has the same goal of
"filtered-features", but is generic enough to let any kind of CPU
feature to be listed there without relying on low level details
like CPUID leaves or MSR numbers.

Signed-off-by: Eduardo Habkost 
---
 target/i386/cpu.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 23e55a9ef2..042d09a1bb 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -3640,6 +3640,17 @@ static void x86_cpu_list_feature_names(FeatureWordArray 
features,
 }
 }
 
+static void x86_cpu_get_unavailable_features(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+X86CPU *xc = X86_CPU(obj);
+strList *result = NULL;
+
+x86_cpu_list_feature_names(xc->filtered_features, &result);
+visit_type_strList(v, "unavailable-features", &result, errp);
+}
+
 /* Check for missing features that may prevent the CPU class from
  * running using the current machine and accelerator.
  */
@@ -5580,6 +5591,15 @@ static void x86_cpu_initfn(Object *obj)
 object_property_add(obj, "filtered-features", "X86CPUFeatureWordInfo",
 x86_cpu_get_feature_words,
 NULL, NULL, (void *)cpu->filtered_features, NULL);
+/*
+ * The "unavailable-features" property has the same semantics as
+ * CpuDefinitionInfo.unavailable-features on the "query-cpu-definitions"
+ * QMP command: they list the features that would have prevented the
+ * CPU from running if the "enforce" flag was set.
+ */
+object_property_add(obj, "unavailable-features", "strList",
+x86_cpu_get_unavailable_features,
+NULL, NULL, NULL, &error_abort);
 
 object_property_add(obj, "crash-information", "GuestPanicInformation",
 x86_cpu_get_crash_info_qom, NULL, NULL, NULL, NULL);
-- 
2.18.0.rc1.1.g3f1ff2140

[Qemu-devel] [PATCH v2 0/2] docs/interop/bitmaps: rewrite and modernize doc

2019-04-22 Thread John Snow

The documentation as written in the 4.0 release is not quite correct as
of the 4.0 release; though thanks to backwards compatibility it's not
quite wrong either. It is suitable for inclusion in 4.0.1, or any
downstream that packages 4.0.

This patch is an attempt at a nearly full rewrite that revitalizes this
document to address frequent questions I encounter when discussing the
API.

V2:
  - Split off makefile change into its own little patch.

  - Addressed (almost) all comments from Vladimir.
- Example styling was not reworked.
  - Renamed all filenames to be consistent throughout the document,
fixing a few more file extensions in the process
  - Reflowed almost all paragraphs to 78 columns for consistency.
  - Changed a few bulleted lists to paragraphs instead
  - Changed the big ascii text diagram to a more compact version
and clarified its language
  - While in general I prefer to use en_US spellings, I have
replaced "canceled" with "cancelled" so it does not look
out of place juxtaposed with BLOCK_JOB_CANCELLED...

Future work that remains to be done:

- Paying heed to blockdev-backup workflows
- Adding a small migration section explaining the options there
- Possibly reworking some of the sections that all make their own
  repeated attempts to clarify time interval discretization and adding
  a proper treatment of the idea start to finish; this may be neccessary
  to explain the two below items properly:
- Differential Backups
- Pull Backups and NBD

John Snow (2):
  Makefile: add nit-picky mode to sphinx-build
  docs/interop/bitmaps: rewrite and modernize doc

 docs/interop/bitmaps.rst | 1600 ++
 Makefile |2 +-
 2 files changed, 1266 insertions(+), 336 deletions(-)

-- 
2.20.1

Re: [Qemu-devel] [PATCH 3/3] accel: Remove unused AccelClass::available field

2019-04-22 Thread Philippe Mathieu-Daudé

On 4/22/19 11:04 PM, Eduardo Habkost wrote:
> The field is not used anymore, we can remove it.
> 
> Signed-off-by: Eduardo Habkost 

Reviewed-by: Philippe Mathieu-Daudé 
mingw64:
Tested-by: Philippe Mathieu-Daudé 

> ---
>  include/sysemu/accel.h | 1 -
>  accel/accel.c  | 5 -
>  2 files changed, 6 deletions(-)
> 
> diff --git a/include/sysemu/accel.h b/include/sysemu/accel.h
> index 5565e00a96..70e9e2f2a1 100644
> --- a/include/sysemu/accel.h
> +++ b/include/sysemu/accel.h
> @@ -38,7 +38,6 @@ typedef struct AccelClass {
>  
>  const char *opt_name;
>  const char *name;
> -int (*available)(void);
>  int (*init_machine)(MachineState *ms);
>  void (*setup_post)(MachineState *ms, AccelState *accel);
>  bool *allowed;
> diff --git a/accel/accel.c b/accel/accel.c
> index 454fef9d92..5fa31717b4 100644
> --- a/accel/accel.c
> +++ b/accel/accel.c
> @@ -107,11 +107,6 @@ void configure_accelerator(MachineState *ms, const char 
> *progname)
>  if (!acc) {
>  continue;
>  }
> -if (acc->available && !acc->available()) {
> -printf("%s not supported for this target\n",
> -   acc->name);
> -continue;
> -}
>  ret = accel_init_machine(acc, ms);
>  if (ret < 0) {
>  init_failed = true;
>

Re: [Qemu-devel] [PATCH 2/3] qtest: Don't compile qtest accel on non-POSIX systems

2019-04-22 Thread Philippe Mathieu-Daudé

On 4/22/19 11:04 PM, Eduardo Habkost wrote:
> qtest_available() will always return 0 on non-POSIX systems.
> It's simpler to just not compile the accelerator code on those
> systems instead of relying on the AccelClass::available function.
> 
> Signed-off-by: Eduardo Habkost 

Reviewed-by: Philippe Mathieu-Daudé 
mingw64:
Tested-by: Philippe Mathieu-Daudé 

> ---
>  include/sysemu/qtest.h | 9 -
>  accel/qtest.c  | 1 -
>  accel/Makefile.objs| 2 +-
>  3 files changed, 1 insertion(+), 11 deletions(-)
> 
> diff --git a/include/sysemu/qtest.h b/include/sysemu/qtest.h
> index 70aa40aa72..096ddfc20c 100644
> --- a/include/sysemu/qtest.h
> +++ b/include/sysemu/qtest.h
> @@ -27,13 +27,4 @@ bool qtest_driver(void);
>  
>  void qtest_init(const char *qtest_chrdev, const char *qtest_log, Error 
> **errp);
>  
> -static inline int qtest_available(void)
> -{
> -#ifdef CONFIG_POSIX
> -return 1;
> -#else
> -return 0;
> -#endif
> -}
> -
>  #endif
> diff --git a/accel/qtest.c b/accel/qtest.c
> index a02b3c26c7..5b88f55921 100644
> --- a/accel/qtest.c
> +++ b/accel/qtest.c
> @@ -34,7 +34,6 @@ static void qtest_accel_class_init(ObjectClass *oc, void 
> *data)
>  {
>  AccelClass *ac = ACCEL_CLASS(oc);
>  ac->name = "QTest";
> -ac->available = qtest_available;
>  ac->init_machine = qtest_init_accel;
>  ac->allowed = &qtest_allowed;
>  }
> diff --git a/accel/Makefile.objs b/accel/Makefile.objs
> index 2a5ed46940..8b498d39d8 100644
> --- a/accel/Makefile.objs
> +++ b/accel/Makefile.objs
> @@ -1,5 +1,5 @@
>  obj-$(CONFIG_SOFTMMU) += accel.o
> -obj-$(CONFIG_SOFTMMU) += qtest.o
> +obj-$(call land,$(CONFIG_SOFTMMU),$(CONFIG_POSIX)) += qtest.o
>  obj-$(CONFIG_KVM) += kvm/
>  obj-$(CONFIG_TCG) += tcg/
>  obj-y += stubs/
>

[Qemu-devel] [PATCH v2 1/2] Makefile: add nit-picky mode to sphinx-build

2019-04-22 Thread John Snow

If we add references that don't resolve (or accidentally remove them),
it will be helpful to have an error message alerting us to that.

Signed-off-by: John Snow 
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 04a0d45050..ff9ce2ed4c 100644
--- a/Makefile
+++ b/Makefile
@@ -899,7 +899,7 @@ docs/version.texi: $(SRC_PATH)/VERSION
 sphinxdocs: $(MANUAL_BUILDDIR)/devel/index.html 
$(MANUAL_BUILDDIR)/interop/index.html
 
 # Canned command to build a single manual
-build-manual = $(call quiet-command,sphinx-build $(if $(V),,-q) -b html -D 
version=$(VERSION) -D release="$(FULL_VERSION)" -d .doctrees/$1 
$(SRC_PATH)/docs/$1 $(MANUAL_BUILDDIR)/$1 ,"SPHINX","$(MANUAL_BUILDDIR)/$1")
+build-manual = $(call quiet-command,sphinx-build $(if $(V),,-q) -n -b html -D 
version=$(VERSION) -D release="$(FULL_VERSION)" -d .doctrees/$1 
$(SRC_PATH)/docs/$1 $(MANUAL_BUILDDIR)/$1 ,"SPHINX","$(MANUAL_BUILDDIR)/$1")
 # We assume all RST files in the manual's directory are used in it
 manual-deps = $(wildcard $(SRC_PATH)/docs/$1/*.rst) 
$(SRC_PATH)/docs/$1/conf.py $(SRC_PATH)/docs/conf.py
 
-- 
2.20.1

[Qemu-devel] [PATCH v2 2/2] docs/interop/bitmaps: rewrite and modernize doc

2019-04-22 Thread John Snow

This just about rewrites the entirety of the bitmaps.rst document to
make it consistent with the 4.0 release. I have added new features seen
in the 4.0 release, as well as tried to clarify some points that keep
coming up when discussing this feature both in-house and upstream.

Yes, it's a lot longer, mostly due to examples. I get a bit chatty.
I could use a good editor to help reign in my chattiness.

It does not yet cover pull backups or migration details, but I intend to
keep extending this document to cover those cases.

Please try compiling it with sphinx and look at the rendered output, I
don't have hosting to share my copy at present. I think this new layout
reads nicer in the HTML format than the old one did, at the expense of
looking less readable in the source tree itself (though not completely
unmanagable. We did decide to convert it from Markdown to ReST, after
all, so I am going all-in on ReST.)

Signed-off-by: John Snow 
---
 docs/interop/bitmaps.rst | 1600 ++
 1 file changed, 1265 insertions(+), 335 deletions(-)

diff --git a/docs/interop/bitmaps.rst b/docs/interop/bitmaps.rst
index 7bcfe7f461..becff537d8 100644
--- a/docs/interop/bitmaps.rst
+++ b/docs/interop/bitmaps.rst
@@ -1,5 +1,5 @@
 ..
-   Copyright 2015 John Snow  and Red Hat, Inc.
+   Copyright 2019 John Snow  and Red Hat, Inc.
All rights reserved.
 
This file is licensed via The FreeBSD Documentation License, the full
@@ -9,547 +9,1477 @@
 Dirty Bitmaps and Incremental Backup
 
 
--  Dirty Bitmaps are objects that track which data needs to be backed up
-   for the next incremental backup.
+Dirty Bitmaps are in-memory objects that track writes to block devices. They
+can be used in conjunction with various block job operations to perform
+incremental or differential backup regimens.
 
--  Dirty bitmaps can be created at any time and attached to any node
-   (not just complete drives).
+This document explains the conceptual mechanisms, as well as up-to-date,
+complete and comprehensive documentation on the API to manipulate them.
+(Hopefully, the "why", "what", and "how".)
+
+The intended audience for this document is developers who are adding QEMU
+backup features to management applications, or power users who run and
+administer QEMU directly via QMP.
 
 .. contents::
 
+Overview
+
+
+Bitmaps are bit vectors where each '1' bit in the vector indicates a modified
+("dirty") segment of the corresponding block device. The size of the segment
+that is tracked is the granularity of the bitmap. If the granularity of a
+bitmap is 64K, each '1' bit means that a 64K region as a whole may have
+changed in some way, possibly by as little as one byte.
+
+Smaller granularities mean more accurate tracking of modified disk data, but
+requires more computational overhead and larger bitmap sizes. Larger
+granularities mean smaller bitmap sizes, but less targeted backups.
+
+The size of a bitmap (in bytes) can be computed as such:
+``size`` = ceil(ceil(``image_size`` / ``granularity``) / 8)
+
+e.g. the size of a 64KiB granularity bitmap on a 2TiB image is:
+``size`` = ((2147483648K / 64K) / 8)
+ = 4194304B = 4MiB.
+
+QEMU uses these bitmaps when making incremental backups to know which sections
+of the file to copy out. They are not enabled by default and must be
+explicitly added in order to begin tracking writes.
+
+Bitmaps can be created at any time and can be attached to any arbitrary block
+node in the storage graph, but are most useful conceptually when attached to
+the root node attached to the guest's storage device model.
+
+That is to say: It's likely most useful to track the guest's writes to disk,
+but you could theoretically track things like qcow2 metadata changes by
+attaching the bitmap elsewhere in the storage graph. This is beyond the scope
+of this document.
+
+QEMU supports persisting these bitmaps to disk via the qcow2 image format.
+Bitmaps which are stored or loaded in this way are called "persistent",
+whereas bitmaps that are not are called "transient".
+
+QEMU also supports the migration of both transient bitmaps (tracking any
+arbitrary image format) or persistent bitmaps (qcow2) via live migration.
+
+Supported Image Formats
+---
+
+QEMU supports all documented features below on the qcow2 image format.
+
+However, qcow2 is only strictly necessary for the persistence feature, which
+writes bitmap data to disk upon close. If persistence is not required for a
+specific use case, all bitmap features excepting persistence are available for
+any arbitrary image format.
+
+For example, Dirty Bitmaps can be combined with the 'raw' image format, but
+any changes to the bitmap will be discarded upon exit.
+
+.. warning:: Transient bitmaps will not be saved on QEMU exit! Persistent
+ bitmaps are available only on qcow2 images.
+
 Dirty Bitmap Names
 --
 
--  A dirty bitmap's name is u

Re: [Qemu-devel] [PATCH 1/3] qtest: Move accel code to accel/qtest.c

2019-04-22 Thread Philippe Mathieu-Daudé

On 4/23/19 12:08 AM, Eduardo Habkost wrote:
> On Mon, Apr 22, 2019 at 11:56:18PM +0200, Philippe Mathieu-Daudé wrote:
>> Hi Eduardo,
>>
>> On 4/22/19 11:04 PM, Eduardo Habkost wrote:
>>> QTest has two parts: the server (-qtest) and the accelerator
>>> (-machine accel=qtest).  The accelerator depends on CONFIG_POSIX
>>> due to its usage of sigwait(), but the server doesn't.
>>>
>>> Move the accel code to accel/qtest.c.  Later we will disable
>>> compilation of accel/qtest.c on non-POSIX systems.
>>>
> [...]
>>> +#include "qemu/module.h"
>>
>> Why include "qemu/module.h"?
>> I guess you don't need it.
> 
> That's where the type_init() macro is defined.

Oh correct...

I tried to remove it/compile and it worked because it is pulled by:

"sysemu/cpus.h"
 -> "qemu/timer.h"
 -> "qemu-common.h"
 -> "qemu/module.h"

Thanks!

>>> +type_init(qtest_type_init);
> [...]
> 
> 
>> Reviewed-by: Philippe Mathieu-Daudé 
>> Tested-by: Philippe Mathieu-Daudé 
> 
> Thanks!
>

Re: [Qemu-devel] [PATCH 1/3] qtest: Move accel code to accel/qtest.c

2019-04-22 Thread Eduardo Habkost

On Mon, Apr 22, 2019 at 11:56:18PM +0200, Philippe Mathieu-Daudé wrote:
> Hi Eduardo,
> 
> On 4/22/19 11:04 PM, Eduardo Habkost wrote:
> > QTest has two parts: the server (-qtest) and the accelerator
> > (-machine accel=qtest).  The accelerator depends on CONFIG_POSIX
> > due to its usage of sigwait(), but the server doesn't.
> > 
> > Move the accel code to accel/qtest.c.  Later we will disable
> > compilation of accel/qtest.c on non-POSIX systems.
> > 
[...]
> > +#include "qemu/module.h"
> 
> Why include "qemu/module.h"?
> I guess you don't need it.

That's where the type_init() macro is defined.


> > +type_init(qtest_type_init);
[...]


> Reviewed-by: Philippe Mathieu-Daudé 
> Tested-by: Philippe Mathieu-Daudé 

Thanks!

-- 
Eduardo

Re: [Qemu-devel] [PATCH 1/3] qtest: Move accel code to accel/qtest.c

2019-04-22 Thread Philippe Mathieu-Daudé

Hi Eduardo,

On 4/22/19 11:04 PM, Eduardo Habkost wrote:
> QTest has two parts: the server (-qtest) and the accelerator
> (-machine accel=qtest).  The accelerator depends on CONFIG_POSIX
> due to its usage of sigwait(), but the server doesn't.
> 
> Move the accel code to accel/qtest.c.  Later we will disable
> compilation of accel/qtest.c on non-POSIX systems.
> 
> Signed-off-by: Eduardo Habkost 
> ---
>  accel/qtest.c   | 55 +
>  qtest.c | 34 
>  accel/Makefile.objs |  1 +
>  3 files changed, 56 insertions(+), 34 deletions(-)
>  create mode 100644 accel/qtest.c
> 
> diff --git a/accel/qtest.c b/accel/qtest.c
> new file mode 100644
> index 00..a02b3c26c7
> --- /dev/null
> +++ b/accel/qtest.c
> @@ -0,0 +1,55 @@
> +/*
> + * QTest accelerator code
> + *
> + * Copyright IBM, Corp. 2011
> + *
> + * Authors:
> + *  Anthony Liguori   
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qapi/error.h"
> +#include "qemu/module.h"

Why include "qemu/module.h"?
I guess you don't need it.

> +#include "qemu/option.h"
> +#include "qemu/config-file.h"
> +#include "sysemu/accel.h"
> +#include "sysemu/qtest.h"
> +#include "sysemu/cpus.h"
> +
> +static int qtest_init_accel(MachineState *ms)
> +{
> +QemuOpts *opts = qemu_opts_create(qemu_find_opts("icount"), NULL, 0,
> +  &error_abort);
> +qemu_opt_set(opts, "shift", "0", &error_abort);
> +configure_icount(opts, &error_abort);
> +qemu_opts_del(opts);
> +return 0;
> +}
> +
> +static void qtest_accel_class_init(ObjectClass *oc, void *data)
> +{
> +AccelClass *ac = ACCEL_CLASS(oc);
> +ac->name = "QTest";
> +ac->available = qtest_available;
> +ac->init_machine = qtest_init_accel;
> +ac->allowed = &qtest_allowed;
> +}
> +
> +#define TYPE_QTEST_ACCEL ACCEL_CLASS_NAME("qtest")
> +
> +static const TypeInfo qtest_accel_type = {
> +.name = TYPE_QTEST_ACCEL,
> +.parent = TYPE_ACCEL,
> +.class_init = qtest_accel_class_init,
> +};
> +
> +static void qtest_type_init(void)
> +{
> +type_register_static(&qtest_accel_type);
> +}
> +
> +type_init(qtest_type_init);
> diff --git a/qtest.c b/qtest.c
> index 527141785f..f2e2f27778 100644
> --- a/qtest.c
> +++ b/qtest.c
> @@ -749,16 +749,6 @@ static void qtest_event(void *opaque, int event)
>  }
>  }
>  
> -static int qtest_init_accel(MachineState *ms)
> -{
> -QemuOpts *opts = qemu_opts_create(qemu_find_opts("icount"), NULL, 0,
> -  &error_abort);
> -qemu_opt_set(opts, "shift", "0", &error_abort);
> -configure_icount(opts, &error_abort);
> -qemu_opts_del(opts);
> -return 0;
> -}
> -
>  void qtest_init(const char *qtest_chrdev, const char *qtest_log, Error 
> **errp)
>  {
>  Chardev *chr;
> @@ -791,27 +781,3 @@ bool qtest_driver(void)
>  {
>  return qtest_chr.chr != NULL;
>  }
> -
> -static void qtest_accel_class_init(ObjectClass *oc, void *data)
> -{
> -AccelClass *ac = ACCEL_CLASS(oc);
> -ac->name = "QTest";
> -ac->available = qtest_available;
> -ac->init_machine = qtest_init_accel;
> -ac->allowed = &qtest_allowed;
> -}
> -
> -#define TYPE_QTEST_ACCEL ACCEL_CLASS_NAME("qtest")
> -
> -static const TypeInfo qtest_accel_type = {
> -.name = TYPE_QTEST_ACCEL,
> -.parent = TYPE_ACCEL,
> -.class_init = qtest_accel_class_init,
> -};
> -
> -static void qtest_type_init(void)
> -{
> -type_register_static(&qtest_accel_type);
> -}
> -
> -type_init(qtest_type_init);
> diff --git a/accel/Makefile.objs b/accel/Makefile.objs
> index c3718a10c5..2a5ed46940 100644
> --- a/accel/Makefile.objs
> +++ b/accel/Makefile.objs
> @@ -1,4 +1,5 @@
>  obj-$(CONFIG_SOFTMMU) += accel.o
> +obj-$(CONFIG_SOFTMMU) += qtest.o
>  obj-$(CONFIG_KVM) += kvm/
>  obj-$(CONFIG_TCG) += tcg/
>  obj-y += stubs/
> 

Reviewed-by: Philippe Mathieu-Daudé 
Tested-by: Philippe Mathieu-Daudé

Re: [Qemu-devel] [RHEL-8.1 virt 2/2] target/i386: sev: Do not pin the ram device memory region

2019-04-22 Thread Eduardo Habkost

On Tue, Apr 09, 2019 at 08:08:03PM -0400, Gary R Hook wrote:
> BZ: 1667249
> Branch: rhel-8.1.0
> Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1667249
> Upstream Status: 4.0.0-rc1
> Build Info: 
> https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=20980582
> Conflicts: None
> 
> commit cedc0ad539afbbb669dba9e73dfad2915bc1c25b
> Author: Singh, Brijesh 
> Date:   Mon Feb 4 22:23:40 2019 +
> 
> target/i386: sev: Do not pin the ram device memory region
> 
> The RAM device presents a memory region that should be handled
> as an IO region and should not be pinned.
> 
> In the case of the vfio-pci, RAM device represents a MMIO BAR
> and the memory region is not backed by pages hence
> KVM_MEMORY_ENCRYPT_REG_REGION fails to lock the memory range.
> 
> Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1667249
> Cc: Alex Williamson 
> Cc: Paolo Bonzini 
> Signed-off-by: Brijesh Singh 
> Message-Id: <20190204222322.26766-3-brijesh.si...@amd.com>
> Signed-off-by: Paolo Bonzini 
> 
> Cc: Paolo Bonzini  
> Cc: Richard Henderson  
> Cc: Eduardo Habkost  
> Cc: qemu-devel@nongnu.org 

Acked-by: Eduardo Habkost 

-- 
Eduardo

Re: [Qemu-devel] [RHEL-8.1 virt 1/2] memory: Fix the memory region type assignment order

2019-04-22 Thread Eduardo Habkost

On Tue, Apr 09, 2019 at 08:08:02PM -0400, Gary R Hook wrote:
> BZ: 1667249
> Branch: rhel-8.1.0
> Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1667249
> Upstream Status: 4.0.0-rc1
> Build Info: 
> https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=20980582
> Conflicts: None
> 
> commit 2ddb89b00f947f785c9ca6742f28f954e3b75e62
> Author: Singh, Brijesh 
> Date:   Mon Feb 4 22:23:39 2019 +
> 
> memory: Fix the memory region type assignment order
> 
> Currently, a callback registered through the RAMBlock notifier
> is not able to get the memory region type (i.e callback is not
> able to use memory_region_is_ram_device function). This is
> because mr->ram assignment happens _after_ the memory is allocated
> whereas the callback is executed during allocation.
> 
> Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1667249
> Suggested-by: Alex Williamson 
> Cc: Paolo Bonzini 
> Reviewed-by: Alex Williamson 
> Signed-off-by: Brijesh Singh 
> Message-Id: <20190204222322.26766-2-brijesh.si...@amd.com>
> Signed-off-by: Paolo Bonzini 
> 
> Cc: Paolo Bonzini  
> Cc: qemu-devel@nongnu.org 

Faithful backport.

Acked-by: Eduardo Habkost 

-- 
Eduardo

[Qemu-devel] [PATCH 2/3] qtest: Don't compile qtest accel on non-POSIX systems

2019-04-22 Thread Eduardo Habkost

qtest_available() will always return 0 on non-POSIX systems.
It's simpler to just not compile the accelerator code on those
systems instead of relying on the AccelClass::available function.

Signed-off-by: Eduardo Habkost 
---
 include/sysemu/qtest.h | 9 -
 accel/qtest.c  | 1 -
 accel/Makefile.objs| 2 +-
 3 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/include/sysemu/qtest.h b/include/sysemu/qtest.h
index 70aa40aa72..096ddfc20c 100644
--- a/include/sysemu/qtest.h
+++ b/include/sysemu/qtest.h
@@ -27,13 +27,4 @@ bool qtest_driver(void);
 
 void qtest_init(const char *qtest_chrdev, const char *qtest_log, Error **errp);
 
-static inline int qtest_available(void)
-{
-#ifdef CONFIG_POSIX
-return 1;
-#else
-return 0;
-#endif
-}
-
 #endif
diff --git a/accel/qtest.c b/accel/qtest.c
index a02b3c26c7..5b88f55921 100644
--- a/accel/qtest.c
+++ b/accel/qtest.c
@@ -34,7 +34,6 @@ static void qtest_accel_class_init(ObjectClass *oc, void 
*data)
 {
 AccelClass *ac = ACCEL_CLASS(oc);
 ac->name = "QTest";
-ac->available = qtest_available;
 ac->init_machine = qtest_init_accel;
 ac->allowed = &qtest_allowed;
 }
diff --git a/accel/Makefile.objs b/accel/Makefile.objs
index 2a5ed46940..8b498d39d8 100644
--- a/accel/Makefile.objs
+++ b/accel/Makefile.objs
@@ -1,5 +1,5 @@
 obj-$(CONFIG_SOFTMMU) += accel.o
-obj-$(CONFIG_SOFTMMU) += qtest.o
+obj-$(call land,$(CONFIG_SOFTMMU),$(CONFIG_POSIX)) += qtest.o
 obj-$(CONFIG_KVM) += kvm/
 obj-$(CONFIG_TCG) += tcg/
 obj-y += stubs/
-- 
2.18.0.rc1.1.g3f1ff2140

[Qemu-devel] [PATCH 1/3] qtest: Move accel code to accel/qtest.c

2019-04-22 Thread Eduardo Habkost

QTest has two parts: the server (-qtest) and the accelerator
(-machine accel=qtest).  The accelerator depends on CONFIG_POSIX
due to its usage of sigwait(), but the server doesn't.

Move the accel code to accel/qtest.c.  Later we will disable
compilation of accel/qtest.c on non-POSIX systems.

Signed-off-by: Eduardo Habkost 
---
 accel/qtest.c   | 55 +
 qtest.c | 34 
 accel/Makefile.objs |  1 +
 3 files changed, 56 insertions(+), 34 deletions(-)
 create mode 100644 accel/qtest.c

diff --git a/accel/qtest.c b/accel/qtest.c
new file mode 100644
index 00..a02b3c26c7
--- /dev/null
+++ b/accel/qtest.c
@@ -0,0 +1,55 @@
+/*
+ * QTest accelerator code
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ *  Anthony Liguori   
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/module.h"
+#include "qemu/option.h"
+#include "qemu/config-file.h"
+#include "sysemu/accel.h"
+#include "sysemu/qtest.h"
+#include "sysemu/cpus.h"
+
+static int qtest_init_accel(MachineState *ms)
+{
+QemuOpts *opts = qemu_opts_create(qemu_find_opts("icount"), NULL, 0,
+  &error_abort);
+qemu_opt_set(opts, "shift", "0", &error_abort);
+configure_icount(opts, &error_abort);
+qemu_opts_del(opts);
+return 0;
+}
+
+static void qtest_accel_class_init(ObjectClass *oc, void *data)
+{
+AccelClass *ac = ACCEL_CLASS(oc);
+ac->name = "QTest";
+ac->available = qtest_available;
+ac->init_machine = qtest_init_accel;
+ac->allowed = &qtest_allowed;
+}
+
+#define TYPE_QTEST_ACCEL ACCEL_CLASS_NAME("qtest")
+
+static const TypeInfo qtest_accel_type = {
+.name = TYPE_QTEST_ACCEL,
+.parent = TYPE_ACCEL,
+.class_init = qtest_accel_class_init,
+};
+
+static void qtest_type_init(void)
+{
+type_register_static(&qtest_accel_type);
+}
+
+type_init(qtest_type_init);
diff --git a/qtest.c b/qtest.c
index 527141785f..f2e2f27778 100644
--- a/qtest.c
+++ b/qtest.c
@@ -749,16 +749,6 @@ static void qtest_event(void *opaque, int event)
 }
 }
 
-static int qtest_init_accel(MachineState *ms)
-{
-QemuOpts *opts = qemu_opts_create(qemu_find_opts("icount"), NULL, 0,
-  &error_abort);
-qemu_opt_set(opts, "shift", "0", &error_abort);
-configure_icount(opts, &error_abort);
-qemu_opts_del(opts);
-return 0;
-}
-
 void qtest_init(const char *qtest_chrdev, const char *qtest_log, Error **errp)
 {
 Chardev *chr;
@@ -791,27 +781,3 @@ bool qtest_driver(void)
 {
 return qtest_chr.chr != NULL;
 }
-
-static void qtest_accel_class_init(ObjectClass *oc, void *data)
-{
-AccelClass *ac = ACCEL_CLASS(oc);
-ac->name = "QTest";
-ac->available = qtest_available;
-ac->init_machine = qtest_init_accel;
-ac->allowed = &qtest_allowed;
-}
-
-#define TYPE_QTEST_ACCEL ACCEL_CLASS_NAME("qtest")
-
-static const TypeInfo qtest_accel_type = {
-.name = TYPE_QTEST_ACCEL,
-.parent = TYPE_ACCEL,
-.class_init = qtest_accel_class_init,
-};
-
-static void qtest_type_init(void)
-{
-type_register_static(&qtest_accel_type);
-}
-
-type_init(qtest_type_init);
diff --git a/accel/Makefile.objs b/accel/Makefile.objs
index c3718a10c5..2a5ed46940 100644
--- a/accel/Makefile.objs
+++ b/accel/Makefile.objs
@@ -1,4 +1,5 @@
 obj-$(CONFIG_SOFTMMU) += accel.o
+obj-$(CONFIG_SOFTMMU) += qtest.o
 obj-$(CONFIG_KVM) += kvm/
 obj-$(CONFIG_TCG) += tcg/
 obj-y += stubs/
-- 
2.18.0.rc1.1.g3f1ff2140

Re: [Qemu-devel] [PATCH v5 1/6] libnvdimm: nd_region flush callback support

2019-04-22 Thread Jeff Moyer

Dan Williams  writes:

> On Mon, Apr 22, 2019 at 8:59 AM Jeff Moyer  wrote:
>>
>> Dan Williams  writes:
>>
>> > On Thu, Apr 18, 2019 at 9:18 AM Christoph Hellwig  
>> > wrote:
>> >>
>> >> On Thu, Apr 18, 2019 at 09:05:05AM -0700, Dan Williams wrote:
>> >> > > > I'd either add a comment about avoiding retpoline overhead here or 
>> >> > > > just
>> >> > > > make ->flush == NULL mean generic_nvdimm_flush(). Just so that 
>> >> > > > people don't
>> >> > > > get confused by the code.
>> >> > >
>> >> > > Isn't this premature optimization?  I really don't like adding things
>> >> > > like this without some numbers to show it's worth it.
>> >> >
>> >> > I don't think it's premature given this optimization technique is
>> >> > already being deployed elsewhere, see:
>> >> >
>> >> > https://lwn.net/Articles/774347/
>> >>
>> >> For one this one was backed by numbers, and second after feedback
>> >> from Linux we switched to the NULL pointer check instead.
>> >
>> > Ok I should have noticed the switch to NULL pointer check. However,
>> > the question still stands do we want everyone to run numbers to
>> > justify this optimization, or make it a new common kernel coding
>> > practice to do:
>> >
>> > if (!object->op)
>> > generic_op(object);
>> > else
>> > object->op(object);
>> >
>> > ...in hot paths?
>>
>> I don't think nvdimm_flush is a hot path.  Numbers of some
>> representative workload would prove one of us right.
>
> I'd rather say that the if "if (!op) do_generic()" pattern is more
> readable in the general case, saves grepping for who set the op in the
> common case. The fact that it has the potential to be faster is gravy
> at that point.

If the primary motivation is performance, then I'd expect performance
numbers to back it up.  If that isn't the primary motivation, then
choose whichever way you feel is appropriate.

Cheers,
Jeff

[Qemu-devel] [PATCH 0/3] Remove AccelClass::available field

2019-04-22 Thread Eduardo Habkost

The only user of AccelClass::available was the qtest accelerator,
to make it unavailable on non-POSIX systems.  We can drop the
field if we simply not compile the accelerator code on those
systems.

Eduardo Habkost (3):
  qtest: Move accel code to accel/qtest.c
  qtest: Don't compile qtest accel on non-POSIX systems
  accel: Remove unused AccelClass::available field

 include/sysemu/accel.h |  1 -
 include/sysemu/qtest.h |  9 ---
 accel/accel.c  |  5 
 accel/qtest.c  | 54 ++
 qtest.c| 34 --
 accel/Makefile.objs|  1 +
 6 files changed, 55 insertions(+), 49 deletions(-)
 create mode 100644 accel/qtest.c

-- 
2.18.0.rc1.1.g3f1ff2140

[Qemu-devel] [PATCH 3/3] accel: Remove unused AccelClass::available field

2019-04-22 Thread Eduardo Habkost

The field is not used anymore, we can remove it.

Signed-off-by: Eduardo Habkost 
---
 include/sysemu/accel.h | 1 -
 accel/accel.c  | 5 -
 2 files changed, 6 deletions(-)

diff --git a/include/sysemu/accel.h b/include/sysemu/accel.h
index 5565e00a96..70e9e2f2a1 100644
--- a/include/sysemu/accel.h
+++ b/include/sysemu/accel.h
@@ -38,7 +38,6 @@ typedef struct AccelClass {
 
 const char *opt_name;
 const char *name;
-int (*available)(void);
 int (*init_machine)(MachineState *ms);
 void (*setup_post)(MachineState *ms, AccelState *accel);
 bool *allowed;
diff --git a/accel/accel.c b/accel/accel.c
index 454fef9d92..5fa31717b4 100644
--- a/accel/accel.c
+++ b/accel/accel.c
@@ -107,11 +107,6 @@ void configure_accelerator(MachineState *ms, const char 
*progname)
 if (!acc) {
 continue;
 }
-if (acc->available && !acc->available()) {
-printf("%s not supported for this target\n",
-   acc->name);
-continue;
-}
 ret = accel_init_machine(acc, ms);
 if (ret < 0) {
 init_failed = true;
-- 
2.18.0.rc1.1.g3f1ff2140

[Qemu-devel] Review my proposal

2019-04-22 Thread amit kumar rathore

Sir,
My proposal is still not review by my mentor. My mentor is JAN KISZKA .I 
already mail him but he not responding me. What I do now? How can I know my 
proposal is view or not.

Sent from Mail for Windows 10

Re: [Qemu-devel] [PATCH 1/2] add VirtIONet vhost_stopped flag to prevent multiple stops

2019-04-22 Thread Dan Streetman

On Fri, Apr 19, 2019 at 7:14 PM Michael S. Tsirkin  wrote:
>
> On Tue, Apr 16, 2019 at 02:46:23PM -0400, Dan Streetman wrote:
> > From: Dan Streetman 
> >
> > Buglink: https://launchpad.net/bugs/1823458
> >
> > There is a race condition when using the vhost-user driver, between a guest
> > shutdown and the vhost-user interface being closed.  This is explained in
> > more detail at the bug link above; the short explanation is the vhost-user
> > device can be closed while the main thread is in the middle of stopping
> > the vhost_net.  In this case, the main thread handling shutdown will
> > enter virtio_net_vhost_status() and move into the n->vhost_started (else)
> > block, and call vhost_net_stop(); while it is running that function,
> > another thread is notified that the vhost-user device has been closed,
> > and (indirectly) calls into virtio_net_vhost_status() also.  Since the
> > vhost_net status hasn't yet changed, the second thread also enters
> > the n->vhost_started block, and also calls vhost_net_stop().  This
> > causes problems for the second thread when it tries to stop the network
> > that's already been stopped.
> >
> > This adds a flag to the struct that's atomically set to prevent more than
> > one thread from calling vhost_net_stop().  The atomic_fetch_inc() is likely
> > overkill and probably could be done with a simple check-and-set, but
> > since it's a race condition there would still be a (very, very) small
> > window without using an atomic to set it.
>
> How? Isn't all this under the BQL?

I don't think so, although I'm not deeply familiar with the code.
Note the code path listed in my last email, run from
aio_bh_schedule_oneshot() - does that hold the bql while running?

>
> >
> > Signed-off-by: Dan Streetman 
> > ---
> >  hw/net/virtio-net.c| 3 ++-
> >  include/hw/virtio/virtio-net.h | 1 +
> >  2 files changed, 3 insertions(+), 1 deletion(-)
> >
> > diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> > index ffe0872fff..d36f50d5dd 100644
> > --- a/hw/net/virtio-net.c
> > +++ b/hw/net/virtio-net.c
> > @@ -13,6 +13,7 @@
> >
> >  #include "qemu/osdep.h"
> >  #include "qemu/iov.h"
> > +#include "qemu/atomic.h"
> >  #include "hw/virtio/virtio.h"
> >  #include "net/net.h"
> >  #include "net/checksum.h"
> > @@ -240,7 +241,7 @@ static void virtio_net_vhost_status(VirtIONet *n, 
> > uint8_t status)
> >   "falling back on userspace virtio", -r);
> >  n->vhost_started = 0;
> >  }
> > -} else {
> > +} else if (atomic_fetch_inc(&n->vhost_stopped) == 0) {
> >  vhost_net_stop(vdev, n->nic->ncs, queues);
> >  n->vhost_started = 0;
> >  }
> > diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
> > index b96f0c643f..d03fd933d0 100644
> > --- a/include/hw/virtio/virtio-net.h
> > +++ b/include/hw/virtio/virtio-net.h
> > @@ -164,6 +164,7 @@ struct VirtIONet {
> >  uint8_t nouni;
> >  uint8_t nobcast;
> >  uint8_t vhost_started;
> > +int vhost_stopped;
> >  struct {
> >  uint32_t in_use;
> >  uint32_t first_multi;
>
> OK questions same as any state:
>
> - do we need to migrate this?
> - reset it on device reset?
>
> > --
> > 2.20.1

Re: [Qemu-devel] [PATCH 1/2] add VirtIONet vhost_stopped flag to prevent multiple stops

2019-04-22 Thread Dan Streetman

On Sun, Apr 21, 2019 at 10:50 PM Jason Wang  wrote:
>
>
> On 2019/4/17 上午2:46, Dan Streetman wrote:
> > From: Dan Streetman 
> >
> > Buglink: https://launchpad.net/bugs/1823458
> >
> > There is a race condition when using the vhost-user driver, between a guest
> > shutdown and the vhost-user interface being closed.  This is explained in
> > more detail at the bug link above; the short explanation is the vhost-user
> > device can be closed while the main thread is in the middle of stopping
> > the vhost_net.  In this case, the main thread handling shutdown will
> > enter virtio_net_vhost_status() and move into the n->vhost_started (else)
> > block, and call vhost_net_stop(); while it is running that function,
> > another thread is notified that the vhost-user device has been closed,
> > and (indirectly) calls into virtio_net_vhost_status() also.
>
>
> I think we need figure out why there are multiple vhost_net_stop() calls
> simultaneously. E.g vhost-user register fd handlers like:
>
>  qemu_chr_fe_set_handlers(&s->chr, NULL, NULL,
>   net_vhost_user_event, NULL, nc0->name,
> NULL,
>   true);
>
> which uses default main context, so it should only be called only in
> main thread.

net_vhost_user_event() schedules chr_closed_bh() to do its bottom half
work; does aio_bh_schedule_oneshot() execute its events from the main
thread?

For reference, the call chain is:

chr_closed_bh()
  qmp_set_link()
nc->info->link_status_changed() -> virtio_net_set_link_status()
  virtio_net_set_status()
virtio_net_vhost_status()

>
> Thanks
>
>
> >   Since the
> > vhost_net status hasn't yet changed, the second thread also enters
> > the n->vhost_started block, and also calls vhost_net_stop().  This
> > causes problems for the second thread when it tries to stop the network
> > that's already been stopped.
> >
> > This adds a flag to the struct that's atomically set to prevent more than
> > one thread from calling vhost_net_stop().  The atomic_fetch_inc() is likely
> > overkill and probably could be done with a simple check-and-set, but
> > since it's a race condition there would still be a (very, very) small
> > window without using an atomic to set it.
> >
> > Signed-off-by: Dan Streetman 
> > ---
> >   hw/net/virtio-net.c| 3 ++-
> >   include/hw/virtio/virtio-net.h | 1 +
> >   2 files changed, 3 insertions(+), 1 deletion(-)
> >
> > diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> > index ffe0872fff..d36f50d5dd 100644
> > --- a/hw/net/virtio-net.c
> > +++ b/hw/net/virtio-net.c
> > @@ -13,6 +13,7 @@
> >
> >   #include "qemu/osdep.h"
> >   #include "qemu/iov.h"
> > +#include "qemu/atomic.h"
> >   #include "hw/virtio/virtio.h"
> >   #include "net/net.h"
> >   #include "net/checksum.h"
> > @@ -240,7 +241,7 @@ static void virtio_net_vhost_status(VirtIONet *n, 
> > uint8_t status)
> >"falling back on userspace virtio", -r);
> >   n->vhost_started = 0;
> >   }
> > -} else {
> > +} else if (atomic_fetch_inc(&n->vhost_stopped) == 0) {
> >   vhost_net_stop(vdev, n->nic->ncs, queues);
> >   n->vhost_started = 0;
> >   }
> > diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
> > index b96f0c643f..d03fd933d0 100644
> > --- a/include/hw/virtio/virtio-net.h
> > +++ b/include/hw/virtio/virtio-net.h
> > @@ -164,6 +164,7 @@ struct VirtIONet {
> >   uint8_t nouni;
> >   uint8_t nobcast;
> >   uint8_t vhost_started;
> > +int vhost_stopped;
> >   struct {
> >   uint32_t in_use;
> >   uint32_t first_multi;

Re: [Qemu-devel] [RFC 0/3] VirtIO RDMA

2019-04-22 Thread Jason Gunthorpe

On Fri, Apr 19, 2019 at 01:16:06PM +0200, Hannes Reinecke wrote:
> On 4/15/19 12:35 PM, Yuval Shaia wrote:
> > On Thu, Apr 11, 2019 at 07:02:15PM +0200, Cornelia Huck wrote:
> > > On Thu, 11 Apr 2019 14:01:54 +0300
> > > Yuval Shaia  wrote:
> > > 
> > > > Data center backends use more and more RDMA or RoCE devices and more and
> > > > more software runs in virtualized environment.
> > > > There is a need for a standard to enable RDMA/RoCE on Virtual Machines.
> > > > 
> > > > Virtio is the optimal solution since is the de-facto para-virtualizaton
> > > > technology and also because the Virtio specification
> > > > allows Hardware Vendors to support Virtio protocol natively in order to
> > > > achieve bare metal performance.
> > > > 
> > > > This RFC is an effort to addresses challenges in defining the RDMA/RoCE
> > > > Virtio Specification and a look forward on possible implementation
> > > > techniques.
> > > > 
> > > > Open issues/Todo list:
> > > > List is huge, this is only start point of the project.
> > > > Anyway, here is one example of item in the list:
> > > > - Multi VirtQ: Every QP has two rings and every CQ has one. This means 
> > > > that
> > > >in order to support for example 32K QPs we will need 64K VirtQ. Not 
> > > > sure
> > > >that this is reasonable so one option is to have one for all and
> > > >multiplex the traffic on it. This is not good approach as by design 
> > > > it
> > > >introducing an optional starvation. Another approach would be multi
> > > >queues and round-robin (for example) between them.
> > > > 
> Typically there will be a one-to-one mapping between QPs and CPUs (on the
> guest). 

Er we are really overloading words here.. The typical expectation is
that a 'RDMA QP' will have thousands and thousands of instances on a
system.

Most likely I think mapping 1:1 a virtio queue to a 'RDMA QP, CQ, SRQ,
etc' is a bad idea...

> However, I'm still curious about the overall intent of this driver. Where
> would the I/O be routed _to_ ?
> It's nice that we have a virtualized driver, but this driver is
> intended to do I/O (even if it doesn't _do_ any I/O ATM :-)
> And this I/O needs to be send to (and possibly received from)
> something.

As yet I have never heard of public RDMA HW that could be coupled to a
virtio scheme. All HW defines their own queue ring buffer formats
without standardization.

> If so, wouldn't it be more efficient to use vfio, either by using SR-IOV or
> by using virtio-mdev?

Using PCI pass through means the guest has to have drivers for the
device. A generic, perhaps slower, virtio path has some appeal in some
cases.

> If so, how would we route the I/O from one guest to the other?
> Shared memory? Implementing a full-blown RDMA switch in qemu?

RoCE rides over the existing ethernet switching layer quemu plugs
into

So if you built a shared memory, local host only, virtio-rdma then
you'd probably run through the ethernet switch upon connection
establishment to match the participating VMs.

Jason

[Qemu-devel] [PATCH v3 7/7] hw/sparc64: Implement fw_cfg_arch_key_name()

2019-04-22 Thread Philippe Mathieu-Daudé

Implement fw_cfg_arch_key_name(), which returns the name of a
sparc64-specific key.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/sparc64/sun4u.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/hw/sparc64/sun4u.c b/hw/sparc64/sun4u.c
index 399f2d73c81..4230b17b873 100644
--- a/hw/sparc64/sun4u.c
+++ b/hw/sparc64/sun4u.c
@@ -91,6 +91,25 @@ typedef struct EbusState {
 #define TYPE_EBUS "ebus"
 #define EBUS(obj) OBJECT_CHECK(EbusState, (obj), TYPE_EBUS)
 
+const char *fw_cfg_arch_key_name(uint16_t key)
+{
+static const struct {
+uint16_t key;
+const char *name;
+} fw_cfg_arch_wellknown_keys[] = {
+{FW_CFG_SPARC64_WIDTH, "width"},
+{FW_CFG_SPARC64_HEIGHT, "height"},
+{FW_CFG_SPARC64_DEPTH, "depth"},
+};
+
+for (size_t i = 0; i < ARRAY_SIZE(fw_cfg_arch_wellknown_keys); i++) {
+if (fw_cfg_arch_wellknown_keys[i].key == key) {
+return fw_cfg_arch_wellknown_keys[i].name;
+}
+}
+return NULL;
+}
+
 static void fw_cfg_boot_set(void *opaque, const char *boot_device,
 Error **errp)
 {
-- 
2.20.1

[Qemu-devel] [PATCH v3 6/7] hw/sparc: Implement fw_cfg_arch_key_name()

2019-04-22 Thread Philippe Mathieu-Daudé

Implement fw_cfg_arch_key_name(), which returns the name of a
sparc32-specific key.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/sparc/sun4m.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/hw/sparc/sun4m.c b/hw/sparc/sun4m.c
index ca1e3825d58..49251d62b35 100644
--- a/hw/sparc/sun4m.c
+++ b/hw/sparc/sun4m.c
@@ -97,6 +97,25 @@ struct sun4m_hwdef {
 uint8_t nvram_machine_id;
 };
 
+const char *fw_cfg_arch_key_name(uint16_t key)
+{
+static const struct {
+uint16_t key;
+const char *name;
+} fw_cfg_arch_wellknown_keys[] = {
+{FW_CFG_SUN4M_DEPTH, "depth"},
+{FW_CFG_SUN4M_WIDTH, "width"},
+{FW_CFG_SUN4M_HEIGHT, "height"},
+};
+
+for (size_t i = 0; i < ARRAY_SIZE(fw_cfg_arch_wellknown_keys); i++) {
+if (fw_cfg_arch_wellknown_keys[i].key == key) {
+return fw_cfg_arch_wellknown_keys[i].name;
+}
+}
+return NULL;
+}
+
 static void fw_cfg_boot_set(void *opaque, const char *boot_device,
 Error **errp)
 {
-- 
2.20.1

[Qemu-devel] [PATCH v3 5/7] hw/ppc: Implement fw_cfg_arch_key_name()

2019-04-22 Thread Philippe Mathieu-Daudé

Implement fw_cfg_arch_key_name(), which returns the name of a
ppc-specific key.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/ppc/Makefile.objs |  2 +-
 hw/ppc/fw_cfg.c  | 45 
 2 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 hw/ppc/fw_cfg.c

diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index b218a04..ae940981553 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -1,5 +1,5 @@
 # shared objects
-obj-y += ppc.o ppc_booke.o fdt.o
+obj-y += ppc.o ppc_booke.o fdt.o fw_cfg.o
 # IBM pSeries (sPAPR)
 obj-$(CONFIG_PSERIES) += spapr.o spapr_caps.o spapr_vio.o spapr_events.o
 obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
diff --git a/hw/ppc/fw_cfg.c b/hw/ppc/fw_cfg.c
new file mode 100644
index 000..a88b5c4bde2
--- /dev/null
+++ b/hw/ppc/fw_cfg.c
@@ -0,0 +1,45 @@
+/*
+ * fw_cfg helpers (PPC specific)
+ *
+ * Copyright (c) 2019 Red Hat, Inc.
+ *
+ * Author:
+ *   Philippe Mathieu-Daudé 
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/ppc/ppc.h"
+#include "hw/nvram/fw_cfg.h"
+
+const char *fw_cfg_arch_key_name(uint16_t key)
+{
+static const struct {
+uint16_t key;
+const char *name;
+} fw_cfg_arch_wellknown_keys[] = {
+{FW_CFG_PPC_WIDTH, "width"},
+{FW_CFG_PPC_HEIGHT, "height"},
+{FW_CFG_PPC_DEPTH, "depth"},
+{FW_CFG_PPC_TBFREQ, "tbfreq"},
+{FW_CFG_PPC_CLOCKFREQ, "clockfreq"},
+{FW_CFG_PPC_IS_KVM, "is_kvm"},
+{FW_CFG_PPC_KVM_HC, "kvm_hc"},
+{FW_CFG_PPC_KVM_PID, "pid"},
+{FW_CFG_PPC_NVRAM_ADDR, "nvram_addr"},
+{FW_CFG_PPC_BUSFREQ, "busfreq"},
+{FW_CFG_PPC_NVRAM_FLAT, "nvram_flat"},
+{FW_CFG_PPC_VIACONFIG, "viaconfig"},
+};
+
+for (size_t i = 0; i < ARRAY_SIZE(fw_cfg_arch_wellknown_keys); i++) {
+if (fw_cfg_arch_wellknown_keys[i].key == key) {
+return fw_cfg_arch_wellknown_keys[i].name;
+}
+}
+return NULL;
+}
-- 
2.20.1

[Qemu-devel] [PATCH v3 3/7] hw/i386: Extract fw_cfg definitions to local "fw_cfg.h"

2019-04-22 Thread Philippe Mathieu-Daudé

Extract the architecture-specific fw_cfg definitions to "fw_cfg.h".

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/i386/fw_cfg.h | 20 
 hw/i386/pc.c |  7 +--
 2 files changed, 21 insertions(+), 6 deletions(-)
 create mode 100644 hw/i386/fw_cfg.h

diff --git a/hw/i386/fw_cfg.h b/hw/i386/fw_cfg.h
new file mode 100644
index 000..17a4bc32f22
--- /dev/null
+++ b/hw/i386/fw_cfg.h
@@ -0,0 +1,20 @@
+/*
+ * QEMU fw_cfg helpers (X86 specific)
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef HW_I386_FW_CFG_H
+#define HW_I386_FW_CFG_H
+
+#include "hw/nvram/fw_cfg.h"
+
+#define FW_CFG_ACPI_TABLES  (FW_CFG_ARCH_LOCAL + 0)
+#define FW_CFG_SMBIOS_ENTRIES   (FW_CFG_ARCH_LOCAL + 1)
+#define FW_CFG_IRQ0_OVERRIDE(FW_CFG_ARCH_LOCAL + 2)
+#define FW_CFG_E820_TABLE   (FW_CFG_ARCH_LOCAL + 3)
+#define FW_CFG_HPET (FW_CFG_ARCH_LOCAL + 4)
+
+#endif
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index f2c15bf1f2c..acb8fd9667d 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -30,6 +30,7 @@
 #include "hw/char/parallel.h"
 #include "hw/i386/apic.h"
 #include "hw/i386/topology.h"
+#include "hw/i386/fw_cfg.h"
 #include "sysemu/cpus.h"
 #include "hw/block/fdc.h"
 #include "hw/ide.h"
@@ -88,12 +89,6 @@
 #define DPRINTF(fmt, ...)
 #endif
 
-#define FW_CFG_ACPI_TABLES (FW_CFG_ARCH_LOCAL + 0)
-#define FW_CFG_SMBIOS_ENTRIES (FW_CFG_ARCH_LOCAL + 1)
-#define FW_CFG_IRQ0_OVERRIDE (FW_CFG_ARCH_LOCAL + 2)
-#define FW_CFG_E820_TABLE (FW_CFG_ARCH_LOCAL + 3)
-#define FW_CFG_HPET (FW_CFG_ARCH_LOCAL + 4)
-
 #define E820_NR_ENTRIES16
 
 struct e820_entry {
-- 
2.20.1

[Qemu-devel] [PATCH v3 2/7] hw/nvram/fw_cfg: Add fw_cfg_arch_key_name()

2019-04-22 Thread Philippe Mathieu-Daudé

Add fw_cfg_arch_key_name() which returns the name of
an architecture-specific key.

Signed-off-by: Philippe Mathieu-Daudé 
---
 MAINTAINERS   |  1 +
 hw/nvram/fw_cfg.c |  2 +-
 include/hw/nvram/fw_cfg.h | 11 +++
 stubs/Makefile.objs   |  1 +
 stubs/fw_cfg.c| 21 +
 5 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 stubs/fw_cfg.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 56139ac8ab0..444783bb652 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1675,6 +1675,7 @@ R: Gerd Hoffmann 
 S: Supported
 F: docs/specs/fw_cfg.txt
 F: hw/nvram/fw_cfg.c
+F: stubs/fw_cfg.c
 F: include/hw/nvram/fw_cfg.h
 F: include/standard-headers/linux/qemu_fw_cfg.h
 F: tests/libqos/fw_cfg.c
diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c
index d374a970fea..b2dc0a80cbc 100644
--- a/hw/nvram/fw_cfg.c
+++ b/hw/nvram/fw_cfg.c
@@ -100,7 +100,7 @@ static const char *key_name(uint16_t key)
 };
 
 if (key & FW_CFG_ARCH_LOCAL) {
-return NULL;
+return fw_cfg_arch_key_name(key);
 }
 if (key < FW_CFG_FILE_FIRST) {
 return fw_cfg_wellknown_keys[key];
diff --git a/include/hw/nvram/fw_cfg.h b/include/hw/nvram/fw_cfg.h
index f5a6895a740..828ad9dedc6 100644
--- a/include/hw/nvram/fw_cfg.h
+++ b/include/hw/nvram/fw_cfg.h
@@ -226,4 +226,15 @@ FWCfgState *fw_cfg_init_mem_wide(hwaddr ctl_addr,
 FWCfgState *fw_cfg_find(void);
 bool fw_cfg_dma_enabled(void *opaque);
 
+/**
+ * fw_cfg_arch_key_name:
+ *
+ * @key: The uint16 selector key.
+ *
+ * Returns: The stringified architecture-specific name if the selector
+ *  refers to a well-known numerically defined item, or NULL on
+ *  key lookup failure.
+ */
+const char *fw_cfg_arch_key_name(uint16_t key);
+
 #endif
diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
index 269dfa58326..73452ad2657 100644
--- a/stubs/Makefile.objs
+++ b/stubs/Makefile.objs
@@ -39,3 +39,4 @@ stub-obj-y += xen-hvm.o
 stub-obj-y += pci-host-piix.o
 stub-obj-y += ram-block.o
 stub-obj-y += ramfb.o
+stub-obj-y += fw_cfg.o
diff --git a/stubs/fw_cfg.c b/stubs/fw_cfg.c
new file mode 100644
index 000..bb1e3c8aa95
--- /dev/null
+++ b/stubs/fw_cfg.c
@@ -0,0 +1,21 @@
+/*
+ * fw_cfg stubs
+ *
+ * Copyright (c) 2019 Red Hat, Inc.
+ *
+ * Author:
+ *   Philippe Mathieu-Daudé 
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/nvram/fw_cfg.h"
+
+const char *fw_cfg_arch_key_name(uint16_t key)
+{
+return NULL;
+}
-- 
2.20.1

[Qemu-devel] [PATCH v3 4/7] hw/i386: Implement fw_cfg_arch_key_name()

2019-04-22 Thread Philippe Mathieu-Daudé

Implement fw_cfg_arch_key_name(), which returns the name of a
i386-specific key.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/i386/Makefile.objs |  2 +-
 hw/i386/fw_cfg.c  | 38 ++
 2 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 hw/i386/fw_cfg.c

diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs
index 27248a0777c..5d9c9efd5fa 100644
--- a/hw/i386/Makefile.objs
+++ b/hw/i386/Makefile.objs
@@ -3,7 +3,7 @@ obj-y += multiboot.o
 obj-y += pc.o
 obj-$(CONFIG_I440FX) += pc_piix.o
 obj-$(CONFIG_Q35) += pc_q35.o
-obj-y += pc_sysfw.o
+obj-y += fw_cfg.o pc_sysfw.o
 obj-y += x86-iommu.o
 obj-$(CONFIG_VTD) += intel_iommu.o
 obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o
diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
new file mode 100644
index 000..c5e8b4ead0b
--- /dev/null
+++ b/hw/i386/fw_cfg.c
@@ -0,0 +1,38 @@
+/*
+ * QEMU fw_cfg helpers (X86 specific)
+ *
+ * Copyright (c) 2019 Red Hat, Inc.
+ *
+ * Author:
+ *   Philippe Mathieu-Daudé 
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/i386/fw_cfg.h"
+#include "hw/nvram/fw_cfg.h"
+
+const char *fw_cfg_arch_key_name(uint16_t key)
+{
+static const struct {
+uint16_t key;
+const char *name;
+} fw_cfg_arch_wellknown_keys[] = {
+{FW_CFG_ACPI_TABLES, "acpi_tables"},
+{FW_CFG_SMBIOS_ENTRIES, "smbios_entries"},
+{FW_CFG_IRQ0_OVERRIDE, "irq0_override"},
+{FW_CFG_E820_TABLE, "e820_tables"},
+{FW_CFG_HPET, "hpet"},
+};
+
+for (size_t i = 0; i < ARRAY_SIZE(fw_cfg_arch_wellknown_keys); i++) {
+if (fw_cfg_arch_wellknown_keys[i].key == key) {
+return fw_cfg_arch_wellknown_keys[i].name;
+}
+}
+return NULL;
+}
-- 
2.20.1

[Qemu-devel] [PATCH v3 1/7] hw/nvram/fw_cfg: Add trace events

2019-04-22 Thread Philippe Mathieu-Daudé

Add trace events to dump the key content.

Reviewed-by: Michael S. Tsirkin 
Reviewed-by: Laszlo Ersek 
Signed-off-by: Philippe Mathieu-Daudé 
---
v1 -> v3:
- moved static fw_cfg_wellknown_keys[] in key_name()
- split trace_key_name() (can return "unknown")
  from key_name() (can return NULL)

Since changes from v1 are trivial, keep S-o-b tags.
---
 hw/nvram/fw_cfg.c | 63 ++-
 hw/nvram/trace-events |  7 -
 2 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c
index 5c3a46ce6f2..d374a970fea 100644
--- a/hw/nvram/fw_cfg.c
+++ b/hw/nvram/fw_cfg.c
@@ -60,6 +60,62 @@ struct FWCfgEntry {
 FWCfgWriteCallback write_cb;
 };
 
+/**
+ * key_name:
+ *
+ * @key: The uint16 selector key.
+ *
+ * Returns: The stringified name if the selector refers to a well-known
+ *  numerically defined item, or NULL on key lookup failure.
+ */
+static const char *key_name(uint16_t key)
+{
+static const char *fw_cfg_wellknown_keys[FW_CFG_FILE_FIRST] = {
+[FW_CFG_SIGNATURE] = "signature",
+[FW_CFG_ID] = "id",
+[FW_CFG_UUID] = "uuid",
+[FW_CFG_RAM_SIZE] = "ram_size",
+[FW_CFG_NOGRAPHIC] = "nographic",
+[FW_CFG_NB_CPUS] = "nb_cpus",
+[FW_CFG_MACHINE_ID] = "machine_id",
+[FW_CFG_KERNEL_ADDR] = "kernel_addr",
+[FW_CFG_KERNEL_SIZE] = "kernel_size",
+[FW_CFG_KERNEL_CMDLINE] = "kernel_cmdline",
+[FW_CFG_INITRD_ADDR] = "initrd_addr",
+[FW_CFG_INITRD_SIZE] = "initdr_size",
+[FW_CFG_BOOT_DEVICE] = "boot_device",
+[FW_CFG_NUMA] = "numa",
+[FW_CFG_BOOT_MENU] = "boot_menu",
+[FW_CFG_MAX_CPUS] = "max_cpus",
+[FW_CFG_KERNEL_ENTRY] = "kernel_entry",
+[FW_CFG_KERNEL_DATA] = "kernel_data",
+[FW_CFG_INITRD_DATA] = "initrd_data",
+[FW_CFG_CMDLINE_ADDR] = "cmdline_addr",
+[FW_CFG_CMDLINE_SIZE] = "cmdline_size",
+[FW_CFG_CMDLINE_DATA] = "cmdline_data",
+[FW_CFG_SETUP_ADDR] = "setup_addr",
+[FW_CFG_SETUP_SIZE] = "setup_size",
+[FW_CFG_SETUP_DATA] = "setup_data",
+[FW_CFG_FILE_DIR] = "file_dir",
+};
+
+if (key & FW_CFG_ARCH_LOCAL) {
+return NULL;
+}
+if (key < FW_CFG_FILE_FIRST) {
+return fw_cfg_wellknown_keys[key];
+}
+
+return NULL;
+}
+
+static inline const char *trace_key_name(uint16_t key)
+{
+const char *name = key_name(key);
+
+return name ? name : "unknown";
+}
+
 #define JPG_FILE 0
 #define BMP_FILE 1
 
@@ -233,7 +289,7 @@ static int fw_cfg_select(FWCfgState *s, uint16_t key)
 }
 }
 
-trace_fw_cfg_select(s, key, ret);
+trace_fw_cfg_select(s, key, trace_key_name(key), ret);
 return ret;
 }
 
@@ -616,6 +672,7 @@ static void *fw_cfg_modify_bytes_read(FWCfgState *s, 
uint16_t key,
 
 void fw_cfg_add_bytes(FWCfgState *s, uint16_t key, void *data, size_t len)
 {
+trace_fw_cfg_add_bytes(key, trace_key_name(key), len);
 fw_cfg_add_bytes_callback(s, key, NULL, NULL, NULL, data, len, true);
 }
 
@@ -623,6 +680,7 @@ void fw_cfg_add_string(FWCfgState *s, uint16_t key, const 
char *value)
 {
 size_t sz = strlen(value) + 1;
 
+trace_fw_cfg_add_string(key, trace_key_name(key), value);
 fw_cfg_add_bytes(s, key, g_memdup(value, sz), sz);
 }
 
@@ -632,6 +690,7 @@ void fw_cfg_add_i16(FWCfgState *s, uint16_t key, uint16_t 
value)
 
 copy = g_malloc(sizeof(value));
 *copy = cpu_to_le16(value);
+trace_fw_cfg_add_i16(key, trace_key_name(key), value);
 fw_cfg_add_bytes(s, key, copy, sizeof(value));
 }
 
@@ -651,6 +710,7 @@ void fw_cfg_add_i32(FWCfgState *s, uint16_t key, uint32_t 
value)
 
 copy = g_malloc(sizeof(value));
 *copy = cpu_to_le32(value);
+trace_fw_cfg_add_i32(key, trace_key_name(key), value);
 fw_cfg_add_bytes(s, key, copy, sizeof(value));
 }
 
@@ -660,6 +720,7 @@ void fw_cfg_add_i64(FWCfgState *s, uint16_t key, uint64_t 
value)
 
 copy = g_malloc(sizeof(value));
 *copy = cpu_to_le64(value);
+trace_fw_cfg_add_i64(key, trace_key_name(key), value);
 fw_cfg_add_bytes(s, key, copy, sizeof(value));
 }
 
diff --git a/hw/nvram/trace-events b/hw/nvram/trace-events
index e191991e2a8..0dea9260ce1 100644
--- a/hw/nvram/trace-events
+++ b/hw/nvram/trace-events
@@ -5,6 +5,11 @@ nvram_read(uint32_t addr, uint32_t ret) "read addr %d: 0x%02x"
 nvram_write(uint32_t addr, uint32_t old, uint32_t val) "write addr %d: 0x%02x 
-> 0x%02x"
 
 # fw_cfg.c
-fw_cfg_select(void *s, uint16_t key, int ret) "%p key %d = %d"
+fw_cfg_select(void *s, uint16_t key_value, const char *key_name, int ret) "%p 
key 0x%04" PRIx16 " '%s', ret: %d"
 fw_cfg_read(void *s, uint64_t ret) "%p = 0x%"PRIx64
+fw_cfg_add_bytes(uint16_t key_value, const char *key_name, size_t len) "key 
0x%04" PRIx16 " '%s', %zu bytes"
 fw_cfg_add_file(void *s, int index, char *name, size_t len) "%p #%d: %s (%zd 
bytes)"
+fw_cfg_add_string(uint16_t key_value, const char *key_name,

[Qemu-devel] [PATCH v3 0/7] fw_cfg: Improve tracing

2019-04-22 Thread Philippe Mathieu-Daudé

Trivial series to improve fw_cfg tracing.

Regards,

Phil.

Since v2: https://lists.gnu.org/archive/html/qemu-devel/2019-03/msg02522.html
- Split arch-specific code (1 patch per arch) (requested by Laszlo)

Since v1: https://lists.gnu.org/archive/html/qemu-devel/2018-12/msg01598.html
- Added arch-specific keys

Philippe Mathieu-Daudé (7):
  hw/nvram/fw_cfg: Add trace events
  hw/nvram/fw_cfg: Add fw_cfg_arch_key_name()
  hw/i386: Extract fw_cfg definitions to local "fw_cfg.h"
  hw/i386: Implement fw_cfg_arch_key_name()
  hw/ppc: Implement fw_cfg_arch_key_name()
  hw/sparc: Implement fw_cfg_arch_key_name()
  hw/sparc64: Implement fw_cfg_arch_key_name()

 MAINTAINERS   |  1 +
 hw/i386/Makefile.objs |  2 +-
 hw/i386/fw_cfg.c  | 38 +++
 hw/i386/fw_cfg.h  | 20 +
 hw/i386/pc.c  |  7 +
 hw/nvram/fw_cfg.c | 63 ++-
 hw/nvram/trace-events |  7 -
 hw/ppc/Makefile.objs  |  2 +-
 hw/ppc/fw_cfg.c   | 45 
 hw/sparc/sun4m.c  | 19 
 hw/sparc64/sun4u.c| 19 
 include/hw/nvram/fw_cfg.h | 11 +++
 stubs/Makefile.objs   |  1 +
 stubs/fw_cfg.c| 21 +
 14 files changed, 246 insertions(+), 10 deletions(-)
 create mode 100644 hw/i386/fw_cfg.c
 create mode 100644 hw/i386/fw_cfg.h
 create mode 100644 hw/ppc/fw_cfg.c
 create mode 100644 stubs/fw_cfg.c

-- 
2.20.1

Re: [Qemu-devel] [PATCH v5 1/6] libnvdimm: nd_region flush callback support

2019-04-22 Thread Dan Williams

On Mon, Apr 22, 2019 at 8:59 AM Jeff Moyer  wrote:
>
> Dan Williams  writes:
>
> > On Thu, Apr 18, 2019 at 9:18 AM Christoph Hellwig  
> > wrote:
> >>
> >> On Thu, Apr 18, 2019 at 09:05:05AM -0700, Dan Williams wrote:
> >> > > > I'd either add a comment about avoiding retpoline overhead here or 
> >> > > > just
> >> > > > make ->flush == NULL mean generic_nvdimm_flush(). Just so that 
> >> > > > people don't
> >> > > > get confused by the code.
> >> > >
> >> > > Isn't this premature optimization?  I really don't like adding things
> >> > > like this without some numbers to show it's worth it.
> >> >
> >> > I don't think it's premature given this optimization technique is
> >> > already being deployed elsewhere, see:
> >> >
> >> > https://lwn.net/Articles/774347/
> >>
> >> For one this one was backed by numbers, and second after feedback
> >> from Linux we switched to the NULL pointer check instead.
> >
> > Ok I should have noticed the switch to NULL pointer check. However,
> > the question still stands do we want everyone to run numbers to
> > justify this optimization, or make it a new common kernel coding
> > practice to do:
> >
> > if (!object->op)
> > generic_op(object);
> > else
> > object->op(object);
> >
> > ...in hot paths?
>
> I don't think nvdimm_flush is a hot path.  Numbers of some
> representative workload would prove one of us right.

I'd rather say that the if "if (!op) do_generic()" pattern is more
readable in the general case, saves grepping for who set the op in the
common case. The fact that it has the potential to be faster is gravy
at that point.

Re: [Qemu-devel] [PATCH] docs/interop/bitmaps: rewrite and modernize doc

2019-04-22 Thread John Snow




On 4/18/19 12:38 PM, Vladimir Sementsov-Ogievskiy wrote:
> 18.04.2019 3:14, John Snow wrote:
>> This just about rewrites the entirety of the bitmaps.rst document to
>> make it consistent with the 4.0 release. I have added new features seen
>> in the 4.0 release, as well as tried to clarify some points that keep
>> coming up when discussing this feature both in-house and upstream.
>>
>> Yes, it's a lot longer, mostly due to examples. I get a bit chatty.
>> I could use a good editor to help reign in my chattiness.
>>
>> It does not yet cover pull backups or migration details, but I intend to
>> keep extending this document to cover those cases.
>>
>> Please try compiling it with sphinx and look at the rendered output, I
>> don't have hosting to share my copy at present. I think this new layout
>> reads nicer in the HTML format than the old one did, at the expense of
>> looking less readable in the source tree itself (though not completely
>> unmanagable. We did decide to convert it from Markdown to ReST, after
>> all, so I am going all-in on ReST.)
>>
>> Signed-off-by: John Snow 
>> ---
>>   docs/interop/bitmaps.rst | 1499 ++
>>   Makefile |2 +-
>>   2 files changed, 1192 insertions(+), 309 deletions(-)
>>
>> diff --git a/docs/interop/bitmaps.rst b/docs/interop/bitmaps.rst
>> index 7bcfe7f461..a39d1fc871 100644
>> --- a/docs/interop/bitmaps.rst
>> +++ b/docs/interop/bitmaps.rst
> 
> you may want to update copyright date at the beginning of the file
> 

Ah, I guess so. I don't really know how copyright works anyway :)

>> @@ -9,128 +9,481 @@
>>   Dirty Bitmaps and Incremental Backup
>>   
>>   
>> --  Dirty Bitmaps are objects that track which data needs to be backed up
>> -   for the next incremental backup.
>> +Dirty Bitmaps are in-memory objects that track writes to block devices. 
>> They can
>> +be used in conjunction with various block job operations to perform 
>> incremental
>> +or differential backup regimens.
>>   
>> --  Dirty bitmaps can be created at any time and attached to any node
>> -   (not just complete drives).
>> +This document explains the conceptual mechanisms, as well as up-to-date,
>> +complete and comprehensive documentation on the API to manipulate them.
>> +(Hopefully, the "why", "what", and "how".)
>> +
>> +The intended audience for this document is developers who are adding QEMU 
>> backup
>> +features to management applications, or power users who run and administer 
>> QEMU
>> +directly via QMP.
>>   
>>   .. contents::
>>   
>> +Overview
>> +
>> +
>> +Bitmaps are bit vectors where each '1' bit in the vector indicates a 
>> modified
>> +("dirty") segment of the corresponding block device. The size of the segment
>> +that is tracked is the granularity of the bitmap. If the granularity of a 
>> bitmap
>> +is 64K, each '1' bit means that an entire 64K region changed in some way.
> 
> hm not exactly. Сonversely, if we change not the entire region but only on 
> byte of it,
> corresponding bit in the bitmap would be set..
> 

Ah, yeah, I worded this oddly. I meant to say that taken as a whole, a
64k region changed in some way (possibly by as little as just one bit.)

I'll fix this.

>> +
>> +Smaller granularities mean more accurate tracking of modified disk data, but
>> +requires more computational overhead and larger bitmap sizes. Larger
>> +granularities mean smaller bitmap sizes, but less targeted backups.
>> +
>> +The size of a bitmap (in bytes) can be computed as such:
>> +``size`` = ((``image_size`` / ``granularity``) / 8)
> 
> both divisions should round up
> 

Will clarify. It's also not quite true because of the hierarchical
storage requirements too; this is really the size on disk ... but it's a
useful heuristic for people to know, anyway.

"It's about 1MB per 512GB, until you adjust tuning."

>> +
>> +e.g. the size of a 64KiB granularity bitmap on a 2TiB image is:
>> +``size`` = ((2147483648K / 64K) / 8)
>> + = 4194304B = 4MiB.
>> +
>> +QEMU uses these bitmaps when making incremental backups to know which
>> +sections of the file to copy out. They are not enabled by default and
>> +must be explicitly added in order to begin tracking writes.
>> +
>> +Bitmaps can be created at any time and can be attached to any
>> +arbitrary block node in the storage graph, but are most useful
>> +conceptually when attached to the root node attached to the guest's
>> +storage device model.
>> +
>> +(Which is a really chatty way of saying: It's likely most useful to
>> +track the guest's writes to disk, but you could theoretically track
>> +things like qcow2 metadata changes by attaching the bitmap elsewhere
>> +in the storage graph.)
>> +
>> +QEMU supports persisting these bitmaps to disk via the qcow2 image format.
>> +Bitmaps which are stored or loaded in this way are called "persistent", 
>> whereas
>> +bitmaps that are not are called "transient".
>> +
>> +QEMU also sup

Re: [Qemu-devel] [PATCH v14 0/2] support MAP_SYNC for memory-backend-file

2019-04-22 Thread Eduardo Habkost

On Mon, Apr 22, 2019 at 08:34:51AM -0400, Michael S. Tsirkin wrote:
> On Mon, Apr 22, 2019 at 08:48:47AM +0800, Wei Yang wrote:
> > Linux 4.15 introduces a new mmap flag MAP_SYNC, which can be used to
> > guarantee the write persistence to mmap'ed files supporting DAX (e.g.,
> > files on ext4/xfs file system mounted with '-o dax').
> > 
> > A description of MAP_SYNC and MAP_SHARED_VALIDATE can be found at
> > https://patchwork.kernel.org/patch/10028151/
> > 
> > In order to make sure that the file metadata is in sync after a fault 
> > while we are writing a shared DAX supporting backend files, this
> > patch-set enables QEMU to use MAP_SYNC flag for memory-backend-dax-file.
> > 
> > As the DAX vs DMA truncated issue was solved, we refined the code and
> > send out this feature for the v5 version.
> > 
> > We will pass MAP_SYNC to mmap(2); if MAP_SYNC is supported and
> > 'share=on' & 'pmem=on'. 
> > Or QEMU will not pass this flag to mmap(2)
> 
> OK this is in a good shape. As we are in freeze anyway,
> there's still a bit more time to polish it. I have a couple of
> suggestions:
> 
> - squash docs in same patch with code, no need for two patches
> - mmap errors are not silently ignored as the doc says,
>   a warning is produced

Note that a warning is produced only if both share=on and pmem=on
is specified.  If using pmem=on without share=on, no warning is
printed at all.

I agree we could squash the docs in the same patch, but I don't
want to prevent the code from being merged and require v15 to be
sent just because we are still polishing the documentation.

If there are no objections, I plan to apply this version of the
series including the following fixup (just removing the word
"silently"), and I suggest further improvements to be sent as
follow up patches.

diff --git a/docs/nvdimm.txt b/docs/nvdimm.txt
index bcd1456e72..b531cacd35 100644
--- a/docs/nvdimm.txt
+++ b/docs/nvdimm.txt
@@ -159,8 +159,8 @@ If these conditions are not satisfied i.e. if either 'pmem' 
or 'share'
 are not set, if the backend file does not support DAX or if MAP_SYNC
 is not supported by the host kernel, write persistence is not
 guaranteed after a system crash. For compatibility reasons, these
-conditions are silently ignored if not satisfied. Currently, no way
-is provided to test for them.
+conditions are ignored if not satisfied. Currently, no way is
+provided to test for them.
 For more details, please reference mmap(2) man page:
 http://man7.org/linux/man-pages/man2/mmap.2.html.
 
-- 
Eduardo

[Qemu-devel] [Bug 1615079] Re: GTK+ UI virtual consoles scrolling broken

2019-04-22 Thread Bug Report

VTE is required for this to work?

if yes, the configure script should have info about the dependency

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1615079

Title:
  GTK+ UI virtual consoles scrolling broken

Status in QEMU:
  New

Bug description:
  "In the virtual consoles, you can use Ctrl-Up, Ctrl-Down, Ctrl-PageUp
  and Ctrl-PageDown to move in the back log."

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1615079/+subscriptions

Re: [Qemu-devel] [PATCH v2 3/3] edu: uses uint64_t in dma operation

2019-04-22 Thread Philippe Mathieu-Daudé

On 4/22/19 3:21 AM, Li Qiang wrote:
> 
> 
> Philippe Mathieu-Daudé mailto:phi...@redhat.com>> 于
> 2019年4月21日周日 下午6:32写道：
> 
> On 4/20/19 6:14 PM, Li Qiang wrote:
> > The dma related variable is dma_addr_t, it is uint64_t in
> > x64 platform. Change these usage from uint32_to uint64_t to
> > avoid trancation.
> 
> "to avoid address truncation"?
> 
> 
> 
> The dma.dst/src/cnt..is from guest and is 64-bits. But in 'edu_dma_timer',
> it is assigned to uint32_t, If it is 0x , it will be ok
> by the check
> but it is of course not allowed. 
> 
> Though this is just an edu device, I think we should avoid this. 

I agree with you, I was just trying to correct your English ;)

> Thanks,
> Li Qiang
> 
>  
> 
> >
> > Signed-off-by: Li Qiang mailto:liq...@163.com>>
> > ---
> > Change since v1:
> > Fix format compile error on Windows
> >
> >  hw/misc/edu.c | 15 ---
> >  1 file changed, 8 insertions(+), 7 deletions(-)
> >
> > diff --git a/hw/misc/edu.c b/hw/misc/edu.c
> > index 4018dddcb8..f4a6d5f1c5 100644
> > --- a/hw/misc/edu.c
> > +++ b/hw/misc/edu.c
> > @@ -98,23 +98,24 @@ static void edu_lower_irq(EduState *edu,
> uint32_t val)
> >      }
> >  }
> > 
> > -static bool within(uint32_t addr, uint32_t start, uint32_t end)
> > +static bool within(uint64_t addr, uint64_t start, uint64_t end)
> 
> OK.
> 
> >  {
> >      return start <= addr && addr < end;
> >  }
> > 
> > -static void edu_check_range(uint32_t addr, uint32_t size1,
> uint32_t start,
> > +static void edu_check_range(uint64_t addr, uint64_t size1,
> uint64_t start,
> >                  uint32_t size2)
> 
> OK for addr. MMIO range is 1MiB so you can keep uint32_t for
> size1/size2. Up to the maintainer (personally I'd prefer keep u32).
> 
> Reviewed-by: Philippe Mathieu-Daudé  >
> 
> >  {
> > -    uint32_t end1 = addr + size1;
> > -    uint32_t end2 = start + size2;
> > +    uint64_t end1 = addr + size1;
> > +    uint64_t end2 = start + size2;
> > 
> >      if (within(addr, start, end2) &&
> >              end1 > addr && within(end1, start, end2)) {
> >          return;
> >      }
> > 
> > -    hw_error("EDU: DMA range 0x%.8x-0x%.8x out of bounds
> (0x%.8x-0x%.8x)!",
> > +    hw_error("EDU: DMA range 0x%016"PRIx64"-0x%016"PRIx64
> > +             " out of bounds (0x%016"PRIx64"-0x%016"PRIx64")!",
> >              addr, end1 - 1, start, end2 - 1);
> >  }
> > 
> > @@ -139,13 +140,13 @@ static void edu_dma_timer(void *opaque)
> >      }
> > 
> >      if (EDU_DMA_DIR(edu->dma.cmd) == EDU_DMA_FROM_PCI) {
> > -        uint32_t dst = edu->dma.dst;
> > +        uint64_t dst = edu->dma.dst;
> >          edu_check_range(dst, edu->dma.cnt, DMA_START, DMA_SIZE);
> >          dst -= DMA_START;
> >          pci_dma_read(&edu->pdev, edu_clamp_addr(edu, edu->dma.src),
> >                  edu->dma_buf + dst, edu->dma.cnt);
> >      } else {
> > -        uint32_t src = edu->dma.src;
> > +        uint64_t src = edu->dma.src;
> >          edu_check_range(src, edu->dma.cnt, DMA_START, DMA_SIZE);
> >          src -= DMA_START;
> >          pci_dma_write(&edu->pdev, edu_clamp_addr(edu, edu->dma.dst),
> >
>

Re: [Qemu-devel] [PATCH v3 2/3] edu: mmio: allow 64-bit access in read dispatch

2019-04-22 Thread Philippe Mathieu-Daudé

On 4/22/19 4:11 PM, Li Qiang wrote:
> The edu spec says when address >= 0x80, the MMIO area can
> be accessed by 64-bit.
> 
> Signed-off-by: Li Qiang 

Reviewed-by: Philippe Mathieu-Daude 

> ---
> Change since v2:
> Fix an error per Phillippe's advice
> 
>  hw/misc/edu.c | 6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/misc/edu.c b/hw/misc/edu.c
> index 65fc32b928..33de05141f 100644
> --- a/hw/misc/edu.c
> +++ b/hw/misc/edu.c
> @@ -185,7 +185,11 @@ static uint64_t edu_mmio_read(void *opaque, hwaddr addr, 
> unsigned size)
>  EduState *edu = opaque;
>  uint64_t val = ~0ULL;
>  
> -if (size != 4) {
> +if (addr < 0x80 && size != 4) {
> +return val;
> +}
> +
> +if (addr >= 0x80 && size != 4 && size != 8) {
>  return val;
>  }
>  
>

Re: [Qemu-devel] [PATCH v2 1/3] edu: mmio: set 'max_access_size' to 8

2019-04-22 Thread Philippe Mathieu-Daudé

On 4/22/19 3:17 AM, Li Qiang wrote:
> 
> 
> Philippe Mathieu-Daudé mailto:phi...@redhat.com>> 于
> 2019年4月21日周日 下午6:28写道：
> 
> Hi Li,
> 
> The patch title is not very descriptive, maybe "allow 64-bit access"
> 
> 
> On 4/20/19 6:14 PM, Li Qiang wrote:
> > The edu spec said, the MMIO area can be accessed by 8 bytes.
> 
> or 64-bit...
> 
> > However currently the 'max_access_size' is not so the MMIO
> > access dispatch can only access 4 bytes one time. This patch
> 
> 32-bit
> 
> > fixes this to respect the spec.
> >
> > Notice: here the 'min_access_size' is not a must, I set this
> > for completement.
> 
> Which one? valid/impl? I think you can drop this comment from the commit
> description.
> 
> 
> Both needed. from memory_access_size, if we has no valid.max_access_size,
> this function will set it to 4.
> 
> static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
> {
>     unsigned access_size_max = mr->ops->valid.max_access_size;
> 
>     /* Regions are assumed to support 1-4 byte accesses unless
>        otherwise specified.  */
>     if (access_size_max == 0) {
>         access_size_max = 4;
>     }
>    ...
> } 
> 
> From access_with_adjusted_size, if we has no impl.max_access_size,
> this function will set it to 4.
> 
> ps: I will appreciate if anyone can explain what's the meaning of valid
> and impl's min/max_access_size
> and how it affects the behavior.

"valid" describes the valid access from the bus to the device.

Indeed in the EDU case those are 4 and 8.

"impl" describes the accesses implemented by the QEMU device model.
The developper who writes the device is free to choose the accesses he
will model.

If valid/impl accesses don't match, the function
access_with_adjusted_size() from memory.c will adjust the bus access to
the device implementation.

For example, if the device only implements 1 and 2 bytes accesses, with
a 1-4 valid access, if the CPU executes a 32-bit access, this function
will do 2x 16-bit access to the device (incrementing the address by 2)
and returns a 32-bit result.

Similarly, if the CPU does a 8-bit access on a 32-bit impl device,
access_with_adjusted_size() will execute a single 32-bit access to the
device, then mask/shift the returned value and returns a 8-bit result to
the caller.

> Thanks,
> Li Qiang
> 
> >
> > Signed-off-by: Li Qiang mailto:liq...@163.com>>
> > ---
> >  hw/misc/edu.c | 9 +
> >  1 file changed, 9 insertions(+)
> >
> > diff --git a/hw/misc/edu.c b/hw/misc/edu.c
> > index 91af452c9e..65fc32b928 100644
> > --- a/hw/misc/edu.c
> > +++ b/hw/misc/edu.c
> > @@ -289,6 +289,15 @@ static const MemoryRegionOps edu_mmio_ops = {
> >      .read = edu_mmio_read,
> >      .write = edu_mmio_write,
> >      .endianness = DEVICE_NATIVE_ENDIAN,
> > +    .valid = {
> > +        .min_access_size = 4,
> 
> Per the spec, this is correct.
> 
> > +        .max_access_size = 8,
> 
> Correct.
> 
> > +    },
> > +    .impl = {
> > +        .min_access_size = 4,
> 
> OK.
> 
> > +        .max_access_size = 8,
> 
> Correct.
> 
> > +    },
> > +
> >  };
> > 
> >  /*
> >
> 
> With title/description updated:
> Reviewed-by: Philippe Mathieu-Daudé  >
>

Re: [Qemu-devel] [PATCH] hvf: Add missing break statement

2019-04-22 Thread Philippe Mathieu-Daudé

On 4/22/19 5:42 AM, Chen Zhang via Qemu-devel wrote:
> In target/i386/hvf/hvf.c, a break statement was probably missing in 
> `hvf_vcpu_exec()`, in handling EXIT_REASON_HLT.
> 
> These lines seemed to be equivalent to `kvm_handle_halt()`.
> 

Fixes: c97d6d2cdf97

> Signed-off-by: Chen Zhang 
> ---
>  target/i386/hvf/hvf.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
> index 42f9447303..2751c8125c 100644
> --- a/target/i386/hvf/hvf.c
> +++ b/target/i386/hvf/hvf.c
> @@ -708,6 +708,7 @@ int hvf_vcpu_exec(CPUState *cpu)
>  !(idtvec_info & VMCS_IDT_VEC_VALID)) {
>  cpu->halted = 1;
>  ret = EXCP_HLT;
> +break;

Oops... I'm surprised no compiler warned about this yet...

This probably mean:
- This code is not covered by Continuous Integration
- Upstream maintainers are not building this code
- Upstream is not running this code

Please tell me I'm wrong!

Meanwhile:
Reviewed-by: Philippe Mathieu-Daudé 

>  }
>  ret = EXCP_INTERRUPT;
>  break;
>

Re: [Qemu-devel] [PATCH v5 1/6] libnvdimm: nd_region flush callback support

2019-04-22 Thread Jeff Moyer

Dan Williams  writes:

> On Thu, Apr 18, 2019 at 9:18 AM Christoph Hellwig  wrote:
>>
>> On Thu, Apr 18, 2019 at 09:05:05AM -0700, Dan Williams wrote:
>> > > > I'd either add a comment about avoiding retpoline overhead here or just
>> > > > make ->flush == NULL mean generic_nvdimm_flush(). Just so that people 
>> > > > don't
>> > > > get confused by the code.
>> > >
>> > > Isn't this premature optimization?  I really don't like adding things
>> > > like this without some numbers to show it's worth it.
>> >
>> > I don't think it's premature given this optimization technique is
>> > already being deployed elsewhere, see:
>> >
>> > https://lwn.net/Articles/774347/
>>
>> For one this one was backed by numbers, and second after feedback
>> from Linux we switched to the NULL pointer check instead.
>
> Ok I should have noticed the switch to NULL pointer check. However,
> the question still stands do we want everyone to run numbers to
> justify this optimization, or make it a new common kernel coding
> practice to do:
>
> if (!object->op)
> generic_op(object);
> else
> object->op(object);
>
> ...in hot paths?

I don't think nvdimm_flush is a hot path.  Numbers of some
representative workload would prove one of us right.

-Jeff

[Qemu-devel] [Bug 1824768] Re: Qemu ARMv7 TCG MultiThreading for i386 guest doesn't work

2019-04-22 Thread TimeMaster

>> ./qemu-system-i386 -cdrom alpine.iso --accel tcg,thread=multi

I tried both, with -smp 4 and without..

In stable Qemu release, there is just a guest kernel panic, in
v4.0.0-rc3-dirty, there is also Illegal instruction error reported

> on an x-gene system, compiled for aarch32.
Yes, this should be Raspberry Pi 3, but I am not sure. Anyway, I can test this 
in one week when I receive one of those boards.

>Just so we're clear, which raspberry pi revision are you using?
>Based on comments I'm assuming pi 2, with the 4 core cortex-a7.

I am using Raspberry Pi 2, 4 cores, not sure where is the difference
between cortex-a7 and ARMv7, in my case it is just ARMv7.

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1824768

Title:
  Qemu ARMv7 TCG MultiThreading for i386 guest doesn't work

Status in QEMU:
  Incomplete

Bug description:
  Using any Linux image (in this case Alpine Linux iso) I want to
  utilise all cores of my Raspberry with --accel,thread=multi. I know
  there is a probably still problem with memory ordering of the host,
  but I have also seen some very old commits which could potentially
  help with it.

  But anyway, with version qemu-i386 version 3.1.0 (Debian 1:3.1+dfsg-7)
  I can see OpenRC starting up services and then the kernel crash.

  With version QEMU emulator version 3.1.93 (v4.0.0-rc3-dirty)
  The whole machine crash with this error:
  Illegal instruction

  Using command:
  ./qemu-system-i386 -cdrom alpine.iso --accel tcg,thread=multi

  Full Console Output:
  qemu-system-i386: warning: Guest expects a stronger memory ordering than the 
host provides
  This may cause strange/hard to debug errors
  Illegal instruction

  Kernel:
  Linux raspberrypi 4.14.98-v7+ #1200 SMP Tue Feb 12 20:27:48 GMT 2019 armv7l 
GNU/Linux

  CPU:
  ARMv7 Processor rev 5 (v7l)
  Features: half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt 
vfpd32 lpae evtstrm
  4 cores

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1824768/+subscriptions

[Qemu-devel] [PATCH v4 0/3] Fix overflow bug in qcow2 discard

2019-04-22 Thread Vladimir Sementsov-Ogievskiy

v4: now based on Kevin's block-next
01: add Eric's r-b
02: - s/which/where/ [Eric, hope, I fixed correct one which from two:)]
- s/it's/its
add Eric's r-b
03: - r-b and t-b by Eric
- grammar
- drop trace from test

v3: don't filter mapping info from qemu-img map output, otherwise
it don't show what I try to check [sorry for extra noise in list]

v2: [mostly by Eric's review]
01: new
02: point to bug introducing commit in cover letter [Eric]
 [but I failed to compile it, to check]
drop s/INT_MAX/BDRV_REQUEST_MAX_BYTES/ chunk
03: - improve wording
- cheating with preallocation=metadata and discards
  to make test quick and not eating disk space
- use new trace-point
- move it to be 250 iotest
- filter out extra qemu-img info output

Based-on: git://repo.or.cz/qemu/kevin.git block-next

Vladimir Sementsov-Ogievskiy (3):
  block/qcow2-refcount: add trace-point to qcow2_process_discards
  block/io: bdrv_pdiscard: support int64_t bytes parameter
  iotests: test big qcow2 shrink

 include/block/block.h  |  4 +--
 block/io.c | 16 -
 block/qcow2-refcount.c |  7 +++-
 block/trace-events |  3 ++
 tests/qemu-iotests/250 | 72 ++
 tests/qemu-iotests/250.out | 21 +++
 tests/qemu-iotests/group   |  1 +
 7 files changed, 113 insertions(+), 11 deletions(-)
 create mode 100755 tests/qemu-iotests/250
 create mode 100644 tests/qemu-iotests/250.out

-- 
2.18.0

[Qemu-devel] [PATCH v4 1/3] block/qcow2-refcount: add trace-point to qcow2_process_discards

2019-04-22 Thread Vladimir Sementsov-Ogievskiy

Let's at least trace ignored failure.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Eric Blake 
---
 block/qcow2-refcount.c | 7 ++-
 block/trace-events | 3 +++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index e0fe322500..60284bcaac 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -30,6 +30,7 @@
 #include "qemu/range.h"
 #include "qemu/bswap.h"
 #include "qemu/cutils.h"
+#include "trace.h"
 
 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
 uint64_t max);
@@ -738,7 +739,11 @@ void qcow2_process_discards(BlockDriverState *bs, int ret)
 
 /* Discard is optional, ignore the return value */
 if (ret >= 0) {
-bdrv_pdiscard(bs->file, d->offset, d->bytes);
+int r2 = bdrv_pdiscard(bs->file, d->offset, d->bytes);
+if (r2 < 0) {
+trace_qcow2_process_discards_failed_region(d->offset, d->bytes,
+   r2);
+}
 }
 
 g_free(d);
diff --git a/block/trace-events b/block/trace-events
index 7335a42540..ea508f637e 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -91,6 +91,9 @@ qcow2_cache_get_done(void *co, int c, int i) "co %p 
is_l2_cache %d index %d"
 qcow2_cache_flush(void *co, int c) "co %p is_l2_cache %d"
 qcow2_cache_entry_flush(void *co, int c, int i) "co %p is_l2_cache %d index %d"
 
+# qcow2-refcount.c
+qcow2_process_discards_failed_region(uint64_t offset, uint64_t bytes, int ret) 
"offset 0x%" PRIx64 " bytes 0x%" PRIx64 " ret %d"
+
 # qed-l2-cache.c
 qed_alloc_l2_cache_entry(void *l2_cache, void *entry) "l2_cache %p entry %p"
 qed_unref_l2_cache_entry(void *entry, int ref) "entry %p ref %d"
-- 
2.18.0

[Qemu-devel] [PATCH v4 2/3] block/io: bdrv_pdiscard: support int64_t bytes parameter

2019-04-22 Thread Vladimir Sementsov-Ogievskiy

This fixes at least one overflow in qcow2_process_discards, which
passes 64bit region length to bdrv_pdiscard where bytes (or sectors in
the past) parameter is int since its introduction in 0b919fae.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Eric Blake 
---
 include/block/block.h |  4 ++--
 block/io.c| 16 
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index c7a26199aa..69fa18867e 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -432,8 +432,8 @@ void bdrv_drain_all(void);
 AIO_WAIT_WHILE(bdrv_get_aio_context(bs_),  \
cond); })
 
-int bdrv_pdiscard(BdrvChild *child, int64_t offset, int bytes);
-int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes);
+int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
+int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
 int bdrv_has_zero_init_1(BlockDriverState *bs);
 int bdrv_has_zero_init(BlockDriverState *bs);
 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs);
diff --git a/block/io.c b/block/io.c
index dfc153b8d8..35c4157669 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2653,7 +2653,7 @@ int bdrv_flush(BlockDriverState *bs)
 typedef struct DiscardCo {
 BdrvChild *child;
 int64_t offset;
-int bytes;
+int64_t bytes;
 int ret;
 } DiscardCo;
 static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
@@ -2664,14 +2664,15 @@ static void coroutine_fn bdrv_pdiscard_co_entry(void 
*opaque)
 aio_wait_kick();
 }
 
-int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes)
+int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
+  int64_t bytes)
 {
 BdrvTrackedRequest req;
 int max_pdiscard, ret;
 int head, tail, align;
 BlockDriverState *bs = child->bs;
 
-if (!bs || !bs->drv) {
+if (!bs || !bs->drv || !bdrv_is_inserted(bs)) {
 return -ENOMEDIUM;
 }
 
@@ -2679,9 +2680,8 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, 
int64_t offset, int bytes)
 return -EPERM;
 }
 
-ret = bdrv_check_byte_request(bs, offset, bytes);
-if (ret < 0) {
-return ret;
+if (offset < 0) {
+return -EIO;
 }
 
 /* Do nothing if disabled.  */
@@ -2716,7 +2716,7 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, 
int64_t offset, int bytes)
 assert(max_pdiscard >= bs->bl.request_alignment);
 
 while (bytes > 0) {
-int num = bytes;
+int64_t num = bytes;
 
 if (head) {
 /* Make small requests to get to alignment boundaries. */
@@ -2778,7 +2778,7 @@ out:
 return ret;
 }
 
-int bdrv_pdiscard(BdrvChild *child, int64_t offset, int bytes)
+int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes)
 {
 Coroutine *co;
 DiscardCo rwco = {
-- 
2.18.0

[Qemu-devel] [PATCH v4 3/3] iotests: test big qcow2 shrink

2019-04-22 Thread Vladimir Sementsov-Ogievskiy

This test checks bug in qcow2_process_discards, fixed by previous
commit.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Eric Blake 
Tested-by: Eric Blake 
---
 tests/qemu-iotests/250 | 72 ++
 tests/qemu-iotests/250.out | 21 +++
 tests/qemu-iotests/group   |  1 +
 3 files changed, 94 insertions(+)
 create mode 100755 tests/qemu-iotests/250
 create mode 100644 tests/qemu-iotests/250.out

diff --git a/tests/qemu-iotests/250 b/tests/qemu-iotests/250
new file mode 100755
index 00..315054f765
--- /dev/null
+++ b/tests/qemu-iotests/250
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+#
+# Test big discard in qcow2 shrink
+#
+# Copyright (c) 2019 Virtuozzo International GmbH. All rights reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see .
+#
+
+# creator
+owner=vsement...@virtuozzo.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+status=1   # failure is the default!
+
+_cleanup()
+{
+_cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt qcow2
+_supported_proto file
+_supported_os Linux
+
+# This test checks that qcow2_process_discards does not truncate a discard
+# request > 2G.
+# To reproduce bug we need to overflow int by one sequential discard, so we
+# need size > 2G, bigger cluster size (as with default 64k we may have maximum
+# of 512M sequential data, corresponding to one L1 entry), and we need some
+# data of the beginning of the disk mapped to the end of file to prevent
+# bdrv_co_truncate(bs->file) call in qcow2_co_truncate(), which might succeed
+# anyway.
+
+size=2100M
+IMGOPTS="cluster_size=1M,preallocation=metadata"
+
+_make_test_img $size
+$QEMU_IO -c 'discard 0 10M' -c 'discard 2090M 10M' \
+ -c 'write 2090M 10M' -c 'write 0 10M' "$TEST_IMG" | _filter_qemu_io
+
+# Check that our trick with swapping first and last 10M chunks succeeded.
+# Otherwise test may pass even if bdrv_pdiscard() fails in
+# qcow2_process_discards()
+$QEMU_IMG map "$TEST_IMG" | _filter_testdir
+$QEMU_IMG info "$TEST_IMG" | grep size |  _filter_testdir
+
+$QEMU_IMG resize --shrink "$TEST_IMG" 5M
+
+$QEMU_IMG info "$TEST_IMG" | grep size | _filter_testdir
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/250.out b/tests/qemu-iotests/250.out
new file mode 100644
index 00..a7076efc0c
--- /dev/null
+++ b/tests/qemu-iotests/250.out
@@ -0,0 +1,21 @@
+QA output created by 250
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=2202009600 
preallocation=metadata
+discard 10485760/10485760 bytes at offset 0
+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+discard 10485760/10485760 bytes at offset 2191523840
+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 10485760/10485760 bytes at offset 2191523840
+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 10485760/10485760 bytes at offset 0
+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+Offset  Length  Mapped to   File
+0   0xa00x82f0  TEST_DIR/t.qcow2
+0x82a0  0xa00x50TEST_DIR/t.qcow2
+virtual size: 2.05 GiB (2202009600 bytes)
+disk size: 24 MiB
+cluster_size: 1048576
+Image resized.
+virtual size: 5 MiB (5242880 bytes)
+disk size: 9 MiB
+cluster_size: 1048576
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index bae7718380..588ae8b8b1 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -248,3 +248,4 @@
 246 rw auto quick
 247 rw auto quick
 248 rw auto quick
+250 rw auto quick
-- 
2.18.0

Re: [Qemu-devel] [PATCH v3 3/3] iotests: test big qcow2 shrink

2019-04-22 Thread Vladimir Sementsov-Ogievskiy

22.04.2019 16:59, Eric Blake wrote:
> On 4/19/19 9:05 AM, Vladimir Sementsov-Ogievskiy wrote:
>> This test checks bug in qcow2_process_discards, fixed by previous
>> commit.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy 
>> ---
> 
>> +# This test checks that qcow2_process_discards does not truncate a discard
>> +# request > 2G.
>> +# To reproduce bug we need to overflow int by one sequential discard, so we
>> +# need size > 2G, bigger cluster size (as with default 64k we may have 
>> maximum
>> +# of 512M sequential data, corresponding to one L1 entry), and we need some
>> +# data of the beginning of the disk mapped to the end of file to prevent
>> +# bdrv_co_truncate(bs->file) call in qcow2_co_truncate(), which may success
> 
> s/may success/might succeed/
> 
>> +# anyway.
>> +
>> +size=2100M
>> +IMGOPTS="cluster_size=1M,preallocation=metadata"
>> +
>> +_make_test_img $size
>> +$QEMU_IO -c 'discard 0 10M' -c 'discard 2090M 10M' \
>> + -c 'write 2090M 10M' -c 'write 0 10M' "$TEST_IMG" | _filter_qemu_io
>> +
>> +# Check that our trick with swapping first and last 10M chunks succeeded.
>> +# Otherwise test will may pass even if bdrv_pdiscard() fails in
>> +# qcow2_process_discards()
>> +$QEMU_IMG map "$TEST_IMG" | _filter_testdir
>> +$QEMU_IMG info "$TEST_IMG" | grep size |  _filter_testdir
> 
> Nice - that's a lot faster than v1! And makes the test fit in the quick
> group after all.
> 
>> +
>> +$QEMU_IMG -T 'qcow2_process_discards_failed*' resize --shrink "$TEST_IMG" 5M
> 
> However, I'm quite certain that trace output is not reliable in iotests.
> Depending on configure options, traces might not be enabled at all, or
> might not go to stderr. Drop the -T '...'.  Even without the trace line,
> 
>> +
>> +$QEMU_IMG info "$TEST_IMG" | grep size | _filter_testdir
> 
> this second image info is sufficient to prove whether the resize had an
> effect (post-patch) or exposes the bug (without patch 2/3). That is,
> applying this patch but not 2/3, I see this (with my cleanups to
> qemu-img info in place, from Kevin's block-next branch):
> 
> +++ /home/eblake/qemu/tests/qemu-iotests/250.out.bad  2019-04-22
> 08:52:26.072968731 -0500
> @@ -14,8 +14,9 @@
>   virtual size: 2.05 GiB (2202009600 bytes)
>   disk size: 24 MiB
>   cluster_size: 1048576
> +18274@1555941145.999195:qcow2_process_discards_failed_region offset
> 0x50 bytes 0x82a0 ret -5
>   Image resized.
>   virtual size: 5 MiB (5242880 bytes)
> -disk size: 9.01 MiB
> +disk size: 19 MiB
> 
> where the trace did indeed show show that we had a bug, but where even
> without the trace, the difference in size between 19M with incomplete
> discards vs. 9M with patch 2/3 is enough to show that patch 2/3 fixes a bug.
> 
>> +virtual size: 2.1G (2202009600 bytes)
>> +disk size: 24M
>> +cluster_size: 1048576
>> +Image resized.
>> +virtual size: 5.0M (5242880 bytes)
>> +disk size: 9.0M
>> +cluster_size: 1048576
> 
> When Kevin's block-next branch is applied, you'll need this squashed in:
> 
> diff --git a/tests/qemu-iotests/250.out b/tests/qemu-iotests/250.out
> index 37e37f0c9e7..d1c1c7180b5 100644
> --- a/tests/qemu-iotests/250.out
> +++ b/tests/qemu-iotests/250.out
> @@ -11,11 +11,11 @@ wrote 10485760/10485760 bytes at offset 0
>   Offset  Length  Mapped to   File
>   0   0xa00x82f0  TEST_DIR/t.qcow2
>   0x82a0  0xa00x50TEST_DIR/t.qcow2
> -virtual size: 2.1G (2202009600 bytes)
> -disk size: 24M
> +virtual size: 2.05 GiB (2202009600 bytes)
> +disk size: 24 MiB
>   cluster_size: 1048576
>   Image resized.
> -virtual size: 5.0M (5242880 bytes)
> -disk size: 9.0M
> +virtual size: 5 MiB (5242880 bytes)
> +disk size: 9.01 MiB
>   cluster_size: 1048576
>   *** done
> 
> With the updated output to match changes to qemu-img output, the grammar
> fixes, and with the -T option removed,
> 
> Reviewed-by: Eric Blake 
> Tested-by: Eric Blake 
> 

Thank you!

I'll resend soon.

-- 
Best regards,
Vladimir

[Qemu-devel] [PATCH 7/9] block/commit: use buffer-based io

2019-04-22 Thread Vladimir Sementsov-Ogievskiy

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/commit.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/block/commit.c b/block/commit.c
index ba60fef58a..08204fa6f8 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -48,16 +48,15 @@ static int coroutine_fn commit_populate(BlockBackend *bs, 
BlockBackend *base,
 void *buf)
 {
 int ret = 0;
-QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
 
 assert(bytes < SIZE_MAX);
 
-ret = blk_co_preadv(bs, offset, qiov.size, &qiov, 0);
+ret = blk_co_pread(bs, offset, bytes, buf, 0);
 if (ret < 0) {
 return ret;
 }
 
-ret = blk_co_pwritev(base, offset, qiov.size, &qiov, 0);
+ret = blk_co_pwrite(base, offset, bytes, buf, 0);
 if (ret < 0) {
 return ret;
 }
-- 
2.18.0

[Qemu-devel] [PATCH 8/9] block/stream: use buffer-based io

2019-04-22 Thread Vladimir Sementsov-Ogievskiy

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/stream.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/block/stream.c b/block/stream.c
index bfaebb861a..1a906fd860 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -42,12 +42,10 @@ static int coroutine_fn stream_populate(BlockBackend *blk,
 int64_t offset, uint64_t bytes,
 void *buf)
 {
-QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
-
 assert(bytes < SIZE_MAX);
 
 /* Copy-on-read the unallocated clusters */
-return blk_co_preadv(blk, offset, qiov.size, &qiov, BDRV_REQ_COPY_ON_READ);
+return blk_co_pread(blk, offset, bytes, buf, BDRV_REQ_COPY_ON_READ);
 }
 
 static void stream_abort(Job *job)
-- 
2.18.0

[Qemu-devel] [PATCH 4/9] block/qed: use buffer-based io

2019-04-22 Thread Vladimir Sementsov-Ogievskiy

Move to _co_ versions of io functions qed_read_table() and
qed_write_table(), as we use qemu_co_mutex_unlock()
anyway.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qed-table.c | 12 +---
 block/qed.c   |  6 ++
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/block/qed-table.c b/block/qed-table.c
index c497bd4aec..cf30edd977 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -21,22 +21,22 @@
 /* Called with table_lock held.  */
 static int qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table)
 {
-QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(
-qiov, table->offsets, s->header.cluster_size * s->header.table_size);
+unsigned int bytes = s->header.cluster_size * s->header.table_size;
+
 int noffsets;
 int i, ret;
 
 trace_qed_read_table(s, offset, table);
 
 qemu_co_mutex_unlock(&s->table_lock);
-ret = bdrv_preadv(s->bs->file, offset, &qiov);
+ret = bdrv_co_pread(s->bs->file, offset, bytes, table->offsets, 0);
 qemu_co_mutex_lock(&s->table_lock);
 if (ret < 0) {
 goto out;
 }
 
 /* Byteswap offsets */
-noffsets = qiov.size / sizeof(uint64_t);
+noffsets = bytes / sizeof(uint64_t);
 for (i = 0; i < noffsets; i++) {
 table->offsets[i] = le64_to_cpu(table->offsets[i]);
 }
@@ -66,7 +66,6 @@ static int qed_write_table(BDRVQEDState *s, uint64_t offset, 
QEDTable *table,
 unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1;
 unsigned int start, end, i;
 QEDTable *new_table;
-QEMUIOVector qiov;
 size_t len_bytes;
 int ret;
 
@@ -79,7 +78,6 @@ static int qed_write_table(BDRVQEDState *s, uint64_t offset, 
QEDTable *table,
 len_bytes = (end - start) * sizeof(uint64_t);
 
 new_table = qemu_blockalign(s->bs, len_bytes);
-qemu_iovec_init_buf(&qiov, new_table->offsets, len_bytes);
 
 /* Byteswap table */
 for (i = start; i < end; i++) {
@@ -91,7 +89,7 @@ static int qed_write_table(BDRVQEDState *s, uint64_t offset, 
QEDTable *table,
 offset += start * sizeof(uint64_t);
 
 qemu_co_mutex_unlock(&s->table_lock);
-ret = bdrv_pwritev(s->bs->file, offset, &qiov);
+ret = bdrv_co_pwrite(s->bs->file, offset, len_bytes, new_table->offsets, 
0);
 qemu_co_mutex_lock(&s->table_lock);
 trace_qed_write_table_cb(s, table, flush, ret);
 if (ret < 0) {
diff --git a/block/qed.c b/block/qed.c
index 89af05d524..912edaf56a 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -113,15 +113,13 @@ static int coroutine_fn qed_write_header(BDRVQEDState *s)
 int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE);
 size_t len = nsectors * BDRV_SECTOR_SIZE;
 uint8_t *buf;
-QEMUIOVector qiov;
 int ret;
 
 assert(s->allocating_acb || s->allocating_write_reqs_plugged);
 
 buf = qemu_blockalign(s->bs, len);
-qemu_iovec_init_buf(&qiov, buf, len);
 
-ret = bdrv_co_preadv(s->bs->file, 0, qiov.size, &qiov, 0);
+ret = bdrv_co_pread(s->bs->file, 0, len, buf, 0);
 if (ret < 0) {
 goto out;
 }
@@ -129,7 +127,7 @@ static int coroutine_fn qed_write_header(BDRVQEDState *s)
 /* Update header */
 qed_header_cpu_to_le(&s->header, (QEDHeader *) buf);
 
-ret = bdrv_co_pwritev(s->bs->file, 0, qiov.size,  &qiov, 0);
+ret = bdrv_co_pwrite(s->bs->file, 0, len,  buf, 0);
 if (ret < 0) {
 goto out;
 }
-- 
2.18.0

[Qemu-devel] [PATCH 3/9] block/qcow: use buffer-based io

2019-04-22 Thread Vladimir Sementsov-Ogievskiy

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow.c | 19 ++-
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index 10d2cf14b3..1bb8fd05e2 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -631,7 +631,6 @@ static coroutine_fn int qcow_co_preadv(BlockDriverState 
*bs, uint64_t offset,
 int offset_in_cluster;
 int ret = 0, n;
 uint64_t cluster_offset;
-QEMUIOVector hd_qiov;
 uint8_t *buf;
 void *orig_buf;
 
@@ -663,11 +662,10 @@ static coroutine_fn int qcow_co_preadv(BlockDriverState 
*bs, uint64_t offset,
 if (!cluster_offset) {
 if (bs->backing) {
 /* read from the base image */
-qemu_iovec_init_buf(&hd_qiov, buf, n);
 qemu_co_mutex_unlock(&s->lock);
 /* qcow2 emits this on bs->file instead of bs->backing */
 BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
-ret = bdrv_co_preadv(bs->backing, offset, n, &hd_qiov, 0);
+ret = bdrv_co_pread(bs->backing, offset, n, buf, 0);
 qemu_co_mutex_lock(&s->lock);
 if (ret < 0) {
 break;
@@ -688,11 +686,10 @@ static coroutine_fn int qcow_co_preadv(BlockDriverState 
*bs, uint64_t offset,
 ret = -EIO;
 break;
 }
-qemu_iovec_init_buf(&hd_qiov, buf, n);
 qemu_co_mutex_unlock(&s->lock);
 BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
-ret = bdrv_co_preadv(bs->file, cluster_offset + offset_in_cluster,
- n, &hd_qiov, 0);
+ret = bdrv_co_pread(bs->file, cluster_offset + offset_in_cluster,
+n, buf, 0);
 qemu_co_mutex_lock(&s->lock);
 if (ret < 0) {
 break;
@@ -731,7 +728,6 @@ static coroutine_fn int qcow_co_pwritev(BlockDriverState 
*bs, uint64_t offset,
 int offset_in_cluster;
 uint64_t cluster_offset;
 int ret = 0, n;
-QEMUIOVector hd_qiov;
 uint8_t *buf;
 void *orig_buf;
 
@@ -776,11 +772,10 @@ static coroutine_fn int qcow_co_pwritev(BlockDriverState 
*bs, uint64_t offset,
 }
 }
 
-qemu_iovec_init_buf(&hd_qiov, buf, n);
 qemu_co_mutex_unlock(&s->lock);
 BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
-ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster,
-  n, &hd_qiov, 0);
+ret = bdrv_co_pwrite(bs->file, cluster_offset + offset_in_cluster,
+ n, buf, 0);
 qemu_co_mutex_lock(&s->lock);
 if (ret < 0) {
 break;
@@ -1056,7 +1051,6 @@ qcow_co_pwritev_compressed(BlockDriverState *bs, uint64_t 
offset,
uint64_t bytes, QEMUIOVector *qiov)
 {
 BDRVQcowState *s = bs->opaque;
-QEMUIOVector hd_qiov;
 z_stream strm;
 int ret, out_len;
 uint8_t *buf, *out_buf;
@@ -1122,9 +1116,8 @@ qcow_co_pwritev_compressed(BlockDriverState *bs, uint64_t 
offset,
 }
 cluster_offset &= s->cluster_offset_mask;
 
-qemu_iovec_init_buf(&hd_qiov, out_buf, out_len);
 BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
-ret = bdrv_co_pwritev(bs->file, cluster_offset, out_len, &hd_qiov, 0);
+ret = bdrv_co_pwrite(bs->file, cluster_offset, out_len, out_buf, 0);
 if (ret < 0) {
 goto fail;
 }
-- 
2.18.0

[Qemu-devel] [PATCH 9/9] qemu-img: use buffer-based io

2019-04-22 Thread Vladimir Sementsov-Ogievskiy

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qemu-img.c | 13 -
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index aa6f81f1ea..c40a4e8b83 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1671,7 +1671,6 @@ static int coroutine_fn convert_co_read(ImgConvertState 
*s, int64_t sector_num,
 int nb_sectors, uint8_t *buf)
 {
 int n, ret;
-QEMUIOVector qiov;
 
 assert(nb_sectors <= s->buf_sectors);
 while (nb_sectors > 0) {
@@ -1687,11 +1686,10 @@ static int coroutine_fn convert_co_read(ImgConvertState 
*s, int64_t sector_num,
 bs_sectors = s->src_sectors[src_cur];
 
 n = MIN(nb_sectors, bs_sectors - (sector_num - src_cur_offset));
-qemu_iovec_init_buf(&qiov, buf, n << BDRV_SECTOR_BITS);
 
-ret = blk_co_preadv(
+ret = blk_co_pread(
 blk, (sector_num - src_cur_offset) << BDRV_SECTOR_BITS,
-n << BDRV_SECTOR_BITS, &qiov, 0);
+n << BDRV_SECTOR_BITS, buf, 0);
 if (ret < 0) {
 return ret;
 }
@@ -1710,7 +1708,6 @@ static int coroutine_fn convert_co_write(ImgConvertState 
*s, int64_t sector_num,
  enum ImgConvertBlockStatus status)
 {
 int ret;
-QEMUIOVector qiov;
 
 while (nb_sectors > 0) {
 int n = nb_sectors;
@@ -1738,10 +1735,8 @@ static int coroutine_fn convert_co_write(ImgConvertState 
*s, int64_t sector_num,
 (s->compressed &&
  !buffer_is_zero(buf, n * BDRV_SECTOR_SIZE)))
 {
-qemu_iovec_init_buf(&qiov, buf, n << BDRV_SECTOR_BITS);
-
-ret = blk_co_pwritev(s->target, sector_num << BDRV_SECTOR_BITS,
- n << BDRV_SECTOR_BITS, &qiov, flags);
+ret = blk_co_pwrite(s->target, sector_num << BDRV_SECTOR_BITS,
+n << BDRV_SECTOR_BITS, buf, flags);
 if (ret < 0) {
 return ret;
 }
-- 
2.18.0

[Qemu-devel] [PATCH 1/9] block: introduce byte-based io helpers

2019-04-22 Thread Vladimir Sementsov-Ogievskiy

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/block_int.h  | 16 
 include/sysemu/block-backend.h | 19 +++
 2 files changed, 35 insertions(+)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 01e855a066..94d45c9708 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -925,6 +925,22 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
 BdrvRequestFlags flags);
 
+static inline int coroutine_fn bdrv_co_pread(BdrvChild *child,
+int64_t offset, unsigned int bytes, void *buf, BdrvRequestFlags flags)
+{
+QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
+
+return bdrv_co_preadv(child, offset, bytes, &qiov, flags);
+}
+
+static inline int coroutine_fn bdrv_co_pwrite(BdrvChild *child,
+int64_t offset, unsigned int bytes, void *buf, BdrvRequestFlags flags)
+{
+QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
+
+return bdrv_co_pwritev(child, offset, bytes, &qiov, flags);
+}
+
 extern unsigned int bdrv_drain_all_count;
 void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
 void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState 
*old_parent);
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 3be05c2d68..5be6224226 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -124,6 +124,25 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t 
offset,
 int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
unsigned int bytes, QEMUIOVector *qiov,
BdrvRequestFlags flags);
+
+static inline int coroutine_fn blk_co_pread(BlockBackend *blk, int64_t offset,
+unsigned int bytes, void *buf,
+BdrvRequestFlags flags)
+{
+QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
+
+return blk_co_preadv(blk, offset, bytes, &qiov, flags);
+}
+
+static inline int coroutine_fn blk_co_pwrite(BlockBackend *blk, int64_t offset,
+ unsigned int bytes, void *buf,
+ BdrvRequestFlags flags)
+{
+QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
+
+return blk_co_pwritev(blk, offset, bytes, &qiov, flags);
+}
+
 int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
   int bytes, BdrvRequestFlags flags);
 BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
-- 
2.18.0

[Qemu-devel] [PATCH 2/9] block/qcow2: use buffer-based io

2019-04-22 Thread Vladimir Sementsov-Ogievskiy

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2.c | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 3ace3b2209..c3b2ea294d 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -4088,7 +4088,6 @@ qcow2_co_pwritev_compressed(BlockDriverState *bs, 
uint64_t offset,
 uint64_t bytes, QEMUIOVector *qiov)
 {
 BDRVQcow2State *s = bs->opaque;
-QEMUIOVector hd_qiov;
 int ret;
 size_t out_len;
 uint8_t *buf, *out_buf;
@@ -4155,10 +4154,8 @@ qcow2_co_pwritev_compressed(BlockDriverState *bs, 
uint64_t offset,
 goto fail;
 }
 
-qemu_iovec_init_buf(&hd_qiov, out_buf, out_len);
-
 BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
-ret = bdrv_co_pwritev(s->data_file, cluster_offset, out_len, &hd_qiov, 0);
+ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
 if (ret < 0) {
 goto fail;
 }
@@ -4181,7 +4178,6 @@ qcow2_co_preadv_compressed(BlockDriverState *bs,
 int ret = 0, csize, nb_csectors;
 uint64_t coffset;
 uint8_t *buf, *out_buf;
-QEMUIOVector local_qiov;
 int offset_in_cluster = offset_into_cluster(s, offset);
 
 coffset = file_cluster_offset & s->cluster_offset_mask;
@@ -4192,12 +4188,11 @@ qcow2_co_preadv_compressed(BlockDriverState *bs,
 if (!buf) {
 return -ENOMEM;
 }
-qemu_iovec_init_buf(&local_qiov, buf, csize);
 
 out_buf = qemu_blockalign(bs, s->cluster_size);
 
 BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
-ret = bdrv_co_preadv(bs->file, coffset, csize, &local_qiov, 0);
+ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
 if (ret < 0) {
 goto fail;
 }
-- 
2.18.0

[Qemu-devel] [PATCH 6/9] block/backup: use buffer-based io

2019-04-22 Thread Vladimir Sementsov-Ogievskiy

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/backup.c | 14 ++
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 9988753249..910ed764aa 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -107,7 +107,6 @@ static int coroutine_fn 
backup_cow_with_bounce_buffer(BackupBlockJob *job,
   void **bounce_buffer)
 {
 int ret;
-QEMUIOVector qiov;
 BlockBackend *blk = job->common.blk;
 int nbytes;
 int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
@@ -118,9 +117,8 @@ static int coroutine_fn 
backup_cow_with_bounce_buffer(BackupBlockJob *job,
 if (!*bounce_buffer) {
 *bounce_buffer = blk_blockalign(blk, job->cluster_size);
 }
-qemu_iovec_init_buf(&qiov, *bounce_buffer, nbytes);
 
-ret = blk_co_preadv(blk, start, qiov.size, &qiov, read_flags);
+ret = blk_co_pread(blk, start, nbytes, *bounce_buffer, read_flags);
 if (ret < 0) {
 trace_backup_do_cow_read_fail(job, start, ret);
 if (error_is_read) {
@@ -129,13 +127,13 @@ static int coroutine_fn 
backup_cow_with_bounce_buffer(BackupBlockJob *job,
 goto fail;
 }
 
-if (qemu_iovec_is_zero(&qiov)) {
+if (buffer_is_zero(*bounce_buffer, nbytes)) {
 ret = blk_co_pwrite_zeroes(job->target, start,
-   qiov.size, write_flags | 
BDRV_REQ_MAY_UNMAP);
+   nbytes, write_flags | BDRV_REQ_MAY_UNMAP);
 } else {
-ret = blk_co_pwritev(job->target, start,
- qiov.size, &qiov, write_flags |
- (job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0));
+ret = blk_co_pwrite(job->target, start,
+nbytes, *bounce_buffer, write_flags |
+(job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0));
 }
 if (ret < 0) {
 trace_backup_do_cow_write_fail(job, start, ret);
-- 
2.18.0

[Qemu-devel] [PATCH 5/9] block/parallels: use buffer-based io

2019-04-22 Thread Vladimir Sementsov-Ogievskiy

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/parallels.c | 14 ++
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index 15bc97b759..2747400577 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -220,20 +220,18 @@ static int64_t allocate_clusters(BlockDriverState *bs, 
int64_t sector_num,
 if (bs->backing) {
 int64_t nb_cow_sectors = to_allocate * s->tracks;
 int64_t nb_cow_bytes = nb_cow_sectors << BDRV_SECTOR_BITS;
-QEMUIOVector qiov =
-QEMU_IOVEC_INIT_BUF(qiov, qemu_blockalign(bs, nb_cow_bytes),
-nb_cow_bytes);
+void *buf = qemu_blockalign(bs, nb_cow_bytes);
 
-ret = bdrv_co_preadv(bs->backing, idx * s->tracks * BDRV_SECTOR_SIZE,
- nb_cow_bytes, &qiov, 0);
+ret = bdrv_co_pread(bs->backing, idx * s->tracks * BDRV_SECTOR_SIZE,
+nb_cow_bytes, buf, 0);
 if (ret < 0) {
-qemu_vfree(qemu_iovec_buf(&qiov));
+qemu_vfree(buf);
 return ret;
 }
 
 ret = bdrv_co_pwritev(bs->file, s->data_end * BDRV_SECTOR_SIZE,
-  nb_cow_bytes, &qiov, 0);
-qemu_vfree(qemu_iovec_buf(&qiov));
+  nb_cow_bytes, buf, 0);
+qemu_vfree(buf);
 if (ret < 0) {
 return ret;
 }
-- 
2.18.0

[Qemu-devel] [PATCH 0/9] block: buffer-based io

2019-04-22 Thread Vladimir Sementsov-Ogievskiy

Hi all!

We often need to do read/write with buffer, not qiov. Instead of
creating qiov in such cases, let's introduce corresponding helpers.

Vladimir Sementsov-Ogievskiy (9):
  block: introduce byte-based io helpers
  block/qcow2: use buffer-based io
  block/qcow: use buffer-based io
  block/qed: use buffer-based io
  block/parallels: use buffer-based io
  block/backup: use buffer-based io
  block/commit: use buffer-based io
  block/stream: use buffer-based io
  qemu-img: use buffer-based io

 include/block/block_int.h  | 16 
 include/sysemu/block-backend.h | 19 +++
 block/backup.c | 14 ++
 block/commit.c |  5 ++---
 block/parallels.c  | 14 ++
 block/qcow.c   | 19 ++-
 block/qcow2.c  |  9 ++---
 block/qed-table.c  | 12 +---
 block/qed.c|  6 ++
 block/stream.c |  4 +---
 qemu-img.c | 13 -
 11 files changed, 69 insertions(+), 62 deletions(-)

-- 
2.18.0

Re: [Qemu-devel] [PATCH v3 00/16] chardev: refactoring & many bugfixes related tcp_chr_wait_connected

2019-04-22 Thread Eric Blake

On 2/11/19 12:24 PM, Daniel P. Berrangé wrote:
> This is a followup to
> 
>   v1: https://lists.gnu.org/archive/html/qemu-devel/2019-01/msg03344.html
>   v2: http://lists.nongnu.org/archive/html/qemu-devel/2019-01/msg05947.html
> 
> This series comes out of a discussion between myself & Yongji Xie in:
> 
>   https://lists.gnu.org/archive/html/qemu-devel/2019-01/msg01881.html
> 
> I eventually understood that the problem faced was that
> tcp_chr_wait_connected was racing with the background connection attempt
> previously started, causing two connections to be established. This
> broke because some vhost user servers only allow a single connection.
> 
> After messing around with the code alot the final solution was in fact
> very easy. We simply have to delay the first background connection
> attempt until the main loop is running. It will then automatically
> turn into a no-op if tcp_chr_wait_connected has been run. This is
> dealt with in the last patch in this series
> 
> I believe this should solve the problem Yongji Xie faced, and thus not
> require us to add support for "nowait" option with client sockets at
> all. The reconnect=1 option effectively already implements nowait
> semantics, and now plays nicely with tcp_chr_wait_connected.
> 
> In investigating this I found various other bugs that needed fixing and
> identified some useful refactoring to simplify / clarify the code, hence
> this very long series.

Even with this series applied, I'm still seeing sporadic failures of
iotest 169. Max posted a hack patch a while back that tries to work
around the race:

https://lists.gnu.org/archive/html/qemu-devel/2019-01/msg05907.html

which he originally diagnosed in iotest 147:
https://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html

but as it was a hack, he has not pursued it further, and so the symptoms
are still there, although not completely reproducible:

169 10s ... - output mismatch (see 169.out.bad)
--- /home/eblake/qemu/tests/qemu-iotests/169.out2018-11-16
15:48:12.018526748 -0600
+++ /home/eblake/qemu/tests/qemu-iotests/169.out.bad2019-04-22
09:38:45.481517132 -0500
@@ -1,3 +1,5 @@
+WARNING:qemu:qemu received signal 11:
/home/eblake/qemu/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
-chardev
socket,id=mon,path=/home/eblake/qemu/tests/qemu-iotests/scratch/tmp4clmPF/qemua-26803-monitor.sock
-mon chardev=mon,mode=control -display none -vga none -qtest
unix:path=/home/eblake/qemu/tests/qemu-iotests/scratch/qemua-26803-qtest.sock
-machine accel=qtest -nodefaults -machine accel=qtest -drive
if=virtio,id=drive0,file=/home/eblake/qemu/tests/qemu-iotests/scratch/disk_a,format=qcow2,cache=writeback

Any chance you can take a look as to what a non-hack fix should be?

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org



signature.asc
Description: OpenPGP digital signature

Re: [Qemu-devel] [PATCH 1/2] vfio/mdev: add version field as mandatory attribute for mdev device

2019-04-22 Thread Alex Williamson

On Fri, 19 Apr 2019 04:35:04 -0400
Yan Zhao  wrote:

> device version attribute in mdev sysfs is used by user space software
> (e.g. libvirt) to query device compatibility for live migration of VFIO
> mdev devices. This attribute is mandatory if a mdev device supports live
> migration.

The Subject: doesn't quite match what's being proposed here.
 
> It consists of two parts: common part and vendor proprietary part.
> common part: 32 bit. lower 16 bits is vendor id and higher 16 bits
>  identifies device type. e.g., for pci device, it is
>  "pci vendor id" | (VFIO_DEVICE_FLAGS_PCI << 16).

What purpose does this serve?  If it's intended as some sort of
namespace feature, shouldn't we first assume that we can only support
migration to devices of the same type?  Therefore each type would
already have its own namespace.  Also that would make the trailing bit
of the version string listed below in the example redundant.  A vendor
is still welcome to include this in their version string if they wish,
but I think the string should be entirely vendor defined.

> vendor proprietary part: this part is varied in length. vendor driver can
>  specify any string to identify a device.
> 
> When reading this attribute, it should show device version string of the
> device of type . If a device does not support live migration, it
> should return errno.
> When writing a string to this attribute, it returns errno for
> incompatibility or returns written string length in compatibility case.
> If a device does not support live migration, it always returns errno.
> 
> For user space software to use:
> 1.
> Before starting live migration, user space software first reads source side
> mdev device's version. e.g.
> "#cat \
> /sys/bus/pci/devices/\:00\:02.0/5ac1fb20-2bbf-4842-bb7e-36c58c3be9cd/mdev_type/version"
> 00028086-193b-i915-GVTg_V5_4
> 
> 2.
> Then, user space software writes the source side returned version string
> to device version attribute in target side, and checks the return value.
> If a negative errno is returned in the target side, then mdev devices in
> source and target sides are not compatible;
> If a positive number is returned and it equals to the length of written
> string, then the two mdev devices in source and target side are compatible.
> e.g.
> (a) compatibility case
> "# echo 00028086-193b-i915-GVTg_V5_4 >
> /sys/bus/pci/devices/\:00\:02.0/882cc4da-dede-11e7-9180-078a62063ab1/mdev_type/version"
> 
> (b) incompatibility case
> "#echo 00028086-193b-i915-GVTg_V5_1 >
> /sys/bus/pci/devices/\:00\:02.0/882cc4da-dede-11e7-9180-078a62063ab1/mdev_type/version"
> -bash: echo: write error: Invalid argument
> 
> 3. if two mdev devices are compatible, user space software can start
> live migration, and vice versa.
> 
> Note: if a mdev device does not support live migration, it either does
> not provide a version attribute, or always returns errno when its version
> attribute is read/written.

I think it would be cleaner to do the former, not supply the
attribute.  This seems to do the latter in the sample drivers.  Thanks,

Alex

> Cc: Alex Williamson 
> Cc: Erik Skultety 
> Cc: "Dr. David Alan Gilbert" 
> Cc: Cornelia Huck 
> Cc: "Tian, Kevin" 
> Cc: Zhenyu Wang 
> Cc: "Wang, Zhi A" 
> Cc: Neo Jia 
> Cc: Kirti Wankhede 
> 
> Signed-off-by: Yan Zhao 
> ---
>  Documentation/vfio-mediated-device.txt | 36 ++
>  samples/vfio-mdev/mbochs.c | 17 
>  samples/vfio-mdev/mdpy.c   | 16 
>  samples/vfio-mdev/mtty.c   | 16 
>  4 files changed, 85 insertions(+)
> 
> diff --git a/Documentation/vfio-mediated-device.txt 
> b/Documentation/vfio-mediated-device.txt
> index c3f69bcaf96e..bc28471c0667 100644
> --- a/Documentation/vfio-mediated-device.txt
> +++ b/Documentation/vfio-mediated-device.txt
> @@ -202,6 +202,7 @@ Directories and files under the sysfs for Each Physical 
> Device
>| |   |--- available_instances
>| |   |--- device_api
>| |   |--- description
> +  | |   |--- version
>| |   |--- [devices]
>| |--- []
>| |   |--- create
> @@ -209,6 +210,7 @@ Directories and files under the sysfs for Each Physical 
> Device
>| |   |--- available_instances
>| |   |--- device_api
>| |   |--- description
> +  | |   |--- version
>| |   |--- [devices]
>| |--- []
>|  |--- create
> @@ -216,6 +218,7 @@ Directories and files under the sysfs for Each Physical 
> Device
>|  |--- available_instances
>|  |--- device_api
>|  |--- description
> +  |  |--- version
>|  |--- [devices]
>  
>  * [mdev_supported_types]
> @@ -225,6 +228,8 @@ Directories and files under the sysfs for Each Physical 
> Device
>[], device_api, and available_instances are mandatory attributes
>that should be provided by vendor driver.
>  
> +  version is a mandatory a

[Qemu-devel] [PATCH v3 0/3] hw: edu: some fixes

2019-04-22 Thread Li Qiang

Recently I am considering write a driver for edu device.
After reading the spec, I found these three small issue.
Two first two related the MMIO access and the third is
related the DMA operation.

Change since v2:
Fix an error in patch 2
Fix some commit message and title.

Change since v1:
Fix format compile error

Li Qiang (3):
  edu: mmio: allow 64-bit access
  edu: mmio: allow 64-bit access in read dispatch
  edu: uses uint64_t in dma operation

 hw/misc/edu.c | 30 ++
 1 file changed, 22 insertions(+), 8 deletions(-)

-- 
2.17.1

[Qemu-devel] [PATCH v3 2/3] edu: mmio: allow 64-bit access in read dispatch

2019-04-22 Thread Li Qiang

The edu spec says when address >= 0x80, the MMIO area can
be accessed by 64-bit.

Signed-off-by: Li Qiang 
---
Change since v2:
Fix an error per Phillippe's advice

 hw/misc/edu.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/hw/misc/edu.c b/hw/misc/edu.c
index 65fc32b928..33de05141f 100644
--- a/hw/misc/edu.c
+++ b/hw/misc/edu.c
@@ -185,7 +185,11 @@ static uint64_t edu_mmio_read(void *opaque, hwaddr addr, 
unsigned size)
 EduState *edu = opaque;
 uint64_t val = ~0ULL;
 
-if (size != 4) {
+if (addr < 0x80 && size != 4) {
+return val;
+}
+
+if (addr >= 0x80 && size != 4 && size != 8) {
 return val;
 }
 
-- 
2.17.1

[Qemu-devel] [PATCH v3 3/3] edu: uses uint64_t in dma operation

2019-04-22 Thread Li Qiang

The dma related variable dma.dst/src/cnt is dma_addr_t, it is
uint64_t in x64 platform. Change these usage from uint32_to
uint64_t to avoid trancation in edu_dma_timer.

Signed-off-by: Li Qiang 
Reviewed-by: Philippe Mathieu-Daude 
---
 hw/misc/edu.c | 15 ---
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/hw/misc/edu.c b/hw/misc/edu.c
index 33de05141f..401ada74af 100644
--- a/hw/misc/edu.c
+++ b/hw/misc/edu.c
@@ -98,23 +98,24 @@ static void edu_lower_irq(EduState *edu, uint32_t val)
 }
 }
 
-static bool within(uint32_t addr, uint32_t start, uint32_t end)
+static bool within(uint64_t addr, uint64_t start, uint64_t end)
 {
 return start <= addr && addr < end;
 }
 
-static void edu_check_range(uint32_t addr, uint32_t size1, uint32_t start,
+static void edu_check_range(uint64_t addr, uint64_t size1, uint64_t start,
 uint32_t size2)
 {
-uint32_t end1 = addr + size1;
-uint32_t end2 = start + size2;
+uint64_t end1 = addr + size1;
+uint64_t end2 = start + size2;
 
 if (within(addr, start, end2) &&
 end1 > addr && within(end1, start, end2)) {
 return;
 }
 
-hw_error("EDU: DMA range 0x%.8x-0x%.8x out of bounds (0x%.8x-0x%.8x)!",
+hw_error("EDU: DMA range 0x%016"PRIx64"-0x%016"PRIx64
+ " out of bounds (0x%016"PRIx64"-0x%016"PRIx64")!",
 addr, end1 - 1, start, end2 - 1);
 }
 
@@ -139,13 +140,13 @@ static void edu_dma_timer(void *opaque)
 }
 
 if (EDU_DMA_DIR(edu->dma.cmd) == EDU_DMA_FROM_PCI) {
-uint32_t dst = edu->dma.dst;
+uint64_t dst = edu->dma.dst;
 edu_check_range(dst, edu->dma.cnt, DMA_START, DMA_SIZE);
 dst -= DMA_START;
 pci_dma_read(&edu->pdev, edu_clamp_addr(edu, edu->dma.src),
 edu->dma_buf + dst, edu->dma.cnt);
 } else {
-uint32_t src = edu->dma.src;
+uint64_t src = edu->dma.src;
 edu_check_range(src, edu->dma.cnt, DMA_START, DMA_SIZE);
 src -= DMA_START;
 pci_dma_write(&edu->pdev, edu_clamp_addr(edu, edu->dma.dst),
-- 
2.17.1

[Qemu-devel] [PATCH v3 1/3] edu: mmio: allow 64-bit access

2019-04-22 Thread Li Qiang

The edu spec says the MMIO area can be accessed by 64-bit.
However currently the 'max_access_size' is not so the MMIO
access dispatch can only access 32-bit one time. This patch fixes
this to respect the spec.

Signed-off-by: Li Qiang 
Reviewed-by: Philippe Mathieu-Daude 
---
 hw/misc/edu.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/hw/misc/edu.c b/hw/misc/edu.c
index 91af452c9e..65fc32b928 100644
--- a/hw/misc/edu.c
+++ b/hw/misc/edu.c
@@ -289,6 +289,15 @@ static const MemoryRegionOps edu_mmio_ops = {
 .read = edu_mmio_read,
 .write = edu_mmio_write,
 .endianness = DEVICE_NATIVE_ENDIAN,
+.valid = {
+.min_access_size = 4,
+.max_access_size = 8,
+},
+.impl = {
+.min_access_size = 4,
+.max_access_size = 8,
+},
+
 };
 
 /*
-- 
2.17.1

Re: [Qemu-devel] [PATCH v3 3/3] iotests: test big qcow2 shrink

2019-04-22 Thread Eric Blake

On 4/19/19 9:05 AM, Vladimir Sementsov-Ogievskiy wrote:
> This test checks bug in qcow2_process_discards, fixed by previous
> commit.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---

> +# This test checks that qcow2_process_discards does not truncate a discard
> +# request > 2G.
> +# To reproduce bug we need to overflow int by one sequential discard, so we
> +# need size > 2G, bigger cluster size (as with default 64k we may have 
> maximum
> +# of 512M sequential data, corresponding to one L1 entry), and we need some
> +# data of the beginning of the disk mapped to the end of file to prevent
> +# bdrv_co_truncate(bs->file) call in qcow2_co_truncate(), which may success

s/may success/might succeed/

> +# anyway.
> +
> +size=2100M
> +IMGOPTS="cluster_size=1M,preallocation=metadata"
> +
> +_make_test_img $size
> +$QEMU_IO -c 'discard 0 10M' -c 'discard 2090M 10M' \
> + -c 'write 2090M 10M' -c 'write 0 10M' "$TEST_IMG" | _filter_qemu_io
> +
> +# Check that our trick with swapping first and last 10M chunks succeeded.
> +# Otherwise test will may pass even if bdrv_pdiscard() fails in
> +# qcow2_process_discards()
> +$QEMU_IMG map "$TEST_IMG" | _filter_testdir
> +$QEMU_IMG info "$TEST_IMG" | grep size |  _filter_testdir

Nice - that's a lot faster than v1! And makes the test fit in the quick
group after all.

> +
> +$QEMU_IMG -T 'qcow2_process_discards_failed*' resize --shrink "$TEST_IMG" 5M

However, I'm quite certain that trace output is not reliable in iotests.
Depending on configure options, traces might not be enabled at all, or
might not go to stderr. Drop the -T '...'.  Even without the trace line,

> +
> +$QEMU_IMG info "$TEST_IMG" | grep size | _filter_testdir

this second image info is sufficient to prove whether the resize had an
effect (post-patch) or exposes the bug (without patch 2/3). That is,
applying this patch but not 2/3, I see this (with my cleanups to
qemu-img info in place, from Kevin's block-next branch):

+++ /home/eblake/qemu/tests/qemu-iotests/250.out.bad2019-04-22
08:52:26.072968731 -0500
@@ -14,8 +14,9 @@
 virtual size: 2.05 GiB (2202009600 bytes)
 disk size: 24 MiB
 cluster_size: 1048576
+18274@1555941145.999195:qcow2_process_discards_failed_region offset
0x50 bytes 0x82a0 ret -5
 Image resized.
 virtual size: 5 MiB (5242880 bytes)
-disk size: 9.01 MiB
+disk size: 19 MiB

where the trace did indeed show show that we had a bug, but where even
without the trace, the difference in size between 19M with incomplete
discards vs. 9M with patch 2/3 is enough to show that patch 2/3 fixes a bug.

> +virtual size: 2.1G (2202009600 bytes)
> +disk size: 24M
> +cluster_size: 1048576
> +Image resized.
> +virtual size: 5.0M (5242880 bytes)
> +disk size: 9.0M
> +cluster_size: 1048576

When Kevin's block-next branch is applied, you'll need this squashed in:

diff --git a/tests/qemu-iotests/250.out b/tests/qemu-iotests/250.out
index 37e37f0c9e7..d1c1c7180b5 100644
--- a/tests/qemu-iotests/250.out
+++ b/tests/qemu-iotests/250.out
@@ -11,11 +11,11 @@ wrote 10485760/10485760 bytes at offset 0
 Offset  Length  Mapped to   File
 0   0xa00x82f0  TEST_DIR/t.qcow2
 0x82a0  0xa00x50TEST_DIR/t.qcow2
-virtual size: 2.1G (2202009600 bytes)
-disk size: 24M
+virtual size: 2.05 GiB (2202009600 bytes)
+disk size: 24 MiB
 cluster_size: 1048576
 Image resized.
-virtual size: 5.0M (5242880 bytes)
-disk size: 9.0M
+virtual size: 5 MiB (5242880 bytes)
+disk size: 9.01 MiB
 cluster_size: 1048576
 *** done

With the updated output to match changes to qemu-img output, the grammar
fixes, and with the -T option removed,

Reviewed-by: Eric Blake 
Tested-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org



signature.asc
Description: OpenPGP digital signature

Re: [Qemu-devel] [PATCH v3 2/3] block/io: bdrv_pdiscard: support int64_t bytes parameter

2019-04-22 Thread Eric Blake

On 4/19/19 9:05 AM, Vladimir Sementsov-Ogievskiy wrote:
> This fixes at least one overflow in qcow2_process_discards, which
> passes 64bit region length to bdrv_pdiscard which bytes (or sectors in

s/which/where/

> the past) parameter is int since it's introduction in 0b919fae.

s/it's/its/

> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  include/block/block.h |  4 ++--
>  block/io.c| 16 
>  2 files changed, 10 insertions(+), 10 deletions(-)
> 

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org



signature.asc
Description: OpenPGP digital signature

Re: [Qemu-devel] [PATCH v3 1/3] block/qcow2-refcount: add trace-point to qcow2_process_discards

2019-04-22 Thread Eric Blake

On 4/19/19 9:05 AM, Vladimir Sementsov-Ogievskiy wrote:
> Let's at least trace ignored failure.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/qcow2-refcount.c | 7 ++-
>  block/trace-events | 3 +++
>  2 files changed, 9 insertions(+), 1 deletion(-)

Reviewed-by: Eric Blake 

> 
> diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
> index e0fe322500..60284bcaac 100644
> --- a/block/qcow2-refcount.c
> +++ b/block/qcow2-refcount.c
> @@ -30,6 +30,7 @@
>  #include "qemu/range.h"
>  #include "qemu/bswap.h"
>  #include "qemu/cutils.h"
> +#include "trace.h"
>  
>  static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
>  uint64_t max);
> @@ -738,7 +739,11 @@ void qcow2_process_discards(BlockDriverState *bs, int 
> ret)
>  
>  /* Discard is optional, ignore the return value */
>  if (ret >= 0) {
> -bdrv_pdiscard(bs->file, d->offset, d->bytes);
> +int r2 = bdrv_pdiscard(bs->file, d->offset, d->bytes);
> +if (r2 < 0) {
> +trace_qcow2_process_discards_failed_region(d->offset, 
> d->bytes,
> +   r2);
> +}
>  }
>  
>  g_free(d);
> diff --git a/block/trace-events b/block/trace-events
> index 7335a42540..ea508f637e 100644
> --- a/block/trace-events
> +++ b/block/trace-events
> @@ -91,6 +91,9 @@ qcow2_cache_get_done(void *co, int c, int i) "co %p 
> is_l2_cache %d index %d"
>  qcow2_cache_flush(void *co, int c) "co %p is_l2_cache %d"
>  qcow2_cache_entry_flush(void *co, int c, int i) "co %p is_l2_cache %d index 
> %d"
>  
> +# qcow2-refcount.c
> +qcow2_process_discards_failed_region(uint64_t offset, uint64_t bytes, int 
> ret) "offset 0x%" PRIx64 " bytes 0x%" PRIx64 " ret %d"
> +
>  # qed-l2-cache.c
>  qed_alloc_l2_cache_entry(void *l2_cache, void *entry) "l2_cache %p entry %p"
>  qed_unref_l2_cache_entry(void *entry, int ref) "entry %p ref %d"
> 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org



signature.asc
Description: OpenPGP digital signature

Re: [Qemu-devel] [PATCH v2 0/2] HPPA fixes for NetBSD/hppa emulation

2019-04-22 Thread no-reply

Patchew URL: 
https://patchew.org/QEMU/20190422123411.5178-1-nick.hud...@gmx.co.uk/



Hi,

This series seems to have some coding style problems. See output below for
more information:

Type: series
Message-id: 20190422123411.5178-1-nick.hud...@gmx.co.uk
Subject: [Qemu-devel] [PATCH v2 0/2] HPPA fixes for NetBSD/hppa emulation

=== TEST SCRIPT BEGIN ===
#!/bin/bash
git rev-parse base > /dev/null || exit 0
git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram
./scripts/checkpatch.pl --mailback base..
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
From https://github.com/patchew-project/qemu
 * [new tag]   patchew/20190422123411.5178-1-nick.hud...@gmx.co.uk 
-> patchew/20190422123411.5178-1-nick.hud...@gmx.co.uk
Switched to a new branch 'test'
a6b41a65f4 Always return EXCP_DMAR for protection id trap as EXCP_DMP is 
considered legacy.
5894d61641 Implement the pcxl and pcxl2 Fast TLB Insert instructions as used by 
NetBSD (and OpenBSD)

=== OUTPUT BEGIN ===
1/2 Checking commit 5894d6164109 (Implement the pcxl and pcxl2 Fast TLB Insert 
instructions as used by NetBSD (and OpenBSD))
WARNING: Block comments use a leading /* on a separate line
#36: FILE: target/hppa/translate.c:2521:
+/* Implement the pcxl and pcxl2 Fast TLB Insert instructions.

WARNING: Block comments use a trailing */ on a separate line
#39: FILE: target/hppa/translate.c:2524:
+ * page 13-9 (195/206) */

ERROR: do not use C99 // comments
#51: FILE: target/hppa/translate.c:2536:
+//if (not (pcx or pcxl2))

ERROR: do not use C99 // comments
#52: FILE: target/hppa/translate.c:2537:
+//return gen_illegal(ctx);

total: 2 errors, 2 warnings, 67 lines checked

Patch 1/2 has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

2/2 Checking commit a6b41a65f4d3 (Always return EXCP_DMAR for protection id 
trap as EXCP_DMP is considered legacy.)
=== OUTPUT END ===

Test command exited with code: 1


The full log is available at
http://patchew.org/logs/20190422123411.5178-1-nick.hud...@gmx.co.uk/testing.checkpatch/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

[Qemu-devel] [PATCH v2 2/2] Always return EXCP_DMAR for protection id trap as EXCP_DMP is considered legacy.

2019-04-22 Thread Nick Hudson

From: Nick Hudson 

"In PA-RISC 1.1 (Second Edition) and later revisions, processors must use
traps 26, 27,and 28 which provide equivalent functionality"

Signed-off-by: Nick Hudson 
---
 target/hppa/mem_helper.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/target/hppa/mem_helper.c b/target/hppa/mem_helper.c
index c9b57d07c3..77fb544838 100644
--- a/target/hppa/mem_helper.c
+++ b/target/hppa/mem_helper.c
@@ -154,8 +154,7 @@ int hppa_get_physical_address(CPUHPPAState *env, vaddr 
addr, int mmu_idx,

 if (unlikely(!(prot & type))) {
 /* The access isn't allowed -- Inst/Data Memory Protection Fault.  */
-ret = (type & PAGE_EXEC ? EXCP_IMP :
-   prot & PAGE_READ ? EXCP_DMP : EXCP_DMAR);
+ret = (type & PAGE_EXEC) ? EXCP_IMP : EXCP_DMAR;
 goto egress;
 }

--
2.17.1

[Qemu-devel] [PATCH v2 0/2] HPPA fixes for NetBSD/hppa emulation

2019-04-22 Thread Nick Hudson

From: Nick Hudson 

Here are the required changes to allow qemu to emulate NetBSD/hppa.

v2 changes:
 - remove old debug code

Nick Hudson (2):
  Implement the pcxl and pcxl2 Fast TLB Insert instructions as used by
NetBSD (and OpenBSD)
  Always return EXCP_DMAR for protection id trap as EXCP_DMP is
considered legacy.

 target/hppa/insns.decode |  3 +++
 target/hppa/mem_helper.c |  3 +--
 target/hppa/translate.c  | 52 
 3 files changed, 56 insertions(+), 2 deletions(-)

--
2.17.1

[Qemu-devel] [PATCH v2 1/2] Implement the pcxl and pcxl2 Fast TLB Insert instructions as used by NetBSD (and OpenBSD)

2019-04-22 Thread Nick Hudson

From: Nick Hudson 

See
 https://parisc.wiki.kernel.org/images-parisc/a/a9/Pcxl2_ers.pdf
 page 13-9 (195/206)

Signed-off-by: Nick Hudson 
---
 target/hppa/insns.decode |  3 +++
 target/hppa/translate.c  | 52 
 2 files changed, 55 insertions(+)

diff --git a/target/hppa/insns.decode b/target/hppa/insns.decode
index 098370c2f0..f0dd71dd08 100644
--- a/target/hppa/insns.decode
+++ b/target/hppa/insns.decode
@@ -133,6 +133,9 @@ ixtlbx  01 b:5 r:5 sp:2 010 addr:1 0 0  
data=1
 ixtlbx  01 b:5 r:5 ... 00 addr:1 0 0\
 sp=%assemble_sr3x data=0

+# pcxl and pcxl2 Fast TLB Insert instructions
+ixtlbxf 01 0 r:5 00 0 data:1 01000 addr:1 0 0
+
 pxtlbx  01 b:5 x:5 sp:2 0100100 local:1 m:1 -   data=1
 pxtlbx  01 b:5 x:5 ... 000100 local:1 m:1 - \
 sp=%assemble_sr3x data=0
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 43b74367ea..6038cbc3dd 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -2518,6 +2518,58 @@ static bool trans_pxtlbx(DisasContext *ctx, arg_pxtlbx 
*a)
 #endif
 }

+/* Implement the pcxl and pcxl2 Fast TLB Insert instructions.
+ * See
+ * https://parisc.wiki.kernel.org/images-parisc/a/a9/Pcxl2_ers.pdf
+ * page 13-9 (195/206) */
+static bool trans_ixtlbxf(DisasContext *ctx, arg_ixtlbxf *a)
+{
+CHECK_MOST_PRIVILEGED(EXCP_PRIV_OPR);
+#ifndef CONFIG_USER_ONLY
+TCGv_tl addr;
+TCGv_reg reg;
+TCGv_reg ar, sr;
+TCGv_tl atl, stl;
+
+nullify_over(ctx);
+
+//if (not (pcx or pcxl2))
+//return gen_illegal(ctx);
+
+ar = get_temp(ctx);
+sr = get_temp(ctx);
+atl = get_temp_tl(ctx);
+stl = get_temp_tl(ctx);
+addr = get_temp_tl(ctx);
+
+
+if (a->data) {
+tcg_gen_ld_reg(sr, cpu_env, offsetof(CPUHPPAState, cr[CR_ISR]));
+tcg_gen_ld_reg(ar, cpu_env, offsetof(CPUHPPAState, cr[CR_IOR]));
+} else {
+tcg_gen_ld_reg(sr, cpu_env, offsetof(CPUHPPAState, cr[CR_IIASQ]));
+tcg_gen_ld_reg(ar, cpu_env, offsetof(CPUHPPAState, cr[CR_IIAOQ]));
+}
+
+tcg_gen_extu_reg_tl(atl, ar);
+tcg_gen_extu_reg_tl(stl, sr);
+tcg_gen_shli_i64(stl, stl, 32);
+tcg_gen_or_tl(addr, atl, stl);
+reg = load_gpr(ctx, a->r);
+if (a->addr) {
+gen_helper_itlba(cpu_env, addr, reg);
+} else {
+gen_helper_itlbp(cpu_env, addr, reg);
+}
+
+/* Exit TB for TLB change if mmu is enabled.  */
+if (ctx->tb_flags & PSW_C) {
+ctx->base.is_jmp = DISAS_IAQ_N_STALE;
+}
+return nullify_end(ctx);
+#endif
+}
+
 static bool trans_lpa(DisasContext *ctx, arg_ldst *a)
 {
 CHECK_MOST_PRIVILEGED(EXCP_PRIV_OPR);
--
2.17.1

Re: [Qemu-devel] [PATCH v14 0/2] support MAP_SYNC for memory-backend-file

2019-04-22 Thread Michael S. Tsirkin

On Mon, Apr 22, 2019 at 08:48:47AM +0800, Wei Yang wrote:
> Linux 4.15 introduces a new mmap flag MAP_SYNC, which can be used to
> guarantee the write persistence to mmap'ed files supporting DAX (e.g.,
> files on ext4/xfs file system mounted with '-o dax').
> 
> A description of MAP_SYNC and MAP_SHARED_VALIDATE can be found at
> https://patchwork.kernel.org/patch/10028151/
> 
> In order to make sure that the file metadata is in sync after a fault 
> while we are writing a shared DAX supporting backend files, this
> patch-set enables QEMU to use MAP_SYNC flag for memory-backend-dax-file.
> 
> As the DAX vs DMA truncated issue was solved, we refined the code and
> send out this feature for the v5 version.
> 
> We will pass MAP_SYNC to mmap(2); if MAP_SYNC is supported and
> 'share=on' & 'pmem=on'. 
> Or QEMU will not pass this flag to mmap(2)

OK this is in a good shape. As we are in freeze anyway,
there's still a bit more time to polish it. I have a couple of
suggestions:

- squash docs in same patch with code, no need for two patches
- mmap errors are not silently ignored as the doc says,
  a warning is produced

Also, it might make sense to send the warnings to an errp object and not stderr.
I would leave that to a follow-up patch.


> Test with below cases:
> 1. pmem=on is set, shared=on is set, MAP_SYNC supported:
>a: backend is a dax supporting file.
>1) start VM1 with options:
>-object 
> memory-backend-file,id=nv_be4,share,mem-path=${DAX_FILE_1},size=${DAX_FILE_SIZE_1},align=128M,pmem=on,share=on
>-device nvdimm,id=nv4,memdev=nv_be4,label-size=2M.
>
>2) start VM2 with options:
>-object 
> memory-backend-file,id=nv_be4,share,mem-path=${DAX_FILE_2,size=${DAX_FILE_SIZE_2},align=128M,pmem=on,share=on
>-device nvdimm,id=nv4,memdev=nv_be4,label-size=2M.
> 
>3) live migrate from VM1 to VM2.
>
>4) Suddenly let Host crash or power failure.
> 
>5) check DAX_FILE_1 and DAX_FILE_2, no corrupt.
> 
>b: backend is a regular file.
>1) start with options
>-object 
> memory-backend-file,id=nv_be4,share,mem-path=${REG_FILE},size=${REG_FILE_SIZE},align=128M,pmem=on,share=on
>-device nvdimm,id=nv4,memdev=nv_be4,label-size=2M.
> 
>will warning "failed to validate with mapping flags: Operation not 
> supported"
>FILE_1 and FILE_2 random corrupt.
> 
> 2. Other cases:
>FILE_1 and FILE_2 random corrupt.
> 
> Changes in V14:
>  * 1/2 rebase on top of current upstream and tested
> 
> Changes in V13:
>  * 4/5 Micheal: move the inlcude to mmap_alloc.c.
>  * 4/5 Micheal: refine the warning message.
>  * 5/5 Micheal: refine the Documentations.
> 
> Changes in V12:
>  * 2/5: Micheal: Update update-linux-headers.sh
>  * 3/5: Micheal: Use script update add linux/mman.h
>  * 4/5: Pankaj,Micheal: 1) fallback to mmap without
> MAP_SYNC & MAP_SHARED_VALIDATE if sync not supported or failed
>   2) Replace the include with 3/5 added linux/mman.h
>  * 5/5: Micheal: Refine the Documentations.
> 
> Changes in V11:
>  * 1/3: Micheal: Change to just add a bool is_pmem in qemu_ram_mmap.
>  * 2/3: Micheal: Fix the compatibility for old kernel.
>  * 2/3&3/3: Micheal&Eduardo :Update the behavior below: 
>Waning at no-dax and continue without MAP_SYNC.
>Test if fails again for compatibility, then remove the MAP_VALIDATE and
>silently proceed.
> 
> Changes in V10:
>  * 4/4: refine the document.
>  * 3/4: Reviewed-by: Stefano Garzarella 
>  * 2/4: refine the commit message, Added MAP_SHARED_VALIDATE.
>  * 2/4: Fix the wrong include header
> 
> Changes in V9:
>  * 1/6: Reviewed-by: Eduardo Habkost 
>  * 2/6: New Added: Micheal: use sparse feature define RAM_FLAG. 
>  since I don't have much knowledge about the sparse feature, @Micheal Could 
> you 
>  add some documentation/commit message on this patch? Thank you very much.
>  * 3/6: from 2/5: Eduardo: updated the commit message. 
>  * 4/6: from 3/5: Micheal: don't ignore MAP_SYNC failures silently.
>  * 5/6: from 4/5: Eduardo: updated the commit message.
>  * 6/6: from 5/5: Micheal: Drop the sync option, document the MAP_SYNC.
> 
> Changes in v8:
>  * Micheal: 3/5, remove the duplicated define in the os_dep.h
>  * Micheal: 2/5, make type define safety.
>  * Micheal: 2/5, fixed the incorrect define MAP_SHARE on qemu_anon_ram_alloc.
>  * 4/6 removed, we remove the on/off/auto define of sync,  as by now,
>MAP_SYNC only worked with pmem=on.
>  * @Micheal, I still reuse the RAM_SYNC flag, it is much straightforward to 
> parse 
>all the flags in one parameter.
> 
> Changes in v7:
>  * Micheal: [3,4,6]/6 limited the "sync" flag only on a nvdimm 
> backend.(pmem=on)
> 
> Changes in v6:
>  * Pankaj: 3/7 are squashed with 2/7
>  * Pankaj: 7/7 update comments to "consistent filesystem metadata".
>  * Pankaj, Igor: 1/7 Added Reviewed-by in patch-1/7
>  * Stefan, 4/7 move the include header from "/linux/mman.h" to "osdep.h"
>  * Stefan, 5/7 Add missing "munmap"
>  * Stefan, 2/7 refine the shared/flag.
>

[Qemu-devel] [PATCH for-4.1 24/24] icount: clean up cpu_can_io before jumping to the next block

2019-04-22 Thread Pavel Dovgalyuk

From: Pavel Dovgalyuk 

Most of IO instructions can be executed only at the end of the block in
icount mode. Therefore translator can set cpu_can_io flag when translating
the last instruction.
But when the blocks are chained, then this flag is not reset and may
remain set at the beginning of the next block.
This patch resets the flag before "chaining" the translation blocks.

Signed-off-by: Pavel Dovgalyuk 
---
 accel/tcg/tcg-runtime.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
index d0d4484406..5871f5aba2 100644
--- a/accel/tcg/tcg-runtime.c
+++ b/accel/tcg/tcg-runtime.c
@@ -151,6 +151,8 @@ void *HELPER(lookup_tb_ptr)(CPUArchState *env)
 target_ulong cs_base, pc;
 uint32_t flags;
 
+/* We are going to jump to the next block. can_do_io should be reset */
+cpu->can_do_io = !use_icount;
 tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, curr_cflags());
 if (tb == NULL) {
 return tcg_ctx->code_gen_epilogue;

[Qemu-devel] [PATCH for-4.1 22/24] replay: fix replay shutdown

2019-04-22 Thread Pavel Dovgalyuk

From: Pavel Dovgalyuk 

This patch fixes shutdown of the replay process, which is terminated with
the assert when shutdown event is read from the log.
replay_finish_event reads new data_kind and therefore the value of data_kind
should be preserved to be valid at qemu_system_shutdown_request call.

Signed-off-by: Pavel Dovgalyuk 
---
 replay/replay.c |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/replay/replay.c b/replay/replay.c
index e578958659..8f2e17c8cb 100644
--- a/replay/replay.c
+++ b/replay/replay.c
@@ -49,14 +49,14 @@ bool replay_next_event_is(int event)
 }
 
 while (true) {
-if (event == replay_state.data_kind) {
+unsigned int data_kind = replay_state.data_kind;
+if (event == data_kind) {
 res = true;
 }
-switch (replay_state.data_kind) {
+switch (data_kind) {
 case EVENT_SHUTDOWN ... EVENT_SHUTDOWN_LAST:
 replay_finish_event();
-qemu_system_shutdown_request(replay_state.data_kind -
- EVENT_SHUTDOWN);
+qemu_system_shutdown_request(data_kind - EVENT_SHUTDOWN);
 break;
 default:
 /* clock, time_t, checkpoint and other events */

[Qemu-devel] [PATCH for-4.1 23/24] replay: rename step-related variables and functions

2019-04-22 Thread Pavel Dovgalyuk

From: Pavel Dovgalyuk 

This patch renames replay_get_current_step() and related variables
to make these names consistent with hmp/qmp commands.

Signed-off-by: Pavel Dovgalyuk 
---
 blockdev.c|2 +-
 include/sysemu/replay.h   |2 +-
 migration/savevm.c|2 +-
 replay/replay-debugging.c |   30 --
 replay/replay-events.c|4 ++--
 replay/replay-internal.c  |   10 +-
 replay/replay-internal.h  |   10 +-
 replay/replay-snapshot.c  |6 +++---
 replay/replay-time.c  |2 +-
 replay/replay.c   |   26 +-
 10 files changed, 48 insertions(+), 46 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 7265ef0525..7c870142bd 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1454,7 +1454,7 @@ static void internal_snapshot_prepare(BlkActionState 
*common,
 sn->date_nsec = tv.tv_usec * 1000;
 sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 if (replay_mode != REPLAY_MODE_NONE) {
-sn->icount = replay_get_current_step();
+sn->icount = replay_get_current_icount();
 } else {
 sn->icount = -1ULL;
 }
diff --git a/include/sysemu/replay.h b/include/sysemu/replay.h
index b7394a1f5c..057a458463 100644
--- a/include/sysemu/replay.h
+++ b/include/sysemu/replay.h
@@ -96,7 +96,7 @@ void replay_breakpoint(void);
 /* Processing the instructions */
 
 /*! Returns number of executed instructions. */
-uint64_t replay_get_current_step(void);
+uint64_t replay_get_current_icount(void);
 /*! Returns number of instructions to execute in replay mode. */
 int replay_get_instructions(void);
 /*! Updates instructions counter in replay mode. */
diff --git a/migration/savevm.c b/migration/savevm.c
index 2198d61b81..3cfdfe5ea1 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2597,7 +2597,7 @@ int save_snapshot(const char *name, Error **errp)
 sn->date_nsec = tv.tv_usec * 1000;
 sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 if (replay_mode != REPLAY_MODE_NONE) {
-sn->icount = replay_get_current_step();
+sn->icount = replay_get_current_icount();
 } else {
 sn->icount = -1ULL;
 }
diff --git a/replay/replay-debugging.c b/replay/replay-debugging.c
index abb5fe687f..b2275612c7 100644
--- a/replay/replay-debugging.c
+++ b/replay/replay-debugging.c
@@ -38,7 +38,7 @@ void hmp_info_replay(Monitor *mon, const QDict *qdict)
 monitor_printf(mon,
 "%s execution '%s': instruction count = %"PRId64"\n",
 replay_mode == REPLAY_MODE_RECORD ? "Recording" : "Replaying",
-replay_get_filename(), replay_get_current_step());
+replay_get_filename(), replay_get_current_icount());
 }
 }
 
@@ -51,7 +51,7 @@ ReplayInfo *qmp_query_replay(Error **errp)
 retval->filename = g_strdup(replay_get_filename());
 retval->has_filename = true;
 }
-retval->icount = replay_get_current_step();
+retval->icount = replay_get_current_icount();
 return retval;
 }
 
@@ -59,7 +59,7 @@ static void replay_break(uint64_t icount, QEMUTimerCB 
callback, void *opaque)
 {
 assert(replay_mode == REPLAY_MODE_PLAY);
 assert(replay_mutex_locked());
-assert(replay_break_icount >= replay_get_current_step());
+assert(replay_break_icount >= replay_get_current_icount());
 assert(callback);
 
 replay_break_icount = icount;
@@ -94,7 +94,7 @@ static void replay_stop_vm(void *opaque)
 void qmp_replay_break(int64_t icount, Error **errp)
 {
 if (replay_mode == REPLAY_MODE_PLAY) {
-if (icount >= replay_get_current_step()) {
+if (icount >= replay_get_current_icount()) {
 replay_break(icount, replay_stop_vm, NULL);
 } else {
 error_setg(errp,
@@ -196,14 +196,14 @@ static void replay_seek(int64_t icount, QEMUTimerCB 
callback, Error **errp)
 
 snapshot = replay_find_nearest_snapshot(icount, &snapshot_icount);
 if (snapshot) {
-if (icount < replay_get_current_step()
-|| replay_get_current_step() < snapshot_icount) {
+if (icount < replay_get_current_icount()
+|| replay_get_current_icount() < snapshot_icount) {
 vm_stop(RUN_STATE_RESTORE_VM);
 load_snapshot(snapshot, errp);
 }
 g_free(snapshot);
 }
-if (replay_get_current_step() <= icount) {
+if (replay_get_current_icount() <= icount) {
 replay_break(icount, callback, NULL);
 vm_start();
 } else {
@@ -242,8 +242,9 @@ bool replay_reverse_step(void)
 
 assert(replay_mode == REPLAY_MODE_PLAY);
 
-if (replay_get_current_step() != 0) {
-replay_seek(replay_get_current_step() - 1, replay_stop_vm_debug, &err);
+if (replay_get_current_icount() != 0) {
+replay_seek(replay_get_current_icount() - 1,
+replay_stop_vm_debug, &err);
 if (err) {
 error_free(err);
 return false;
@@ -283,7 +284,7 @

[Qemu-devel] [PATCH for-4.1 21/24] util/qemu-timer: refactor deadline calculation for external timers

2019-04-22 Thread Pavel Dovgalyuk

From: Pavel Dovgalyuk 

icount-based record/replay uses qemu_clock_deadline_ns_all to measure
the period until vCPU may be interrupted.
This function takes in account the virtual timers, because they belong
to the virtual devices that may generate interrupt request or affect
the virtual machine state.
However, there are a subset of virtual timers, that are marked with
'external' flag. These do not change the virtual machine state and
only based on virtual clock. Calculating the deadling using the external
timers breaks the determinism, because they do not belong to the replayed
part of the virtual machine.
This patch fixes the deadline calculation for this case.

Signed-off-by: Pavel Dovgalyuk 

--

v15 changes:
 - fixed misprint in the test
---
 cpus.c|9 -
 include/qemu/timer.h  |7 +++
 qtest.c   |2 +-
 tests/ptimer-test-stubs.c |4 ++--
 tests/ptimer-test.c   |4 ++--
 util/qemu-timer.c |   41 +
 6 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/cpus.c b/cpus.c
index ba4849861d..2cad26f96c 100644
--- a/cpus.c
+++ b/cpus.c
@@ -550,7 +550,7 @@ void qtest_clock_warp(int64_t dest)
 assert(qtest_enabled());
 aio_context = qemu_get_aio_context();
 while (clock < dest) {
-int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+int64_t deadline = virtual_clock_deadline_ns();
 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 
 seqlock_write_lock(&timers_state.vm_clock_seqlock,
@@ -610,7 +610,7 @@ void qemu_start_warp_timer(void)
 
 /* We want to use the earliest deadline from ALL vm_clocks */
 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
-deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+deadline = virtual_clock_deadline_ns();
 if (deadline < 0) {
 static bool notified;
 if (!icount_sleep && !notified) {
@@ -1356,7 +1356,7 @@ static int64_t tcg_get_icount_limit(void)
 int64_t deadline;
 
 if (replay_mode != REPLAY_MODE_PLAY) {
-deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+deadline = virtual_clock_deadline_ns();
 
 /* Maintain prior (possibly buggy) behaviour where if no deadline
  * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
@@ -1377,8 +1377,7 @@ static void handle_icount_deadline(void)
 {
 assert(qemu_in_vcpu_thread());
 if (use_icount) {
-int64_t deadline =
-qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+int64_t deadline = virtual_clock_deadline_ns();
 
 if (deadline == 0) {
 /* Wake up other AioContexts.  */
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index a86330c987..cfc77450f2 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -176,16 +176,15 @@ bool qemu_clock_expired(QEMUClockType type);
 bool qemu_clock_use_for_deadline(QEMUClockType type);
 
 /**
- * qemu_clock_deadline_ns_all:
- * @type: the clock type
+ * virtual_clock_deadline_ns:
  *
  * Calculate the deadline across all timer lists associated
- * with a clock (as opposed to just the default one)
+ * with virtual clock (excluding external timers)
  * in nanoseconds, or -1 if no timer is set to expire.
  *
  * Returns: time until expiry in nanoseconds or -1
  */
-int64_t qemu_clock_deadline_ns_all(QEMUClockType type);
+int64_t virtual_clock_deadline_ns(void);
 
 /**
  * qemu_clock_get_main_loop_timerlist:
diff --git a/qtest.c b/qtest.c
index 527141785f..8000ee0714 100644
--- a/qtest.c
+++ b/qtest.c
@@ -655,7 +655,7 @@ static void qtest_process_command(CharBackend *chr, gchar 
**words)
 int ret = qemu_strtoi64(words[1], NULL, 0, &ns);
 g_assert(ret == 0);
 } else {
-ns = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+ns = virtual_clock_deadline_ns();
 }
 qtest_clock_warp(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + ns);
 qtest_send_prefix(chr);
diff --git a/tests/ptimer-test-stubs.c b/tests/ptimer-test-stubs.c
index 54b3fd26f6..49acfed76f 100644
--- a/tests/ptimer-test-stubs.c
+++ b/tests/ptimer-test-stubs.c
@@ -88,9 +88,9 @@ int64_t qemu_clock_get_ns(QEMUClockType type)
 return ptimer_test_time_ns;
 }
 
-int64_t qemu_clock_deadline_ns_all(QEMUClockType type)
+int64_t virtual_clock_deadline_ns(void)
 {
-QEMUTimerList *timer_list = main_loop_tlg.tl[type];
+QEMUTimerList *timer_list = main_loop_tlg.tl[QEMU_CLOCK_VIRTUAL];
 QEMUTimer *t = timer_list->active_timers.next;
 int64_t deadline = -1;
 
diff --git a/tests/ptimer-test.c b/tests/ptimer-test.c
index b30aad0737..338a4e0c10 100644
--- a/tests/ptimer-test.c
+++ b/tests/ptimer-test.c
@@ -50,13 +50,13 @@ static void ptimer_test_set_qemu_time_ns(int64_t ns)
 
 static void qemu_clock_step(uint64_t ns)
 {
-int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+int64_t deadline

[Qemu-devel] [PATCH for-4.1 18/24] replay: describe reverse debugging in docs/replay.txt

2019-04-22 Thread Pavel Dovgalyuk

From: Pavel Dovgalyuk 

This patch updates the documentation and describes usage of the reverse
debugging in QEMU+GDB.

Signed-off-by: Pavel Dovgalyuk 
---
 docs/replay.txt |   33 +
 1 file changed, 33 insertions(+)

diff --git a/docs/replay.txt b/docs/replay.txt
index ce97c3f72f..47b4f2d9f8 100644
--- a/docs/replay.txt
+++ b/docs/replay.txt
@@ -293,6 +293,39 @@ for recording and replaying must contain identical number 
of ports in record
 and replay modes, but their backends may differ.
 E.g., '-serial stdio' in record mode, and '-serial null' in replay mode.
 
+Reverse debugging
+-
+
+Reverse debugging allows "executing" the program in reverse direction.
+GDB remote protocol supports "reverse step" and "reverse continue"
+commands. The first one steps single instruction backwards in time,
+and the second one finds the last breakpoint in the past.
+
+Recorded executions may be used to enable reverse debugging. QEMU can't
+execute the code in backwards direction, but can load a snapshot and
+replay forward to find the desired position or breakpoint.
+
+The following GDB commands are supported:
+ - reverse-stepi (or rsi) - step one instruction backwards
+ - reverse-continue (or rc) - find last breakpoint in the past
+
+Reverse step loads the nearest snapshot and replays the execution until
+the required instruction is met.
+
+Reverse continue may include several passes of examining the execution
+between the snapshots. Each of the passes include the following steps:
+ 1. loading the snapshot
+ 2. replaying to examine the breakpoints
+ 3. if breakpoint or watchpoint was met
+- loading the snaphot again
+- replaying to the required breakpoint
+ 4. else
+- proceeding to the p.1 with the earlier snapshot
+
+Therefore usage of the reverse debugging requires at least one snapshot
+created in advance. See the "Snapshotting" section to learn about running
+record/replay and creating the snapshot in these modes.
+
 Replay log format
 -

[Qemu-devel] [PATCH for-4.1 20/24] replay: document development rules

2019-04-22 Thread Pavel Dovgalyuk

From: Pavel Dovgalyuk 

This patch introduces docs/devel/replay.txt which describes the rules
that should be followed to make virtual devices usable in record/replay mode.

Signed-off-by: Pavel Dovgalyuk 

--

v9: fixed external virtual clock description (reported by Artem Pisarenko)
---
 docs/devel/replay.txt |   46 ++
 1 file changed, 46 insertions(+)
 create mode 100644 docs/devel/replay.txt

diff --git a/docs/devel/replay.txt b/docs/devel/replay.txt
new file mode 100644
index 00..e641c35add
--- /dev/null
+++ b/docs/devel/replay.txt
@@ -0,0 +1,46 @@
+Record/replay mechanism, that could be enabled through icount mode, expects
+the virtual devices to satisfy the following requirements.
+
+The main idea behind this document is that everything that affects
+the guest state during execution in icount mode should be deterministic.
+
+Timers
+==
+
+All virtual devices should use virtual clock for timers that change the guest
+state. Virtual clock is deterministic, therefore such timers are deterministic
+too.
+
+Virtual devices can also use realtime clock for the events that do not change
+the guest state directly. When the clock ticking should depend on VM execution
+speed, use virtual clock with EXTERNAL attribute. It is not deterministic,
+but its speed depends on the guest execution. This clock is used by
+the virtual devices (e.g., slirp routing device) that lie outside the
+replayed guest.
+
+Bottom halves
+=
+
+Bottom half callbacks, that affect the guest state, should be invoked through
+replay_bh_schedule_event or replay_bh_schedule_oneshot_event functions.
+Their invocations are saved in record mode and synchronized with the existing
+log in replay mode.
+
+Saving/restoring the VM state
+=
+
+All fields in the device state structure (including virtual timers)
+should be restored by loadvm to the same values they had before savevm.
+
+Avoid accessing other devices' state, because the order of saving/restoring
+is not defined. It means that you should not call functions like
+'update_irq' in post_load callback. Save everything explicitly to avoid
+the dependencies that may make restoring the VM state non-deterministic.
+
+Stopping the VM
+===
+
+Stopping the guest should not interfere with its state (with the exception
+of the network connections, that could be broken by the remote timeouts).
+VM can be stopped at any moment of replay by the user. Restarting the VM
+after that stop should not break the replay by the unneeded guest state change.

[Qemu-devel] [PATCH for-4.1 12/24] replay: introduce breakpoint at the specified step

2019-04-22 Thread Pavel Dovgalyuk

From: Pavel Dovgalyuk 

This patch introduces replay_break, replay_delete_break
qmp and hmp commands.
These commands allow stopping at the specified instruction.
It may be useful for debugging when there are some known
events that should be investigated.
replay_break command has one argument - number of instructions
executed since the start of the replay.
replay_delete_break removes previously set breakpoint.

Signed-off-by: Pavel Dovgalyuk 
Acked-by: Markus Armbruster 

--

v2:
 - renamed replay_break qmp command into replay-break
   (suggested by Eric Blake)
v7:
 - introduces replay_delete_break command
v9:
 - changed 'step' parameter name to 'icount'
 - moved json stuff to replay.json and updated the description
   (suggested by Markus Armbruster)
v10:
 - updated descriptions (suggested by Markus Armbruster)
---
 hmp-commands.hx   |   34 ++
 hmp.h |2 +
 qapi/replay.json  |   36 +++
 replay/replay-debugging.c |   86 +
 replay/replay-internal.h  |4 ++
 replay/replay.c   |   17 +
 6 files changed, 179 insertions(+)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index 9b4035965c..1eb07e7bcc 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1911,6 +1911,40 @@ ETEXI
 STEXI
 @item qom-set @var{path} @var{property} @var{value}
 Set QOM property @var{property} of object at location @var{path} to value 
@var{value}
+ETEXI
+
+{
+.name   = "replay_break",
+.args_type  = "icount:i",
+.params = "icount",
+.help   = "set breakpoint at the specified instruction count",
+.cmd= hmp_replay_break,
+},
+
+STEXI
+@item replay_break @var{icount}
+@findex replay_break
+Set replay breakpoint at instruction count @var{icount}.
+Execution stops when the specified instruction is reached.
+There can be at most one breakpoint. When breakpoint is set, any prior
+one is removed.  The breakpoint may be set only in replay mode and only
+"in the future", i.e. at instruction counts greater than the current one.
+The current instruction count can be observed with 'info replay'.
+ETEXI
+
+{
+.name   = "replay_delete_break",
+.args_type  = "",
+.params = "",
+.help   = "removes replay breakpoint",
+.cmd= hmp_replay_delete_break,
+},
+
+STEXI
+@item replay_delete_break
+@findex replay_delete_break
+Remove replay breakpoint which was previously set with replay_break.
+The command is ignored when there are no replay breakpoints.
 ETEXI
 
 {
diff --git a/hmp.h b/hmp.h
index c53e14e917..c3e7354dd3 100644
--- a/hmp.h
+++ b/hmp.h
@@ -151,5 +151,7 @@ void hmp_info_vm_generation_id(Monitor *mon, const QDict 
*qdict);
 void hmp_info_memory_size_summary(Monitor *mon, const QDict *qdict);
 void hmp_info_sev(Monitor *mon, const QDict *qdict);
 void hmp_info_replay(Monitor *mon, const QDict *qdict);
+void hmp_replay_break(Monitor *mon, const QDict *qdict);
+void hmp_replay_delete_break(Monitor *mon, const QDict *qdict);
 
 #endif
diff --git a/qapi/replay.json b/qapi/replay.json
index 3f29c3c1fe..425e053e4d 100644
--- a/qapi/replay.json
+++ b/qapi/replay.json
@@ -63,3 +63,39 @@
 ##
 { 'command': 'query-replay',
   'returns': 'ReplayInfo' }
+
+##
+# @replay-break:
+#
+# Set replay breakpoint at instruction count @icount.
+# Execution stops when the specified instruction is reached.
+# There can be at most one breakpoint. When breakpoint is set, any prior
+# one is removed.  The breakpoint may be set only in replay mode and only
+# "in the future", i.e. at instruction counts greater than the current one.
+# The current instruction count can be observed with @query-replay.
+#
+# @icount: instruction count to stop at
+#
+# Since: 4.1
+#
+# Example:
+#
+# -> { "execute": "replay-break", "data": { "icount": 220414 } }
+#
+##
+{ 'command': 'replay-break', 'data': { 'icount': 'int' } }
+
+##
+# @replay-delete-break:
+#
+# Remove replay breakpoint which was set with @replay-break.
+# The command is ignored when there are no replay breakpoints.
+#
+# Since: 4.1
+#
+# Example:
+#
+# -> { "execute": "replay-delete-break" }
+#
+##
+{ 'command': 'replay-delete-break' }
diff --git a/replay/replay-debugging.c b/replay/replay-debugging.c
index 51f1c4d82d..a94685e437 100644
--- a/replay/replay-debugging.c
+++ b/replay/replay-debugging.c
@@ -16,6 +16,8 @@
 #include "hmp.h"
 #include "monitor/monitor.h"
 #include "qapi/qapi-commands-replay.h"
+#include "qapi/qmp/qdict.h"
+#include "qemu/timer.h"
 
 void hmp_info_replay(Monitor *mon, const QDict *qdict)
 {
@@ -41,3 +43,87 @@ ReplayInfo *qmp_query_replay(Error **errp)
 retval->icount = replay_get_current_step();
 return retval;
 }
+
+static void replay_break(uint64_t icount, QEMUTimerCB callback, void *opaque)
+{
+assert(replay_mode == REPLAY_MODE_PLAY);
+assert(replay_mutex_locked());
+assert(replay_break_icount >= replay_get_curr

[Qemu-devel] [PATCH for-4.1 16/24] gdbstub: add reverse step support in replay mode

2019-04-22 Thread Pavel Dovgalyuk

From: Pavel Dovgalyuk 

GDB remote protocol supports two reverse debugging commands:
reverse step and reverse continue.
This patch adds support of the first one to the gdbstub.
Reverse step is intended to step one instruction in the backwards
direction. This is not possible in regular execution.
But replayed execution is deterministic, therefore we can load one of
the prior snapshots and proceed to the desired step. It is equivalent
to stepping one instruction back.
There should be at least one snapshot preceding the debugged part of
the replay log.

Signed-off-by: Pavel Dovgalyuk 
---
 accel/tcg/translator.c|1 +
 cpus.c|   14 +++---
 exec.c|7 +++
 gdbstub.c |   44 +---
 include/sysemu/replay.h   |   11 +++
 replay/replay-debugging.c |   33 +
 stubs/replay.c|5 +
 7 files changed, 109 insertions(+), 6 deletions(-)

diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index afd0a49ea6..33a543e82f 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -17,6 +17,7 @@
 #include "exec/gen-icount.h"
 #include "exec/log.h"
 #include "exec/translator.h"
+#include "sysemu/replay.h"
 
 /* Pairs with tcg_clear_temp_count.
To be called by #TranslatorOps.{translate_insn,tb_stop} if
diff --git a/cpus.c b/cpus.c
index 5ff61fbf53..44d0b23435 100644
--- a/cpus.c
+++ b/cpus.c
@@ -1104,9 +1104,17 @@ static bool cpu_can_run(CPUState *cpu)
 
 static void cpu_handle_guest_debug(CPUState *cpu)
 {
-gdb_set_stop_cpu(cpu);
-qemu_system_debug_request();
-cpu->stopped = true;
+if (!replay_running_debug()) {
+gdb_set_stop_cpu(cpu);
+qemu_system_debug_request();
+cpu->stopped = true;
+} else {
+if (!cpu->singlestep_enabled) {
+cpu_single_step(cpu, SSTEP_ENABLE);
+} else {
+cpu_single_step(cpu, 0);
+}
+}
 }
 
 #ifdef CONFIG_LINUX
diff --git a/exec.c b/exec.c
index 6ab62f4eee..4f0d24214e 100644
--- a/exec.c
+++ b/exec.c
@@ -2772,6 +2772,13 @@ static void check_watchpoint(int offset, int len, 
MemTxAttrs attrs, int flags)
 QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
 if (cpu_watchpoint_address_matches(wp, vaddr, len)
 && (wp->flags & flags)) {
+if (replay_running_debug()) {
+/*
+ * Don't process the watchpoints when we are
+ * in a reverse debugging operation.
+ */
+return;
+}
 if (flags == BP_MEM_READ) {
 wp->flags |= BP_WATCHPOINT_HIT_READ;
 } else {
diff --git a/gdbstub.c b/gdbstub.c
index d54abd17cc..e49e0f6c65 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -39,6 +39,7 @@
 #include "sysemu/kvm.h"
 #include "exec/semihost.h"
 #include "exec/exec-all.h"
+#include "sysemu/replay.h"
 
 #ifdef CONFIG_USER_ONLY
 #define GDB_ATTACHED "0"
@@ -344,6 +345,20 @@ typedef struct GDBState {
  */
 static int sstep_flags = SSTEP_ENABLE|SSTEP_NOIRQ|SSTEP_NOTIMER;
 
+/* Retrieves flags for single step mode. */
+static int get_sstep_flags(void)
+{
+/*
+ * In replay mode all events written into the log should be replayed.
+ * That is why NOIRQ flag is removed in this mode.
+ */
+if (replay_mode != REPLAY_MODE_NONE) {
+return SSTEP_ENABLE;
+} else {
+return sstep_flags;
+}
+}
+
 static GDBState *gdbserver_state;
 
 bool gdb_has_xml;
@@ -434,7 +449,7 @@ static int gdb_continue_partial(GDBState *s, char 
*newstates)
 CPU_FOREACH(cpu) {
 if (newstates[cpu->cpu_index] == 's') {
 trace_gdbstub_op_stepping(cpu->cpu_index);
-cpu_single_step(cpu, sstep_flags);
+cpu_single_step(cpu, get_sstep_flags());
 }
 }
 s->running_state = 1;
@@ -453,7 +468,7 @@ static int gdb_continue_partial(GDBState *s, char 
*newstates)
 break; /* nothing to do here */
 case 's':
 trace_gdbstub_op_stepping(cpu->cpu_index);
-cpu_single_step(cpu, sstep_flags);
+cpu_single_step(cpu, get_sstep_flags());
 cpu_resume(cpu);
 flag = 1;
 break;
@@ -1424,9 +1439,28 @@ static int gdb_handle_packet(GDBState *s, const char 
*line_buf)
 addr = strtoull(p, (char **)&p, 16);
 gdb_set_cpu_pc(s, addr);
 }
-cpu_single_step(s->c_cpu, sstep_flags);
+cpu_single_step(s->c_cpu, get_sstep_flags());
 gdb_continue(s);
 return RS_IDLE;
+case 'b':
+/* Backward debugging commands */
+if (replay_mode == REPLAY_MODE_PLAY) {
+switch (*p) {
+case 's':
+if (replay_reverse_step()) {
+gdb_continue(s);
+return RS_IDLE;
+} else {
+put_packet(s, "

[Qemu-devel] [PATCH for-4.1 13/24] replay: implement replay-seek command

2019-04-22 Thread Pavel Dovgalyuk

From: Pavel Dovgalyuk 

This patch adds hmp/qmp commands replay_seek/replay-seek that proceed
the execution to the specified instruction count.
The command automatically loads nearest snapshot and replays the execution
to find the desired instruction count.

Signed-off-by: Pavel Dovgalyuk 
Acked-by: Markus Armbruster 

--

v2:
 - renamed replay_seek qmp command into replay-seek
   (suggested by Eric Blake)
v7:
 - small fixes related to Markus Armbruster's review
v9:
 - changed 'step' parameter name to 'icount'
 - moved json stuff to replay.json and updated the description
   (suggested by Markus Armbruster)
v10:
 - updated the descriptions
---
 hmp-commands.hx   |   19 +
 hmp.h |1 
 qapi/replay.json  |   20 ++
 replay/replay-debugging.c |   92 +
 4 files changed, 132 insertions(+)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index 1eb07e7bcc..88a2d32110 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1945,6 +1945,25 @@ STEXI
 @findex replay_delete_break
 Remove replay breakpoint which was previously set with replay_break.
 The command is ignored when there are no replay breakpoints.
+ETEXI
+
+{
+.name   = "replay_seek",
+.args_type  = "icount:i",
+.params = "icount",
+.help   = "replay execution to the specified instruction count",
+.cmd= hmp_replay_seek,
+},
+
+STEXI
+@item replay_seek @var{icount}
+@findex replay_seek
+Automatically proceed to the instruction count @var{icount}, when
+replaying the execution. The command automatically loads nearest
+snapshot and replays the execution to find the desired instruction.
+When there is no preceding snapshot or the execution is not replayed,
+then the command fails.
+icount for the reference may be observed with 'info replay' command.
 ETEXI
 
 {
diff --git a/hmp.h b/hmp.h
index c3e7354dd3..c67a911f2e 100644
--- a/hmp.h
+++ b/hmp.h
@@ -153,5 +153,6 @@ void hmp_info_sev(Monitor *mon, const QDict *qdict);
 void hmp_info_replay(Monitor *mon, const QDict *qdict);
 void hmp_replay_break(Monitor *mon, const QDict *qdict);
 void hmp_replay_delete_break(Monitor *mon, const QDict *qdict);
+void hmp_replay_seek(Monitor *mon, const QDict *qdict);
 
 #endif
diff --git a/qapi/replay.json b/qapi/replay.json
index 425e053e4d..aa768d496c 100644
--- a/qapi/replay.json
+++ b/qapi/replay.json
@@ -99,3 +99,23 @@
 #
 ##
 { 'command': 'replay-delete-break' }
+
+##
+# @replay-seek:
+#
+# Automatically proceed to the instruction count @icount, when
+# replaying the execution. The command automatically loads nearest
+# snapshot and replays the execution to find the desired instruction.
+# When there is no preceding snapshot or the execution is not replayed,
+# then the command fails.
+# icount for the reference may be obtained with @query-replay command.
+#
+# @icount: target instruction count
+#
+# Since: 4.1
+#
+# Example:
+#
+# -> { "execute": "replay-seek", "data": { "icount": 220414 } }
+##
+{ 'command': 'replay-seek', 'data': { 'icount': 'int' } }
diff --git a/replay/replay-debugging.c b/replay/replay-debugging.c
index a94685e437..e3821ab1ba 100644
--- a/replay/replay-debugging.c
+++ b/replay/replay-debugging.c
@@ -18,6 +18,8 @@
 #include "qapi/qapi-commands-replay.h"
 #include "qapi/qmp/qdict.h"
 #include "qemu/timer.h"
+#include "block/snapshot.h"
+#include "migration/snapshot.h"
 
 void hmp_info_replay(Monitor *mon, const QDict *qdict)
 {
@@ -127,3 +129,93 @@ void hmp_replay_delete_break(Monitor *mon, const QDict 
*qdict)
 return;
 }
 }
+
+static char *replay_find_nearest_snapshot(int64_t icount,
+  int64_t *snapshot_icount)
+{
+BlockDriverState *bs;
+QEMUSnapshotInfo *sn_tab;
+QEMUSnapshotInfo *nearest = NULL;
+char *ret = NULL;
+int nb_sns, i;
+AioContext *aio_context;
+
+*snapshot_icount = -1;
+
+bs = bdrv_all_find_vmstate_bs();
+if (!bs) {
+goto fail;
+}
+aio_context = bdrv_get_aio_context(bs);
+
+aio_context_acquire(aio_context);
+nb_sns = bdrv_snapshot_list(bs, &sn_tab);
+aio_context_release(aio_context);
+
+for (i = 0; i < nb_sns; i++) {
+if (bdrv_all_find_snapshot(sn_tab[i].name, &bs) == 0) {
+if (sn_tab[i].icount != -1ULL
+&& sn_tab[i].icount <= icount
+&& (!nearest || nearest->icount < sn_tab[i].icount)) {
+nearest = &sn_tab[i];
+}
+}
+}
+if (nearest) {
+ret = g_strdup(nearest->name);
+*snapshot_icount = nearest->icount;
+}
+g_free(sn_tab);
+
+fail:
+return ret;
+}
+
+static void replay_seek(int64_t icount, QEMUTimerCB callback, Error **errp)
+{
+char *snapshot = NULL;
+int64_t snapshot_icount;
+
+if (replay_mode != REPLAY_MODE_PLAY) {
+error_setg(errp, "replay must be enabled to seek");
+return;
+}
+if (!replay_

1 2 >

1 - 100 of 130 matches

Mail list logo