[Qemu-devel] [PULL 3/4] hw/ppc/spapr: Fix the selection of the processor features

2016-10-12 Thread David Gibson
From: Thomas Huth 

The current code uses pa_features_206 for POWERPC_MMU_2_06, and
for everything else, it uses pa_features_207. This is bad in some
cases because there is also a "degraded" MMU version of ISA 2.06,
called POWERPC_MMU_2_06a, which should of course use the flags for
2.06 instead. And there is also the possibility that the user runs
the pseries machine with a POWER5+ or even 970 processor. In that
case we certainly do not want to set the flags for 2.07, and rather
simply skip the setting of the pa-features property instead.

Signed-off-by: Thomas Huth 
Reviewed-by: Cédric Le Goater 
Signed-off-by: David Gibson 
(cherry picked from commit 4cbec30d769a73853b60dc7f275e6e7da9ab5162)
---
 hw/ppc/spapr.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 36d9077..9f0d99b 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -607,12 +607,19 @@ static void spapr_populate_pa_features(CPUPPCState *env, 
void *fdt, int offset)
 uint8_t *pa_features;
 size_t pa_size;
 
-if (env->mmu_model == POWERPC_MMU_2_06) {
+switch (env->mmu_model) {
+case POWERPC_MMU_2_06:
+case POWERPC_MMU_2_06a:
 pa_features = pa_features_206;
 pa_size = sizeof(pa_features_206);
-} else { /* env->mmu_model == POWERPC_MMU_2_07 */
+break;
+case POWERPC_MMU_2_07:
+case POWERPC_MMU_2_07a:
 pa_features = pa_features_207;
 pa_size = sizeof(pa_features_207);
+break;
+default:
+return;
 }
 
 if (env->ci_large_pages) {
-- 
2.7.4




[Qemu-devel] [PULL 1/4] linux-headers: update

2016-10-12 Thread David Gibson
From: Cornelia Huck 

Update headers against 4.8-rc2.

Signed-off-by: Cornelia Huck 
---
 include/standard-headers/linux/input-event-codes.h | 32 
 include/standard-headers/linux/input.h |  1 +
 include/standard-headers/linux/virtio_config.h | 10 ++-
 include/standard-headers/linux/virtio_ids.h|  1 +
 include/standard-headers/linux/virtio_net.h|  3 +
 include/standard-headers/linux/virtio_vsock.h  | 94 ++
 linux-headers/asm-arm/kvm.h|  4 +-
 linux-headers/asm-arm64/kvm.h  |  2 +
 linux-headers/asm-s390/kvm.h   | 41 ++
 linux-headers/asm-x86/unistd_x32.h |  4 +-
 linux-headers/linux/kvm.h  | 18 -
 linux-headers/linux/vhost.h| 33 
 12 files changed, 236 insertions(+), 7 deletions(-)
 create mode 100644 include/standard-headers/linux/virtio_vsock.h

diff --git a/include/standard-headers/linux/input-event-codes.h 
b/include/standard-headers/linux/input-event-codes.h
index 354f0de..5c10f7e 100644
--- a/include/standard-headers/linux/input-event-codes.h
+++ b/include/standard-headers/linux/input-event-codes.h
@@ -611,6 +611,37 @@
 #define KEY_KBDINPUTASSIST_ACCEPT  0x264
 #define KEY_KBDINPUTASSIST_CANCEL  0x265
 
+/* Diagonal movement keys */
+#define KEY_RIGHT_UP   0x266
+#define KEY_RIGHT_DOWN 0x267
+#define KEY_LEFT_UP0x268
+#define KEY_LEFT_DOWN  0x269
+
+#define KEY_ROOT_MENU  0x26a /* Show Device's Root Menu */
+/* Show Top Menu of the Media (e.g. DVD) */
+#define KEY_MEDIA_TOP_MENU 0x26b
+#define KEY_NUMERIC_11 0x26c
+#define KEY_NUMERIC_12 0x26d
+/*
+ * Toggle Audio Description: refers to an audio service that helps blind and
+ * visually impaired consumers understand the action in a program. Note: in
+ * some countries this is referred to as "Video Description".
+ */
+#define KEY_AUDIO_DESC 0x26e
+#define KEY_3D_MODE0x26f
+#define KEY_NEXT_FAVORITE  0x270
+#define KEY_STOP_RECORD0x271
+#define KEY_PAUSE_RECORD   0x272
+#define KEY_VOD0x273 /* Video on Demand */
+#define KEY_UNMUTE 0x274
+#define KEY_FASTREVERSE0x275
+#define KEY_SLOWREVERSE0x276
+/*
+ * Control a data application associated with the currently viewed channel,
+ * e.g. teletext or data broadcast application (MHEG, MHP, HbbTV, etc.)
+ */
+#define KEY_DATA   0x275
+
 #define BTN_TRIGGER_HAPPY  0x2c0
 #define BTN_TRIGGER_HAPPY1 0x2c0
 #define BTN_TRIGGER_HAPPY2 0x2c1
@@ -749,6 +780,7 @@
 #define SW_ROTATE_LOCK 0x0c  /* set = rotate locked/disabled */
 #define SW_LINEIN_INSERT   0x0d  /* set = inserted */
 #define SW_MUTE_DEVICE 0x0e  /* set = device disabled */
+#define SW_PEN_INSERTED0x0f  /* set = pen inserted */
 #define SW_MAX_0x0f
 #define SW_CNT (SW_MAX_+1)
 
diff --git a/include/standard-headers/linux/input.h 
b/include/standard-headers/linux/input.h
index a52b202..7361a16 100644
--- a/include/standard-headers/linux/input.h
+++ b/include/standard-headers/linux/input.h
@@ -244,6 +244,7 @@ struct input_mask {
 #define BUS_ATARI  0x1B
 #define BUS_SPI0x1C
 #define BUS_RMI0x1D
+#define BUS_CEC0x1E
 
 /*
  * MT_TOOL types
diff --git a/include/standard-headers/linux/virtio_config.h 
b/include/standard-headers/linux/virtio_config.h
index b30d0cb..b777069 100644
--- a/include/standard-headers/linux/virtio_config.h
+++ b/include/standard-headers/linux/virtio_config.h
@@ -49,7 +49,7 @@
  * transport being used (eg. virtio_ring), the rest are per-device feature
  * bits. */
 #define VIRTIO_TRANSPORT_F_START   28
-#define VIRTIO_TRANSPORT_F_END 33
+#define VIRTIO_TRANSPORT_F_END 34
 
 #ifndef VIRTIO_CONFIG_NO_LEGACY
 /* Do we get callbacks when the ring is completely used, even if we've
@@ -63,4 +63,12 @@
 /* v1.0 compliant. */
 #define VIRTIO_F_VERSION_1 32
 
+/*
+ * If clear - device has the IOMMU bypass quirk feature.
+ * If set - use platform tools to detect the IOMMU.
+ *
+ * Note the reverse polarity (compared to most other features),
+ * this is for compatibility with legacy systems.
+ */
+#define VIRTIO_F_IOMMU_PLATFORM33
 #endif /* _LINUX_VIRTIO_CONFIG_H */
diff --git a/include/standard-headers/linux/virtio_ids.h 
b/include/standard-headers/linux/virtio_ids.h
index 77925f5..3228d58 100644
--- a/include/standard-headers/linux/virtio_ids.h
+++ b/include/standard-headers/linux/virtio_ids.h

Re: [Qemu-devel] [PATCH v4 03/20] ppc/pnv: add a core mask to PnvChip

2016-10-12 Thread David Gibson
On Wed, Oct 12, 2016 at 10:53:12AM +0200, Cédric Le Goater wrote:
> On 10/11/2016 12:24 PM, David Gibson wrote:
> > On Mon, Oct 10, 2016 at 02:56:25PM +0200, Cédric Le Goater wrote:
> >>
>  @@ -227,11 +227,44 @@ static void ppc_powernv_init(MachineState *machine)
>   snprintf(chip_name, sizeof(chip_name), "chip[%d]", 
>  CHIP_HWID(i));
>   object_property_add_child(OBJECT(pnv), chip_name, chip, 
>  _fatal);
>   object_property_set_int(chip, CHIP_HWID(i), "chip-id", 
>  _fatal);
>  +object_property_set_int(chip, smp_cores, "nr-cores", 
>  _fatal);
>  +/*
>  + * We could customize cores_mask for the chip here. May be
>  + * using a powernv machine property, like 'num-chips'. Let the
>  + * chip choose the default for now.
> >>>
> >>> I don't think you need any special mechanism for this.  If you just
> >>> remove this explicit assignment the chip default will apply, but the
> >>> user can alter it using -global.
> >>
> >> Using a command line with :
> >>
> >>-global powernv-chip-POWER8.cores-mask=0x7070
> >>
> >> would work for one chip but not for more. Let's start with that, I will 
> >> remove the comment for now. multiple chip is for later.
> > 
> > Well, it works for more than one chip if you want the same mask for
> > each of them.  If you want different masks, I think you can still do
> > it with -set, but working out the right arguments can be a PITA.
> 
> That would be the best solution but I did not find a way to address one
> chip object to do a "qom set".

Yeah, that's what I mean: working out the arguments can be tricky.

> 
> (qemu) info qom-tree 
> /machine (powernv-machine)
>   /unattached (container)
> /system[0] (qemu:memory-region)
> /sysbus (System)
> /ppc_powernv.ram[0] (qemu:memory-region)
> /io[0] (qemu:memory-region)
>   /peripheral-anon (container)
>   /peripheral (container)
>   /chip[1] (powernv-chip-POWER8)
> ...
>   /chip[0] (powernv-chip-POWER8)
> ...
>  
> We will have a similar need with the ram to spread the contents on
> the chips. 

Sure.  Let's cross these bridges when we come to them.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


[Qemu-devel] [PULL 4/4] ppc: Check the availability of transactional memory

2016-10-12 Thread David Gibson
From: Thomas Huth 

KVM-PR currently does not support transactional memory, and the
implementation in TCG is just a fake. We should not announce TM
support in the ibm,pa-features property when running on such a
system, so disable it by default and only enable it if the KVM
implementation supports it (i.e. recent versions of KVM-HV).
These changes are based on some earlier work from Anton Blanchard
(thanks!).

Signed-off-by: Thomas Huth 
Reviewed-by: Cédric Le Goater 
Signed-off-by: David Gibson 
(cherry picked from commit bac3bf287ab60e264b636f5f00c116a19b655762)
---
 hw/ppc/spapr.c   | 5 -
 target-ppc/kvm.c | 7 +++
 target-ppc/kvm_ppc.h | 6 ++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 9f0d99b..82723d1 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -603,7 +603,7 @@ static void spapr_populate_pa_features(CPUPPCState *env, 
void *fdt, int offset)
 0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0,
 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-0x80, 0x00, 0x80, 0x00, 0x80, 0x00 };
+0x80, 0x00, 0x80, 0x00, 0x00, 0x00 };
 uint8_t *pa_features;
 size_t pa_size;
 
@@ -632,6 +632,9 @@ static void spapr_populate_pa_features(CPUPPCState *env, 
void *fdt, int offset)
  */
 pa_features[3] |= 0x20;
 }
+if (kvmppc_has_cap_htm() && pa_size > 24) {
+pa_features[24] |= 0x80;/* Transactional memory support */
+}
 
 _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
 }
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index dcb68b9..f26a141 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -79,6 +79,7 @@ static int cap_ppc_watchdog;
 static int cap_papr;
 static int cap_htab_fd;
 static int cap_fixup_hcalls;
+static int cap_htm; /* Hardware transactional memory support */
 
 static uint32_t debug_inst_opcode;
 
@@ -121,6 +122,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
  * only activated after this by kvmppc_set_papr() */
 cap_htab_fd = kvm_check_extension(s, KVM_CAP_PPC_HTAB_FD);
 cap_fixup_hcalls = kvm_check_extension(s, KVM_CAP_PPC_FIXUP_HCALL);
+cap_htm = kvm_vm_check_extension(s, KVM_CAP_PPC_HTM);
 
 if (!cap_interrupt_level) {
 fprintf(stderr, "KVM: Couldn't find level irq capability. Expect the "
@@ -2339,6 +2341,11 @@ bool kvmppc_has_cap_fixup_hcalls(void)
 return cap_fixup_hcalls;
 }
 
+bool kvmppc_has_cap_htm(void)
+{
+return cap_htm;
+}
+
 static PowerPCCPUClass *ppc_cpu_get_family_class(PowerPCCPUClass *pcc)
 {
 ObjectClass *oc = OBJECT_CLASS(pcc);
diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
index 5461d10..e45c815 100644
--- a/target-ppc/kvm_ppc.h
+++ b/target-ppc/kvm_ppc.h
@@ -54,6 +54,7 @@ void kvmppc_hash64_free_pteg(uint64_t token);
 void kvmppc_hash64_write_pte(CPUPPCState *env, target_ulong pte_index,
  target_ulong pte0, target_ulong pte1);
 bool kvmppc_has_cap_fixup_hcalls(void);
+bool kvmppc_has_cap_htm(void);
 int kvmppc_enable_hwrng(void);
 int kvmppc_put_books_sregs(PowerPCCPU *cpu);
 PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void);
@@ -244,6 +245,11 @@ static inline bool kvmppc_has_cap_fixup_hcalls(void)
 abort();
 }
 
+static inline bool kvmppc_has_cap_htm(void)
+{
+return false;
+}
+
 static inline int kvmppc_enable_hwrng(void)
 {
 return -1;
-- 
2.7.4




Re: [Qemu-devel] [Qemu-block] [PATCH v4 0/3] iotests: Fix test 162

2016-10-12 Thread Hao QingFeng



在 2016-10-13 3:46, Max Reitz 写道:

On 12.10.2016 10:55, Hao QingFeng wrote:

Max,

Just a common question for this case, if sshx block driver wasn't built
into qemu-img, this case would fail as below:

Good point, and thanks for bringing it up, but it's not directly linked
to this series other than by its subject, of course, so I'd rather add a
fix on top.

Thanks and sorry for sending to the improper mail series.

exec /home/haoqf/KVMonz/qemu/tests/qemu-iotests/../../qemu-img info
--image-opts driver=ssh,host=localhost,port=0.42,path=/foo
qemu-img: Could not open
'driver=ssh,host=localhost,port=0.42,path=/foo': Unknown driver 'ssh'

Adding 162.notrun can bypass this case but it would skip it even if
qemu-img has sshx block driver, in which case I think it should be run.

So how about adding a script to dynamically check at runtime if the
current env qemu-img can meet the requirement to run the test or not?

Unfortunately, the list of block drivers printed by "qemu-img --help" will not contain ssh
if ssh is built as a module, which is possible.
Actually I am not sure if I understood it. Do you mean 
"CONFIG_LIBSSH2=m" set

rather than "CONFIG_LIBSSH2=y" in config-host.mak? But in the configure it's
set to be "CONFIG_LIBSSH2=y":
if test "$libssh2" = "yes" ; then
  echo "CONFIG_LIBSSH2=y" >> $config_host_mak
  echo "LIBSSH2_CFLAGS=$libssh2_cflags" >> $config_host_mak
  echo "LIBSSH2_LIBS=$libssh2_libs" >> $config_host_mak
fi
Meanwhile I changed it to "CONFIG_LIBSSH2=m", re-ran configure and rebuilt qemu;
"qemu-img --help" still lists ssh.

This is a bug that should be fixed, but I'd rather do so in a separate
series from this one.

In any case, once it is fixed I'd rather just take the approach quorum
tests take already (e.g. test 081), which is something like:

test_ssh=$($QEMU_IMG --help | grep '^Supported formats:.* ssh\( \|$\)')
[ "$test_ssh" = "" ] && _notrun "ssh support required"

Cool. Agree with this like what was done in 081.  thanks

Max



--
QingFeng Hao




Re: [Qemu-devel] [PATCH v4 08/20] ppc/pnv: add a LPC controller

2016-10-12 Thread David Gibson
On Mon, Oct 03, 2016 at 09:24:44AM +0200, Cédric Le Goater wrote:
> From: Benjamin Herrenschmidt 
> 
> The LPC (Low Pin Count) interface on a POWER8 is made accessible to
> the system through the ADU (XSCOM interface). This interface is part
> of a set of units connected together via a local OPB (On-Chip Peripheral
> Bus) which act as a bridge between the ADU and the off chip LPC
> endpoints, like external flash modules.
> 
> The most important units of this OPB are :
>  - OPB Master: contains the ADU slave logic, a set of internal
>registers and the logic to control the OPB.
>  - LPCHC (LPC HOST Controller): which implements a OPB Slave, a set of
>internal registers and the LPC HOST Controller to control the LPC
>interface.
> 
> Four address spaces are provided to the ADU :
>  - LPC Bus Firmware Memory
>  - LPC Bus Memory
>  - LPC Bus I/O (ISA bus)
>  - and the registers for the OPB Master and the LPC Host Controller
> 
> On POWER8, an intermediate hop is necessary to reach the OPB, through
> a unit called the ECCB. OPB commands are simply mangled in ECCB write
> commands.
> 
> On POWER9, the OPB master address space can be accessed via MMIO. The
> logic is same but the code will be simpler as the XSCOM and ECCB hops
> are not necessary anymore.
> 
> This version of the LPC controller model doesn't yet implement support
> for the SerIRQ deserializer present in the Naples version of the chip
> though some preliminary work is there.
> 
> Signed-off-by: Benjamin Herrenschmidt 
> [clg: - updated for qemu-2.7
>   - ported on latest PowerNV patchset
>   - changed the XSCOM interface to fit new model
>   - QOMified the model
>   - moved the ISA hunks in another patch
>   - removed printf logging
>   - added a couple of UNIMP logging
>   - rewrote commit log ]
> Signed-off-by: Cédric Le Goater 

It looks reasonable as far as it goes.

I don't see anything wiring this up to qemu's common ISA
infrastructure, which seems a bit odd.

> ---
> 
>  Todo:
> 
>  - rework the address_space read/write ops as they should be shared
>with the P9 support.
> 
>  Changes since v3:
> 
>  - rewrote commit log
>  - fixed appendprop
>  
>  hw/ppc/Makefile.objs   |   2 +-
>  hw/ppc/pnv.c   |   9 +
>  hw/ppc/pnv_lpc.c   | 471 
> +
>  include/hw/ppc/pnv.h   |   3 +
>  include/hw/ppc/pnv_lpc.h   |  67 +++
>  include/hw/ppc/pnv_xscom.h |   3 +
>  6 files changed, 554 insertions(+), 1 deletion(-)
>  create mode 100644 hw/ppc/pnv_lpc.c
>  create mode 100644 include/hw/ppc/pnv_lpc.h
> 
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index 08c213c40684..ebc72af0a7c6 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -6,7 +6,7 @@ obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o 
> spapr_rtas.o
>  obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
>  obj-$(CONFIG_PSERIES) += spapr_cpu_core.o
>  # IBM PowerNV
> -obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o
> +obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o
>  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
>  obj-y += spapr_pci_vfio.o
>  endif
> diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
> index ffe245fe59d2..e41244294435 100644
> --- a/hw/ppc/pnv.c
> +++ b/hw/ppc/pnv.c
> @@ -563,6 +563,9 @@ static void pnv_chip_init(Object *obj)
>  PnvChipClass *pcc = PNV_CHIP_GET_CLASS(chip);
>  
>  chip->xscom_base = pcc->xscom_base;
> +
> +object_initialize(>lpc, sizeof(chip->lpc), TYPE_PNV_LPC);
> +object_property_add_child(obj, "lpc", OBJECT(>lpc), NULL);
>  }
>  
>  static void pnv_chip_realize(DeviceState *dev, Error **errp)
> @@ -626,6 +629,12 @@ static void pnv_chip_realize(DeviceState *dev, Error 
> **errp)
>   _CORE(pnv_core)->xscom_regs);
>  }
>  g_free(typename);
> +
> +/* Create LPC controller */
> +object_property_set_bool(OBJECT(>lpc), true, "realized",
> + _fatal);
> +memory_region_add_subregion(>xscom, PNV_XSCOM_LPC_BASE << 3,
> +>lpc.xscom_regs);
>  }
>  
>  static Property pnv_chip_properties[] = {
> diff --git a/hw/ppc/pnv_lpc.c b/hw/ppc/pnv_lpc.c
> new file mode 100644
> index ..210cc1cff167
> --- /dev/null
> +++ b/hw/ppc/pnv_lpc.c
> @@ -0,0 +1,471 @@
> +/*
> + * QEMU PowerPC PowerNV LPC controller
> + *
> + * Copyright (c) 2016, IBM Corporation.
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or 

[Qemu-devel] [PULL 0/4] ppc patches for qemu-2.7 stable branch

2016-10-12 Thread David Gibson
The following changes since commit 1dc33ed90bf1fe1c2014dffa0d9e863c520d953a:

  Update version for v2.7.0 release (2016-09-02 13:44:11 +0100)

are available in the git repository at:

  git://github.com/dgibson/qemu.git tags/ppc-for-2.7-20161013

for you to fetch changes up to 2e68f28854f0120c9a938a61b64aaf1eaecb162b:

  ppc: Check the availability of transactional memory (2016-10-13 12:58:06 
+1100)


qemu-2.7 (stable): ppc patch queue 2016-10-13

TCG for ppc does not properly implement hardware transactional memory.
It has a stub implementation in which transactions always fail.
Unfortunately in v2.7.0, HTM is advertised as being available to
guests, which means guests may incorrectly attempt to use it and hang.

This has been the case for a while, but has become more urgent with
recent (guest) Linux kernel versions which attempt to lazily enable
TM.  Under TCG that now triggers the problem regularly, instead of
just when running a TM aware userspace program.

The problem is already fixed in the 2.8/master branch, by correctly
advertising HTM as not being available with TCG.  This series
backports the relevant patches to the qemu-2.7 stable branch to fix
the problem there.


Cornelia Huck (1):
  linux-headers: update

Thomas Huth (3):
  hw/ppc/spapr: Move code related to "ibm,pa-features" to a separate 
function
  hw/ppc/spapr: Fix the selection of the processor features
  ppc: Check the availability of transactional memory

 hw/ppc/spapr.c | 76 ++---
 include/standard-headers/linux/input-event-codes.h | 32 
 include/standard-headers/linux/input.h |  1 +
 include/standard-headers/linux/virtio_config.h | 10 ++-
 include/standard-headers/linux/virtio_ids.h|  1 +
 include/standard-headers/linux/virtio_net.h|  3 +
 include/standard-headers/linux/virtio_vsock.h  | 94 ++
 linux-headers/asm-arm/kvm.h|  4 +-
 linux-headers/asm-arm64/kvm.h  |  2 +
 linux-headers/asm-s390/kvm.h   | 41 ++
 linux-headers/asm-x86/unistd_x32.h |  4 +-
 linux-headers/linux/kvm.h  | 18 -
 linux-headers/linux/vhost.h| 33 
 target-ppc/kvm.c   |  7 ++
 target-ppc/kvm_ppc.h   |  6 ++
 15 files changed, 295 insertions(+), 37 deletions(-)
 create mode 100644 include/standard-headers/linux/virtio_vsock.h



Re: [Qemu-devel] [PATCH v4 06/20] ppc/pnv: add XSCOM infrastructure

2016-10-12 Thread David Gibson
On Mon, Oct 03, 2016 at 09:24:42AM +0200, Cédric Le Goater wrote:
> On a real POWER8 system, the Pervasive Interconnect Bus (PIB) serves
> as a backbone to connect different units of the system. The host
> firmware connects to the PIB through a bridge unit, the
> Alter-Display-Unit (ADU), which gives it access to all the chiplets
> on the PCB network (Pervasive Connect Bus), the PIB acting as the root
> of this network.
> 
> XSCOM (serial communication) is the interface to the sideband bus
> provided by the POWER8 pervasive unit to read and write to chiplets
> resources. This is needed by the host firmware, OPAL and to a lesser
> extent, Linux. This is among others how the PCI Host bridges get
> configured at boot or how the LPC bus is accessed.
> 
> To represent the ADU of a real system, we introduce a specific
> AddressSpace to dispatch XSCOM accesses to the targeted chiplets. The
> translation of an XSCOM address into a PCB register address is
> slightly different between the P9 and the P8. This is handled before
> the dispatch using a 8byte alignment for all.
> 
> To customize the device tree, a QOM InterfaceClass, PnvXScomInterface,
> is provided with a populate() handler. The chip populates the device
> tree by simply looping on its children. Therefore, each model needing
> custom nodes should not forget to declare itself as a child at
> instantiation time.
> 
> Based on previous work done by :
>   Benjamin Herrenschmidt 
> 
> Signed-off-by: Cédric Le Goater 
> ---
> 
>  Changes since v3:
> 
>  - reworked the model to dispatch addresses to the memory regions
>using pcb_addr << 3, which is a no-op for the P9. The benefit is
>that all the address translation work can be done before dispatch
>and the conversion handlers in the chip and in the xscom interface
>are gone.
>
>  - removed the proxy PnnXscom object and extended the PnvChip object
>with an address space for XSCOM and its associated memory region.
>
>  - changed the read/write handlers in the address space to use
>address_space_stq() and address_space_ldq()
>
>  - introduced 'fake' default read/write handlers to handle 'core'
>registers. We can add a real device model when more work needs to
>be done under these.
>
>  - fixed an issue with the monitor doing read/write in the XSCOM
>address space. When under the monitor, we don't have a cpu to
>update the HMER SPR. That might need more work in the long term.
>
>  - introduced a xscom base field to hold the xscom base address as
>it is different on P9
> 
>  - renamed the devnode() handler to populate()
> 
>  Changes since v2:
> 
>  - QOMified the model.
>  
>  - all mappings in main memory space are now gathered in
>pnv_chip_realize() as done on other architectures.
>
>  - removed XScomBus. The parenthood is established through the QOM
>model
>
>  - replaced the XScomDevice with an InterfaceClass : PnvXScomInterface. 
>  - introduced an XSCOM address space to dispatch accesses to the
>chiplets
> 
>  hw/ppc/Makefile.objs   |   2 +-
>  hw/ppc/pnv.c   |  25 +
>  hw/ppc/pnv_xscom.c | 262 
> +
>  include/hw/ppc/pnv.h   |  15 +++
>  include/hw/ppc/pnv_xscom.h |  47 
>  5 files changed, 350 insertions(+), 1 deletion(-)
>  create mode 100644 hw/ppc/pnv_xscom.c
>  create mode 100644 include/hw/ppc/pnv_xscom.h
> 
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index f8c7d1db9ade..08c213c40684 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -6,7 +6,7 @@ obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o 
> spapr_rtas.o
>  obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
>  obj-$(CONFIG_PSERIES) += spapr_cpu_core.o
>  # IBM PowerNV
> -obj-$(CONFIG_POWERNV) += pnv.o pnv_core.o
> +obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o
>  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
>  obj-y += spapr_pci_vfio.o
>  endif
> diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
> index 2376bb222918..5e19b6880387 100644
> --- a/hw/ppc/pnv.c
> +++ b/hw/ppc/pnv.c
> @@ -32,6 +32,8 @@
>  #include "exec/address-spaces.h"
>  #include "qemu/cutils.h"
>  
> +#include "hw/ppc/pnv_xscom.h"
> +
>  #include 
>  
>  #define FDT_MAX_SIZE0x0010
> @@ -218,6 +220,8 @@ static void powernv_populate_chip(PnvChip *chip, void 
> *fdt)
>  size_t typesize = object_type_get_instance_size(typename);
>  int i;
>  
> +pnv_xscom_populate(chip, fdt, 0);
> +
>  for (i = 0; i < chip->nr_cores; i++) {
>  PnvCore *pnv_core = PNV_CORE(chip->cores + i * typesize);
>  
> @@ -450,6 +454,7 @@ static void pnv_chip_power8e_class_init(ObjectClass 
> *klass, void *data)
>  k->chip_cfam_id = 0x221ef0498000ull;  /* P8 Murano DD2.1 */
>  k->cores_mask = POWER8E_CORE_MASK;
>  k->core_pir = pnv_chip_core_pir_p8;
> +k->xscom_base = 

Re: [Qemu-devel] [PATCH v4 08/20] ppc/pnv: add a LPC controller

2016-10-12 Thread David Gibson
On Thu, Oct 13, 2016 at 01:52:55PM +1100, David Gibson wrote:
> On Mon, Oct 03, 2016 at 09:24:44AM +0200, Cédric Le Goater wrote:
> > From: Benjamin Herrenschmidt 
> > 
> > The LPC (Low Pin Count) interface on a POWER8 is made accessible to
> > the system through the ADU (XSCOM interface). This interface is part
> > of a set of units connected together via a local OPB (On-Chip Peripheral
> > Bus) which act as a bridge between the ADU and the off chip LPC
> > endpoints, like external flash modules.
> > 
> > The most important units of this OPB are :
> >  - OPB Master: contains the ADU slave logic, a set of internal
> >registers and the logic to control the OPB.
> >  - LPCHC (LPC HOST Controller): which implements a OPB Slave, a set of
> >internal registers and the LPC HOST Controller to control the LPC
> >interface.
> > 
> > Four address spaces are provided to the ADU :
> >  - LPC Bus Firmware Memory
> >  - LPC Bus Memory
> >  - LPC Bus I/O (ISA bus)
> >  - and the registers for the OPB Master and the LPC Host Controller
> > 
> > On POWER8, an intermediate hop is necessary to reach the OPB, through
> > a unit called the ECCB. OPB commands are simply mangled in ECCB write
> > commands.
> > 
> > On POWER9, the OPB master address space can be accessed via MMIO. The
> > logic is same but the code will be simpler as the XSCOM and ECCB hops
> > are not necessary anymore.
> > 
> > This version of the LPC controller model doesn't yet implement support
> > for the SerIRQ deserializer present in the Naples version of the chip
> > though some preliminary work is there.
> > 
> > Signed-off-by: Benjamin Herrenschmidt 
> > [clg: - updated for qemu-2.7
> >   - ported on latest PowerNV patchset
> >   - changed the XSCOM interface to fit new model
> >   - QOMified the model
> >   - moved the ISA hunks in another patch
> >   - removed printf logging
> >   - added a couple of UNIMP logging
> >   - rewrote commit log ]
> > Signed-off-by: Cédric Le Goater 
> 
> It looks reasonable as far as it goes.
> 
> I don't see anything wiring this up to qemu's common ISA
> infrastructure, which seems a bit odd.

Sorry.. hadn't read the next patch yet.

Reviewed-by: David Gibson 

> 
> > ---
> > 
> >  Todo:
> > 
> >  - rework the address_space read/write ops as they should be shared
> >with the P9 support.
> > 
> >  Changes since v3:
> > 
> >  - rewrote commit log
> >  - fixed appendprop
> >  
> >  hw/ppc/Makefile.objs   |   2 +-
> >  hw/ppc/pnv.c   |   9 +
> >  hw/ppc/pnv_lpc.c   | 471 
> > +
> >  include/hw/ppc/pnv.h   |   3 +
> >  include/hw/ppc/pnv_lpc.h   |  67 +++
> >  include/hw/ppc/pnv_xscom.h |   3 +
> >  6 files changed, 554 insertions(+), 1 deletion(-)
> >  create mode 100644 hw/ppc/pnv_lpc.c
> >  create mode 100644 include/hw/ppc/pnv_lpc.h
> > 
> > diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> > index 08c213c40684..ebc72af0a7c6 100644
> > --- a/hw/ppc/Makefile.objs
> > +++ b/hw/ppc/Makefile.objs
> > @@ -6,7 +6,7 @@ obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o 
> > spapr_rtas.o
> >  obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
> >  obj-$(CONFIG_PSERIES) += spapr_cpu_core.o
> >  # IBM PowerNV
> > -obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o
> > +obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o
> >  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
> >  obj-y += spapr_pci_vfio.o
> >  endif
> > diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
> > index ffe245fe59d2..e41244294435 100644
> > --- a/hw/ppc/pnv.c
> > +++ b/hw/ppc/pnv.c
> > @@ -563,6 +563,9 @@ static void pnv_chip_init(Object *obj)
> >  PnvChipClass *pcc = PNV_CHIP_GET_CLASS(chip);
> >  
> >  chip->xscom_base = pcc->xscom_base;
> > +
> > +object_initialize(>lpc, sizeof(chip->lpc), TYPE_PNV_LPC);
> > +object_property_add_child(obj, "lpc", OBJECT(>lpc), NULL);
> >  }
> >  
> >  static void pnv_chip_realize(DeviceState *dev, Error **errp)
> > @@ -626,6 +629,12 @@ static void pnv_chip_realize(DeviceState *dev, Error 
> > **errp)
> >   _CORE(pnv_core)->xscom_regs);
> >  }
> >  g_free(typename);
> > +
> > +/* Create LPC controller */
> > +object_property_set_bool(OBJECT(>lpc), true, "realized",
> > + _fatal);
> > +memory_region_add_subregion(>xscom, PNV_XSCOM_LPC_BASE << 3,
> > +>lpc.xscom_regs);
> >  }
> >  
> >  static Property pnv_chip_properties[] = {
> > diff --git a/hw/ppc/pnv_lpc.c b/hw/ppc/pnv_lpc.c
> > new file mode 100644
> > index ..210cc1cff167
> > --- /dev/null
> > +++ b/hw/ppc/pnv_lpc.c
> > @@ -0,0 +1,471 @@
> > +/*
> > + * QEMU PowerPC PowerNV LPC controller
> > + *
> > + * Copyright (c) 2016, IBM Corporation.
> > + *
> > + * This library is free 

Re: [Qemu-devel] [PATCH v4 09/20] ppc/pnv: add a ISA bus

2016-10-12 Thread David Gibson
On Mon, Oct 03, 2016 at 09:24:45AM +0200, Cédric Le Goater wrote:
> As Qemu only supports a single instance of the ISA bus, we use the LPC
> controller of chip 0 to create one and plug in a couple of useful
> devices, like a UART and RTC. An IPMI BT device, which is also an ISA
> device, can be defined on the command line to connect an external BMC.
> That is for later.
> 
> The PowerNV machine now has a console. Skiboot should load a kernel
> and jump into it but execution will stop quite early because we lack a
> model for the native XICS controller for the moment :
> 
> [0.00] NR_IRQS:512 nr_irqs:512 16
> [0.00] XICS: Cannot find a Presentation Controller !
> [0.00] [ cut here ]
> [0.00] WARNING: at arch/powerpc/platforms/powernv/setup.c:81
> ...
> [0.00] NIP [c079d65c] pnv_init_IRQ+0x30/0x44
> 
> You can still do a few things under xmon.
> 
> Based on previous work from :
>   Benjamin Herrenschmidt 
> 
> Signed-off-by: Cédric Le Goater 

Reviewed-by: David Gibson 

> ---
>  hw/ppc/pnv.c | 65 
> 
>  include/hw/ppc/pnv.h |  2 ++
>  2 files changed, 67 insertions(+)
> 
> diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
> index e41244294435..4a71b18bf38b 100644
> --- a/hw/ppc/pnv.c
> +++ b/hw/ppc/pnv.c
> @@ -34,6 +34,10 @@
>  
>  #include "hw/ppc/pnv_xscom.h"
>  
> +#include "hw/isa/isa.h"
> +#include "hw/char/serial.h"
> +#include "hw/timer/mc146818rtc.h"
> +
>  #include 
>  
>  #define FDT_MAX_SIZE0x0010
> @@ -302,6 +306,58 @@ static void ppc_powernv_reset(void)
>  cpu_physical_memory_write(POWERNV_FDT_ADDR, fdt, fdt_totalsize(fdt));
>  }
>  
> +/* If we don't use the built-in LPC interrupt deserializer, we need
> + * to provide a set of qirqs for the ISA bus or things will go bad.
> + *
> + * Most machines using pre-Naples chips (without said deserializer)
> + * have a CPLD that will collect the SerIRQ and shoot them as a
> + * single level interrupt to the P8 chip. So let's setup a hook
> + * for doing just that.
> + *
> + * Note: The actual interrupt input isn't emulated yet, this will
> + * come with the PSI bridge model.
> + */
> +static void pnv_lpc_isa_irq_handler_cpld(void *opaque, int n, int level)
> +{
> +/* We don't yet emulate the PSI bridge which provides the external
> + * interrupt, so just drop interrupts on the floor
> + */
> +}
> +
> +static void pnv_lpc_isa_irq_handler(void *opaque, int n, int level)
> +{
> + /* XXX TODO */
> +}
> +
> +static ISABus *pnv_isa_create(PnvChip *chip)
> +{
> +PnvLpcController *lpc = >lpc;
> +ISABus *isa_bus;
> +qemu_irq *irqs;
> +PnvChipClass *pcc = PNV_CHIP_GET_CLASS(chip);
> +
> +/* let isa_bus_new() create its own bridge on SysBus otherwise
> + * devices specified on the command line won't find the bus and
> + * will fail to create.
> + */
> +isa_bus = isa_bus_new(NULL, >isa_mem, >isa_io,
> +  _fatal);
> +
> +/* Not all variants have a working serial irq decoder. If not,
> + * handling of LPC interrupts becomes a platform issue (some
> + * platforms have a CPLD to do it).
> + */
> +if (pcc->chip_type == PNV_CHIP_POWER8NVL) {
> +irqs = qemu_allocate_irqs(pnv_lpc_isa_irq_handler, lpc, 
> ISA_NUM_IRQS);
> +} else {
> +irqs = qemu_allocate_irqs(pnv_lpc_isa_irq_handler_cpld, NULL,
> +  ISA_NUM_IRQS);
> +}
> +
> +isa_bus_irqs(isa_bus, irqs);
> +return isa_bus;
> +}
> +
>  static void ppc_powernv_init(MachineState *machine)
>  {
>  PnvMachineState *pnv = POWERNV_MACHINE(machine);
> @@ -390,6 +446,15 @@ static void ppc_powernv_init(MachineState *machine)
>  object_property_set_bool(chip, true, "realized", _fatal);
>  }
>  g_free(chip_typename);
> +
> +/* Instantiate ISA bus on chip 0 */
> +pnv->isa_bus = pnv_isa_create(pnv->chips[0]);
> +
> +/* Create serial port */
> +serial_hds_isa_init(pnv->isa_bus, MAX_SERIAL_PORTS);
> +
> +/* Create an RTC ISA device too */
> +rtc_init(pnv->isa_bus, 2000, NULL);
>  }
>  
>  /*
> diff --git a/include/hw/ppc/pnv.h b/include/hw/ppc/pnv.h
> index e586ff4e735e..617c3fdd4f06 100644
> --- a/include/hw/ppc/pnv.h
> +++ b/include/hw/ppc/pnv.h
> @@ -110,6 +110,8 @@ typedef struct PnvMachineState {
>  
>  uint32_t  num_chips;
>  PnvChip   **chips;
> +
> +ISABus *isa_bus;
>  } PnvMachineState;
>  
>  #define POWERNV_FDT_ADDR0x0100

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


[Qemu-devel] [PULL 2/4] hw/ppc/spapr: Move code related to "ibm,pa-features" to a separate function

2016-10-12 Thread David Gibson
From: Thomas Huth 

The function spapr_populate_cpu_dt() has become quite big
already, and since we likely have to extend the pa-features
property for every new processor generation, it is nicer
if we put the related code into a separate function.

Signed-off-by: Thomas Huth 
Reviewed-by: Cédric Le Goater 
Signed-off-by: David Gibson 
(cherry picked from commit 230bf719d3a3b144a4ffa441e5d6170ef0ad8999)
---
 hw/ppc/spapr.c | 66 --
 1 file changed, 36 insertions(+), 30 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 30d6800..36d9077 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -594,6 +594,41 @@ static int spapr_populate_memory(sPAPRMachineState *spapr, 
void *fdt)
 return 0;
 }
 
+/* Populate the "ibm,pa-features" property */
+static void spapr_populate_pa_features(CPUPPCState *env, void *fdt, int offset)
+{
+uint8_t pa_features_206[] = { 6, 0,
+0xf6, 0x1f, 0xc7, 0x00, 0x80, 0xc0 };
+uint8_t pa_features_207[] = { 24, 0,
+0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0,
+0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
+0x80, 0x00, 0x80, 0x00, 0x80, 0x00 };
+uint8_t *pa_features;
+size_t pa_size;
+
+if (env->mmu_model == POWERPC_MMU_2_06) {
+pa_features = pa_features_206;
+pa_size = sizeof(pa_features_206);
+} else { /* env->mmu_model == POWERPC_MMU_2_07 */
+pa_features = pa_features_207;
+pa_size = sizeof(pa_features_207);
+}
+
+if (env->ci_large_pages) {
+/*
+ * Note: we keep CI large pages off by default because a 64K capable
+ * guest provisioned with large pages might otherwise try to map a qemu
+ * framebuffer (or other kind of memory mapped PCI BAR) using 64K pages
+ * even if that qemu runs on a 4k host.
+ * We add this bit back here if we are confident this is not an issue
+ */
+pa_features[3] |= 0x20;
+}
+
+_FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
+}
+
 static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset,
   sPAPRMachineState *spapr)
 {
@@ -621,24 +656,6 @@ static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, 
int offset,
 _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index)));
 }
 
-/* Note: we keep CI large pages off for now because a 64K capable guest
- * provisioned with large pages might otherwise try to map a qemu
- * framebuffer (or other kind of memory mapped PCI BAR) using 64K pages
- * even if that qemu runs on a 4k host.
- *
- * We can later add this bit back when we are confident this is not
- * an issue (!HV KVM or 64K host)
- */
-uint8_t pa_features_206[] = { 6, 0,
-0xf6, 0x1f, 0xc7, 0x00, 0x80, 0xc0 };
-uint8_t pa_features_207[] = { 24, 0,
-0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0,
-0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-0x80, 0x00, 0x80, 0x00, 0x80, 0x00 };
-uint8_t *pa_features;
-size_t pa_size;
-
 _FDT((fdt_setprop_cell(fdt, offset, "reg", index)));
 _FDT((fdt_setprop_string(fdt, offset, "device_type", "cpu")));
 
@@ -705,18 +722,7 @@ static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, 
int offset,
   page_sizes_prop, page_sizes_prop_size)));
 }
 
-/* Do the ibm,pa-features property, adjust it for ci-large-pages */
-if (env->mmu_model == POWERPC_MMU_2_06) {
-pa_features = pa_features_206;
-pa_size = sizeof(pa_features_206);
-} else /* env->mmu_model == POWERPC_MMU_2_07 */ {
-pa_features = pa_features_207;
-pa_size = sizeof(pa_features_207);
-}
-if (env->ci_large_pages) {
-pa_features[3] |= 0x20;
-}
-_FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
+spapr_populate_pa_features(env, fdt, offset);
 
 _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id",
cs->cpu_index / vcpus_per_socket)));
-- 
2.7.4




Re: [Qemu-devel] [PATCH v1 3/3] target-ppc: implement xxbr[qdwh] instruction

2016-10-12 Thread David Gibson
On Wed, Oct 12, 2016 at 10:38:53AM +0530, Nikunj A Dadhania wrote:
> Add required helpers (GEN_XX2FORM_EO) for supporting this instruction.
> 
> xxbrh: VSX Vector Byte-Reverse Halfword
> xxbrw: VSX Vector Byte-Reverse Word
> xxbrd: VSX Vector Byte-Reverse Doubleword
> xxbrq: VSX Vector Byte-Reverse Quadword
> 
> Signed-off-by: Nikunj A Dadhania 
> ---
>  target-ppc/translate.c  | 32 +++
>  target-ppc/translate/vsx-impl.inc.c | 77 
> +
>  target-ppc/translate/vsx-ops.inc.c  |  8 
>  3 files changed, 117 insertions(+)
> 
> diff --git a/target-ppc/translate.c b/target-ppc/translate.c
> index dab8f19..94989b2 100644
> --- a/target-ppc/translate.c
> +++ b/target-ppc/translate.c
> @@ -376,6 +376,9 @@ GEN_OPCODE2(name, onam, opc1, opc2, opc3, inval, type, 
> type2)
>  #define GEN_HANDLER_E_2(name, opc1, opc2, opc3, opc4, inval, type, type2)
>  \
>  GEN_OPCODE3(name, opc1, opc2, opc3, opc4, inval, type, type2)
>  
> +#define GEN_HANDLER2_E_2(name, onam, opc1, opc2, opc3, opc4, inval, typ, 
> typ2) \
> +GEN_OPCODE4(name, onam, opc1, opc2, opc3, opc4, inval, typ, typ2)
> +
>  typedef struct opcode_t {
>  unsigned char opc1, opc2, opc3, opc4;
>  #if HOST_LONG_BITS == 64 /* Explicitly align to 64 bits */
> @@ -662,6 +665,21 @@ EXTRACT_HELPER(IMM8, 11, 8);
>  },   
>  \
>  .oname = stringify(name),
>  \
>  }
> +#define GEN_OPCODE4(name, onam, op1, op2, op3, op4, invl, _typ, _typ2)   
>  \
> +{
>  \
> +.opc1 = op1, 
>  \
> +.opc2 = op2, 
>  \
> +.opc3 = op3, 
>  \
> +.opc4 = op4, 
>  \
> +.handler = { 
>  \
> +.inval1  = invl, 
>  \
> +.type = _typ,
>  \
> +.type2 = _typ2,  
>  \
> +.handler = _##name,  
>  \
> +.oname = onam,   
>  \
> +},   
>  \
> +.oname = onam,   
>  \
> +}
>  #else
>  #define GEN_OPCODE(name, op1, op2, op3, invl, _typ, _typ2)   
>  \
>  {
>  \
> @@ -720,6 +738,20 @@ EXTRACT_HELPER(IMM8, 11, 8);
>  },   
>  \
>  .oname = stringify(name),
>  \
>  }
> +#define GEN_OPCODE4(name, onam, op1, op2, op3, op4, invl, _typ, _typ2)   
>  \
> +{
>  \
> +.opc1 = op1, 
>  \
> +.opc2 = op2, 
>  \
> +.opc3 = op3, 
>  \
> +.opc4 = op4, 
>  \
> +.handler = { 
>  \
> +.inval1  = invl, 
>  \
> +.type = _typ,
>  \
> +.type2 = _typ2,  
>  \
> +.handler = _##name,  
>  \
> +},   
>  \
> +.oname = onam,   
>  \
> +}
>  #endif
>  
>  /* SPR load/store helpers */
> diff --git a/target-ppc/translate/vsx-impl.inc.c 
> b/target-ppc/translate/vsx-impl.inc.c
> index 23ec1e1..52af5c1 100644
> --- a/target-ppc/translate/vsx-impl.inc.c
> +++ b/target-ppc/translate/vsx-impl.inc.c
> @@ -132,6 +132,22 @@ static void gen_bswap16x8(TCGv_i64 outh, TCGv_i64 outl,
>  tcg_temp_free_i64(mask);
>  }
>  
> +static void gen_bswap32x4(TCGv_i64 outh, TCGv_i64 outl,
> +  TCGv_i64 inh, TCGv_i64 inl)
> +{
> +TCGv_i64 hi = tcg_temp_new_i64();
> +TCGv_i64 lo = tcg_temp_new_i64();
> +
> +tcg_gen_bswap64_i64(hi, inh);
> +tcg_gen_bswap64_i64(lo, inl);
> +tcg_gen_shri_i64(outh, hi, 32);
> +

Re: [Qemu-devel] [PATCH v1 2/3] target-ppc: implement vnegw/d instructions

2016-10-12 Thread David Gibson
On Wed, Oct 12, 2016 at 10:38:52AM +0530, Nikunj A Dadhania wrote:
> Vector Integer Negate Instructions:
> 
> vnegw: Vector Negate Word
> vnegd: Vector Negate Doubleword
> 
> Signed-off-by: Nikunj A Dadhania 
> ---
>  target-ppc/helper.h |  2 ++
>  target-ppc/int_helper.c | 12 
>  target-ppc/translate/vmx-impl.inc.c |  2 ++
>  target-ppc/translate/vmx-ops.inc.c  |  2 ++
>  4 files changed, 18 insertions(+)
> 
> diff --git a/target-ppc/helper.h b/target-ppc/helper.h
> index 04c6421..5fcc546 100644
> --- a/target-ppc/helper.h
> +++ b/target-ppc/helper.h
> @@ -272,6 +272,8 @@ DEF_HELPER_2(vextsh2w, void, avr, avr)
>  DEF_HELPER_2(vextsb2d, void, avr, avr)
>  DEF_HELPER_2(vextsh2d, void, avr, avr)
>  DEF_HELPER_2(vextsw2d, void, avr, avr)
> +DEF_HELPER_2(vnegw, void, avr, avr)
> +DEF_HELPER_2(vnegd, void, avr, avr)
>  DEF_HELPER_2(vupkhpx, void, avr, avr)
>  DEF_HELPER_2(vupklpx, void, avr, avr)
>  DEF_HELPER_2(vupkhsb, void, avr, avr)
> diff --git a/target-ppc/int_helper.c b/target-ppc/int_helper.c
> index 5aee0a8..7446e4e 100644
> --- a/target-ppc/int_helper.c
> +++ b/target-ppc/int_helper.c
> @@ -1949,6 +1949,18 @@ VEXT_SIGNED(vextsh2d, s64, UINT16_MAX, int16_t, 
> int64_t)
>  VEXT_SIGNED(vextsw2d, s64, UINT32_MAX, int32_t, int64_t)
>  #undef VEXT_SIGNED
>  
> +#define VNEG(name, element, mask)   \

The mask parameter appears to be unused.

> +void helper_##name(ppc_avr_t *r, ppc_avr_t *b)  \
> +{   \
> +int i;  \
> +VECTOR_FOR_INORDER_I(i, element) {  \
> +r->element[i] = -b->element[i]; \
> +}   \
> +}
> +VNEG(vnegw, s32, UINT32_MAX)
> +VNEG(vnegd, s64, UINT64_MAX)
> +#undef VNEG
> +
>  #define VSPLTI(suffix, element, splat_type) \
>  void helper_vspltis##suffix(ppc_avr_t *r, uint32_t splat)   \
>  {   \
> diff --git a/target-ppc/translate/vmx-impl.inc.c 
> b/target-ppc/translate/vmx-impl.inc.c
> index c8998f3..563f101 100644
> --- a/target-ppc/translate/vmx-impl.inc.c
> +++ b/target-ppc/translate/vmx-impl.inc.c
> @@ -815,6 +815,8 @@ GEN_VXFORM_NOA(vclzb, 1, 28)
>  GEN_VXFORM_NOA(vclzh, 1, 29)
>  GEN_VXFORM_NOA(vclzw, 1, 30)
>  GEN_VXFORM_NOA(vclzd, 1, 31)
> +GEN_VXFORM_NOA_2(vnegw, 1, 24, 6)
> +GEN_VXFORM_NOA_2(vnegd, 1, 24, 7)
>  GEN_VXFORM_NOA_2(vextsb2w, 1, 24, 16)
>  GEN_VXFORM_NOA_2(vextsh2w, 1, 24, 17)
>  GEN_VXFORM_NOA_2(vextsb2d, 1, 24, 24)
> diff --git a/target-ppc/translate/vmx-ops.inc.c 
> b/target-ppc/translate/vmx-ops.inc.c
> index 68cba3e..ab64ab2 100644
> --- a/target-ppc/translate/vmx-ops.inc.c
> +++ b/target-ppc/translate/vmx-ops.inc.c
> @@ -215,6 +215,8 @@ GEN_VXFORM_DUAL_INV(vspltish, vinserth, 6, 13, 
> 0x, 0x10,
>  GEN_VXFORM_DUAL_INV(vspltisw, vinsertw, 6, 14, 0x, 0x10,
> PPC_ALTIVEC),
>  GEN_VXFORM_300_EXT(vinsertd, 6, 15, 0x10),
> +GEN_VXFORM_300_EO(vnegw, 0x01, 0x18, 0x06),
> +GEN_VXFORM_300_EO(vnegd, 0x01, 0x18, 0x07),
>  GEN_VXFORM_300_EO(vextsb2w, 0x01, 0x18, 0x10),
>  GEN_VXFORM_300_EO(vextsh2w, 0x01, 0x18, 0x11),
>  GEN_VXFORM_300_EO(vextsb2d, 0x01, 0x18, 0x18),

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH v4 07/20] ppc/pnv: add XSCOM handlers to PnvCore

2016-10-12 Thread David Gibson
On Mon, Oct 03, 2016 at 09:24:43AM +0200, Cédric Le Goater wrote:
> Now that we are using real HW ids for the cores in PowerNV chips, we
> can route the XSCOM accesses to them. We just need to attach a
> specific XSCOM memory region to each core in the appropriate window
> for the core number.
> 
> To start with, let's install the DTS (Digital Thermal Sensor) handlers
> which should return 38°C for each core.
> 
> Signed-off-by: Cédric Le Goater 
> ---
> 
>  Changes since v3:
> 
>  - moved to new XSCOM model
>  - kept the write op on the XSCOM memory region for later use
> 
>  Changes since v2:
> 
>  - added a XSCOM memory region to handle access to the EX core
>registers   
>  - extended the PnvCore object with a XSCOM_INTERFACE so that we can
>use pnv_xscom_pcba() and pnv_xscom_addr() to handle XSCOM address
>translation.
> 
>  hw/ppc/pnv.c   |  4 
>  hw/ppc/pnv_core.c  | 50 
> ++
>  include/hw/ppc/pnv_core.h  |  2 ++
>  include/hw/ppc/pnv_xscom.h | 19 ++
>  4 files changed, 75 insertions(+)
> 
> diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
> index 5e19b6880387..ffe245fe59d2 100644
> --- a/hw/ppc/pnv.c
> +++ b/hw/ppc/pnv.c
> @@ -620,6 +620,10 @@ static void pnv_chip_realize(DeviceState *dev, Error 
> **errp)
>   _fatal);
>  object_unref(OBJECT(pnv_core));
>  i++;
> +
> +memory_region_add_subregion(>xscom,
> + PNV_XSCOM_EX_CORE_BASE(core_hwid) << 3,
> + _CORE(pnv_core)->xscom_regs);

Might be worth adding some convenience functions for doing the various
bits of xscom MR juggling, otherwise this looks fine.

>  }
>  g_free(typename);
>  }
> diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
> index d37788f142f4..a1c8a14f06b6 100644
> --- a/hw/ppc/pnv_core.c
> +++ b/hw/ppc/pnv_core.c
> @@ -19,6 +19,7 @@
>  #include "qemu/osdep.h"
>  #include "sysemu/sysemu.h"
>  #include "qapi/error.h"
> +#include "qemu/log.h"
>  #include "target-ppc/cpu.h"
>  #include "hw/ppc/ppc.h"
>  #include "hw/ppc/pnv.h"
> @@ -64,6 +65,51 @@ static void powernv_cpu_init(PowerPCCPU *cpu, Error **errp)
>  powernv_cpu_reset(cpu);
>  }
>  
> +/*
> + * These values are read by the PowerNV HW monitors under Linux
> + */
> +#define PNV_XSCOM_EX_DTS_RESULT0 0x5
> +#define PNV_XSCOM_EX_DTS_RESULT1 0x50001
> +
> +static uint64_t pnv_core_xscom_read(void *opaque, hwaddr addr,
> +unsigned int width)
> +{
> +uint32_t offset = addr >> 3;
> +uint64_t val = 0;
> +
> +/* The result should be 38 C */
> +switch (offset) {
> +case PNV_XSCOM_EX_DTS_RESULT0:
> +val = 0x26f024f023full;
> +break;
> +case PNV_XSCOM_EX_DTS_RESULT1:
> +val = 0x24full;
> +break;
> +default:
> +qemu_log_mask(LOG_UNIMP, "Warning: reading reg=0x%" HWADDR_PRIx,
> +  addr);
> +}
> +
> +return val;
> +}
> +
> +static void pnv_core_xscom_write(void *opaque, hwaddr addr, uint64_t val,
> + unsigned int width)
> +{
> +qemu_log_mask(LOG_UNIMP, "Warning: writing to reg=0x%" HWADDR_PRIx,
> +  addr);
> +}
> +
> +static const MemoryRegionOps pnv_core_xscom_ops = {
> +.read = pnv_core_xscom_read,
> +.write = pnv_core_xscom_write,
> +.valid.min_access_size = 8,
> +.valid.max_access_size = 8,
> +.impl.min_access_size = 8,
> +.impl.max_access_size = 8,
> +.endianness = DEVICE_BIG_ENDIAN,
> +};
> +
>  static void pnv_core_realize_child(Object *child, Error **errp)
>  {
>  Error *local_err = NULL;
> @@ -119,6 +165,10 @@ static void pnv_core_realize(DeviceState *dev, Error 
> **errp)
>  goto err;
>  }
>  }
> +
> +snprintf(name, sizeof(name), "xscom-core.%d", cc->core_id);
> +memory_region_init_io(>xscom_regs, OBJECT(dev), _core_xscom_ops,
> +  pc, name,  PNV_XSCOM_EX_CORE_SIZE << 3);
>  return;
>  
>  err:
> diff --git a/include/hw/ppc/pnv_core.h b/include/hw/ppc/pnv_core.h
> index a151e281c017..2955a41c901f 100644
> --- a/include/hw/ppc/pnv_core.h
> +++ b/include/hw/ppc/pnv_core.h
> @@ -36,6 +36,8 @@ typedef struct PnvCore {
>  /*< public >*/
>  void *threads;
>  uint32_t pir;
> +
> +MemoryRegion xscom_regs;
>  } PnvCore;
>  
>  typedef struct PnvCoreClass {
> diff --git a/include/hw/ppc/pnv_xscom.h b/include/hw/ppc/pnv_xscom.h
> index f50eb0bc4099..79975a6cbe46 100644
> --- a/include/hw/ppc/pnv_xscom.h
> +++ b/include/hw/ppc/pnv_xscom.h
> @@ -41,6 +41,25 @@ typedef struct PnvXScomInterfaceClass {
>  int (*populate)(PnvXScomInterface *dev, void *fdt, int offset);
>  } PnvXScomInterfaceClass;
>  
> +/*
> + * Layout of the XSCOM PCB addresses of EX core 1
> + *
> + *   GPIO0x1100
> + *   SCOM0x1101
> + *   OHA 0x1102
> + *   CLOCK 

Re: [Qemu-devel] [PATCH v1 1/3] target-ppc: implement vexts[bh]2w and vexts[bhw]2d

2016-10-12 Thread David Gibson
On Wed, Oct 12, 2016 at 10:38:51AM +0530, Nikunj A Dadhania wrote:
> Vector Extend Sign Instructions:
> 
> vextsb2w: Vector Extend Sign Byte To Word
> vextsh2w: Vector Extend Sign Halfword To Word
> vextsb2d: Vector Extend Sign Byte To Doubleword
> vextsh2d: Vector Extend Sign Halfword To Doubleword
> vextsw2d: Vector Extend Sign Word To Doubleword
> 
> Signed-off-by: Nikunj A Dadhania 

Applied to ppc-for-2.8, thanks.

> ---
>  target-ppc/helper.h |  5 +
>  target-ppc/int_helper.c | 15 +++
>  target-ppc/translate/vmx-impl.inc.c |  5 +
>  target-ppc/translate/vmx-ops.inc.c  |  5 +
>  4 files changed, 30 insertions(+)
> 
> diff --git a/target-ppc/helper.h b/target-ppc/helper.h
> index 796ad45..04c6421 100644
> --- a/target-ppc/helper.h
> +++ b/target-ppc/helper.h
> @@ -267,6 +267,11 @@ DEF_HELPER_3(vinsertb, void, avr, avr, i32)
>  DEF_HELPER_3(vinserth, void, avr, avr, i32)
>  DEF_HELPER_3(vinsertw, void, avr, avr, i32)
>  DEF_HELPER_3(vinsertd, void, avr, avr, i32)
> +DEF_HELPER_2(vextsb2w, void, avr, avr)
> +DEF_HELPER_2(vextsh2w, void, avr, avr)
> +DEF_HELPER_2(vextsb2d, void, avr, avr)
> +DEF_HELPER_2(vextsh2d, void, avr, avr)
> +DEF_HELPER_2(vextsw2d, void, avr, avr)
>  DEF_HELPER_2(vupkhpx, void, avr, avr)
>  DEF_HELPER_2(vupklpx, void, avr, avr)
>  DEF_HELPER_2(vupkhsb, void, avr, avr)
> diff --git a/target-ppc/int_helper.c b/target-ppc/int_helper.c
> index 202854f..5aee0a8 100644
> --- a/target-ppc/int_helper.c
> +++ b/target-ppc/int_helper.c
> @@ -1934,6 +1934,21 @@ VEXTRACT(uw, u32)
>  VEXTRACT(d, u64)
>  #undef VEXTRACT
>  
> +#define VEXT_SIGNED(name, element, mask, cast, recast)  \
> +void helper_##name(ppc_avr_t *r, ppc_avr_t *b)  \
> +{   \
> +int i;  \
> +VECTOR_FOR_INORDER_I(i, element) {  \
> +r->element[i] = (recast)((cast)(b->element[i] & mask)); \
> +}   \
> +}
> +VEXT_SIGNED(vextsb2w, s32, UINT8_MAX, int8_t, int32_t)
> +VEXT_SIGNED(vextsb2d, s64, UINT8_MAX, int8_t, int64_t)
> +VEXT_SIGNED(vextsh2w, s32, UINT16_MAX, int16_t, int32_t)
> +VEXT_SIGNED(vextsh2d, s64, UINT16_MAX, int16_t, int64_t)
> +VEXT_SIGNED(vextsw2d, s64, UINT32_MAX, int32_t, int64_t)
> +#undef VEXT_SIGNED
> +
>  #define VSPLTI(suffix, element, splat_type) \
>  void helper_vspltis##suffix(ppc_avr_t *r, uint32_t splat)   \
>  {   \
> diff --git a/target-ppc/translate/vmx-impl.inc.c 
> b/target-ppc/translate/vmx-impl.inc.c
> index 25cd073..c8998f3 100644
> --- a/target-ppc/translate/vmx-impl.inc.c
> +++ b/target-ppc/translate/vmx-impl.inc.c
> @@ -815,6 +815,11 @@ GEN_VXFORM_NOA(vclzb, 1, 28)
>  GEN_VXFORM_NOA(vclzh, 1, 29)
>  GEN_VXFORM_NOA(vclzw, 1, 30)
>  GEN_VXFORM_NOA(vclzd, 1, 31)
> +GEN_VXFORM_NOA_2(vextsb2w, 1, 24, 16)
> +GEN_VXFORM_NOA_2(vextsh2w, 1, 24, 17)
> +GEN_VXFORM_NOA_2(vextsb2d, 1, 24, 24)
> +GEN_VXFORM_NOA_2(vextsh2d, 1, 24, 25)
> +GEN_VXFORM_NOA_2(vextsw2d, 1, 24, 26)
>  GEN_VXFORM_NOA_2(vctzb, 1, 24, 28)
>  GEN_VXFORM_NOA_2(vctzh, 1, 24, 29)
>  GEN_VXFORM_NOA_2(vctzw, 1, 24, 30)
> diff --git a/target-ppc/translate/vmx-ops.inc.c 
> b/target-ppc/translate/vmx-ops.inc.c
> index ac1dc9b..68cba3e 100644
> --- a/target-ppc/translate/vmx-ops.inc.c
> +++ b/target-ppc/translate/vmx-ops.inc.c
> @@ -215,6 +215,11 @@ GEN_VXFORM_DUAL_INV(vspltish, vinserth, 6, 13, 
> 0x, 0x10,
>  GEN_VXFORM_DUAL_INV(vspltisw, vinsertw, 6, 14, 0x, 0x10,
> PPC_ALTIVEC),
>  GEN_VXFORM_300_EXT(vinsertd, 6, 15, 0x10),
> +GEN_VXFORM_300_EO(vextsb2w, 0x01, 0x18, 0x10),
> +GEN_VXFORM_300_EO(vextsh2w, 0x01, 0x18, 0x11),
> +GEN_VXFORM_300_EO(vextsb2d, 0x01, 0x18, 0x18),
> +GEN_VXFORM_300_EO(vextsh2d, 0x01, 0x18, 0x19),
> +GEN_VXFORM_300_EO(vextsw2d, 0x01, 0x18, 0x1A),
>  GEN_VXFORM_300_EO(vctzb, 0x01, 0x18, 0x1C),
>  GEN_VXFORM_300_EO(vctzh, 0x01, 0x18, 0x1D),
>  GEN_VXFORM_300_EO(vctzw, 0x01, 0x18, 0x1E),

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [Qemu-devel] Async savevm using userfaultfd(2)

2016-10-12 Thread Stefan Hajnoczi
On Wed, Oct 12, 2016 at 4:04 PM, Stefan Hajnoczi  wrote:
> Perhaps this approach can be prototyped with mprotect and a SIGSEGV
> handler if anyone wants to get async savevm going.  I don't know if
> there are any disadvantages to mprotecting guest RAM that the kvm kernel
> module is using.  Hopefully in-kernel devices and vhost will continue to
> work.

I woke up this morning with a strong feeling that a SIGSEGV handler
won't work with vhost.  The problem is that the QEMU process' SIGSEGV
handler won't be called when the vhost kernel thread faults.  Now I'm
wondering whether userfaultfd will work together with vhost.

Stefan



Re: [Qemu-devel] [PATCH 02/15] xen: Fix coding style warnings

2016-10-12 Thread Emil Condrea
On Tue, Oct 11, 2016 at 5:20 PM, Anthony PERARD
 wrote:
> On Tue, Oct 04, 2016 at 09:43:31AM +0300, Emil Condrea wrote:
>> Fixes:
>>  * WARNING: line over 80 characters
>>
>> Signed-off-by: Emil Condrea 
>> ---
>>  hw/block/xen_disk.c  |  3 ++-
>>  hw/char/xen_console.c|  6 --
>>  hw/display/xenfb.c   | 30 --
>>  hw/net/xen_nic.c | 12 
>>  hw/xen/xen_backend.c | 15 ++-
>>  include/hw/xen/xen_backend.h |  8 +---
>>  6 files changed, 49 insertions(+), 25 deletions(-)
>>
>> diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
>> index 5aa350a..24edeb2 100644
>> --- a/hw/block/xen_disk.c
>> +++ b/hw/block/xen_disk.c
>> @@ -1068,7 +1068,8 @@ static int blk_connect(struct XenDevice *xendev)
>>  blk_set_enable_write_cache(blkdev->blk, !writethrough);
>>  } else {
>>  /* setup via qemu cmdline -> already setup for us */
>> -xen_be_printf(>xendev, 2, "get configured bdrv (cmdline 
>> setup)\n");
>> +xen_be_printf(>xendev, 2,
>> + "get configured bdrv (cmdline setup)\n");
>
> Arguments are usually aligned with the first one, so there is one
> missing space.

I guess this is displayed wrongly in your email client, as it is in mine, but
the source of the email contains this patch: http://pastebin.com/Sbk23h6m, which
shows that this line is aligned with the first parameter.

>
>>  blkdev->blk = blk_by_legacy_dinfo(blkdev->dinfo);
>>  if (blk_is_read_only(blkdev->blk) && !readonly) {
>>  xen_be_printf(>xendev, 0, "Unexpected read-only drive");
>> diff --git a/hw/char/xen_console.c b/hw/char/xen_console.c
>> index 4e35c82..399bb5d 100644
>> --- a/hw/char/xen_console.c
>> +++ b/hw/char/xen_console.c
>> @@ -156,7 +156,8 @@ static void xencons_send(struct XenConsole *con)
>>  if (len < 1) {
>>  if (!con->backlog) {
>>  con->backlog = 1;
>> -xen_be_printf(>xendev, 1, "backlog piling up, nobody 
>> listening?\n");
>> +xen_be_printf(>xendev, 1,
>> + "backlog piling up, nobody listening?\n");
>
> same here and other places.

same as above

>
>>  }
>>  } else {
>>  buffer_advance(>buffer, len);
>> @@ -247,7 +248,8 @@ static int con_initialise(struct XenDevice *xendev)
>>  }
>>  }
>>
>> -xen_be_printf(xendev, 1, "ring mfn %d, remote port %d, local port %d, 
>> limit %zd\n",
>> +xen_be_printf(xendev, 1,
>> + "ring mfn %d, remote port %d, local port %d, limit %zd\n",
>> con->ring_ref,
>> con->xendev.remote_port,
>> con->xendev.local_port,
>> diff --git a/hw/xen/xen_backend.c b/hw/xen/xen_backend.c
>> index 545ee47..0aca6ae 100644
>> --- a/hw/xen/xen_backend.c
>> +++ b/hw/xen/xen_backend.c
>> @@ -205,7 +206,8 @@ int xenstore_read_fe_int(struct XenDevice *xendev, const 
>> char *node, int *ival)
>>  return xenstore_read_int(xendev->fe, node, ival);
>>  }
>>
>> -int xenstore_read_fe_uint64(struct XenDevice *xendev, const char *node, 
>> uint64_t *uval)
>> +int xenstore_read_fe_uint64(struct XenDevice *xendev, const char *node,
>> +uint64_t *uval)
>
> Same here, it would be better to align the second line with the first
> parameter.

This indeed should be fixed. I will send a new patch for it.

>
>>  {
>>  return xenstore_read_uint64(xendev->fe, node, uval);
>>  }
>
> --
> Anthony PERARD



Re: [Qemu-devel] [PATCH 01/15] xen: Fix coding style errors

2016-10-12 Thread Emil Condrea
Actually, I've split the coding style fixes into 2 patches: one for errors and
one for warnings. In this patch, some changes resolve the error
"code indent should never use tabs", but if the same line also triggers a
warning about exceeding 80 characters, that will be fixed in the
"Fix coding style warnings" patch.

On Tue, Oct 11, 2016 at 4:56 PM, Anthony PERARD
 wrote:
> On Tue, Oct 04, 2016 at 09:43:30AM +0300, Emil Condrea wrote:
>> Fixes the following errors:
>>  * ERROR: line over 90 characters
>
> It's 80 ;), and there are a few more left in this patch.
>
>>  * ERROR: code indent should never use tabs
>>  * ERROR: space prohibited after that open square bracket '['
>>  * ERROR: do not initialise statics to 0 or NULL
>>  * ERROR: "(foo*)" should be "(foo *)"
>>
>> Signed-off-by: Emil Condrea 
>
> --
> Anthony PERARD



Re: [Qemu-devel] [PATCH 1/2] 9pfs: fix information leak in xattr read

2016-10-12 Thread Li Qiang
Yes, I think the limit to apply to xattr size in 9pfs is the same as the
Linux xattr size limit, I will try to find this limit.

Thanks.

On 2016-10-13 4:49 GMT+08:00 Eric Blake  wrote:

> On 10/12/2016 08:23 AM, Greg Kurz wrote:
> >
> > But in fact, I'm afraid we have a more serious problem here... size
> > comes from the guest and could cause g_malloc() to abort if QEMU has
> > reached some RLIMIT... we need to call g_try_malloc0() and return
> > ENOMEM if the allocation fails.
>
> Even if it does not cause an ENOMEM failure right away, the guest can
> also use this to chew up lots of host resources. It may also be worth
> putting a reasonable cap at the maximum the guest can allocate, rather
> than just trying to malloc every possible size.
>
> --
> Eric Blake   eblake redhat com+1-919-301-3266
> Libvirt virtualization library http://libvirt.org
>
>


Re: [Qemu-devel] [PATCH v8 4/6] docs: Add Documentation for Mediated devices

2016-10-12 Thread Tian, Kevin
> From: Kirti Wankhede [mailto:kwankh...@nvidia.com]
> Sent: Thursday, October 13, 2016 3:03 AM
> 
> 
> On 10/12/2016 9:29 PM, Alex Williamson wrote:
> > On Wed, 12 Oct 2016 20:43:48 +0530
> > Kirti Wankhede  wrote:
> >
> >> On 10/12/2016 7:22 AM, Tian, Kevin wrote:
>  From: Kirti Wankhede [mailto:kwankh...@nvidia.com]
>  Sent: Wednesday, October 12, 2016 4:45 AM
> >> +* mdev_supported_types:
> >> +List of current supported mediated device types and its details 
> >> are added
> >> +in this directory in following format:
> >> +
> >> +|- 
> >> +|--- Vendor-specific-attributes [optional]
> >> +|--- mdev_supported_types
> >> +| |--- 
> >> +| |   |--- create
> >> +| |   |--- name
> >> +| |   |--- available_instances
> >> +| |   |--- description /class
> >> +| |   |--- [devices]
> >> +| |--- 
> >> +| |   |--- create
> >> +| |   |--- name
> >> +| |   |--- available_instances
> >> +| |   |--- description /class
> >> +| |   |--- [devices]
> >> +| |--- 
> >> +|  |--- create
> >> +|  |--- name
> >> +|  |--- available_instances
> >> +|  |--- description /class
> >> +|  |--- [devices]
> >> +
> >> +[TBD : description or class is yet to be decided. This will change.]
> >
> > I thought that in previous discussions we had agreed to drop
> > the  concept and use the name as the unique identifier.
> > When reporting these types in libvirt we won't want to report
> > the type id values - we'll want the name strings to be unique.
> >
> 
>  The 'name' might not be unique but type_id will be. For example that Neo
>  pointed out in earlier discussion, virtual devices can come from two
>  different physical devices, end user would be presented with what they
>  had selected but there will be internal implementation differences. In
>  that case 'type_id' will be unique.
> 
> >>>
> >>> Hi, Kirti, my understanding is that Neo agreed to use an unique type
> >>> string (if you still called it ), and then no need of additional
> >>> 'name' field which can be put inside 'description' field. See below quote:
> >>>
> >>
> >> We had internal discussions about this within NVIDIA and found that
> >> 'name' might not be unique whereas 'type_id' would be unique. I'm
> >> referring to Neo's mail after that, where Neo did point that out.
> >>
> >> https://lists.gnu.org/archive/html/qemu-devel/2016-09/msg07714.html
> >
> > Everyone not privy to those internal discussions, including me, seems to
> > think we dropped type_id and that if a vendor does not have a stable
> > name, they can compose some sort of stable type description based on the
> > name+id, or even vendor+id, ex. NVIDIA-11.  So please share why we
> > haven't managed to kill off type_id yet.  No matter what internal
> > representation each vendor driver has of "type_id" it seems possible
> > for it to come up with stable string to define a given configuration.
> 
> 
> The 'type_id' is unique and the 'name' is not; the name is just a
> virtual device name / human-readable name. Because at this moment Intel
> can't define a proper GPU class, we have to add a 'description' field
> there as well to represent the features of this virtual device, once we
> have all agreed with the GPU class and its mandatory attributes, the
> 'description' field can be removed. Here is an example,
> type_id/type_name = NVIDIA_11,
> name=M60-M0Q,
> description=2560x1600, 2 displays, 512MB"

As I commented earlier, I didn't see how above attributes can be defined
mandatory:

- #displays is a concern only for VDI usage, where a remote user may
care about how many virtual displays they can use. What about using
vGPU in non-VDI usage, e.g. a pure media transcoding case where
#displays means nothing? Then for media transcoding do we want to
further introduce attributes like H.265?

- framebuffer size (512MB) might make sense for a discrete card like
NVIDIA's. In your case the graphics memory is on-card, so the memory
size is critical to performance and the user might want to know it. However,
for an integrated card like Intel's, we just use system memory as 'virtual'
graphics memory through GPU page tables. There is one global GPU page table
(GGTT) partitioned between vGPUs, but the majority of rendering happens
on per-process GPU page tables (PPGTT), which can be fully managed by
each VM. In this sense, the size of the GGTT resource has little performance
implication (mostly an indirect functional one, such as #displays). The user
cannot form a clear expectation from it, so we don't plan to expose it.

> 
> Neo's previous comment only applies to the situation where we will have
> the GPU class or optional attributes defined and recognized by libvirt,
> since that is not going to happen any time soon, we will have to have
> the new 

[Qemu-devel] [PATCH] 9pfs: add xattrwalk_fid field in V9fsFidState struct

2016-10-12 Thread Li Qiang
From: Li Qiang 

Currently, 9pfs sets the fs.xattr.copied_len field in V9fsFidState
to -1 to indicate an xattr walk fid. As fs.xattr.copied_len is also
used to account for copied bytes, this may cause confusion. This patch
adds a bool field to represent the xattr walk fid.

Signed-off-by: Li Qiang 
---
 hw/9pfs/9p.c | 7 ---
 hw/9pfs/9p.h | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 8c7488f..9625296 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -325,7 +325,7 @@ static int v9fs_xattr_fid_clunk(V9fsPDU *pdu, V9fsFidState 
*fidp)
 {
 int retval = 0;
 
-if (fidp->fs.xattr.copied_len == -1) {
+if (fidp->xattrwalk_fid) {
 /* getxattr/listxattr fid */
 goto free_value;
 }
@@ -3181,7 +3181,7 @@ static void v9fs_xattrwalk(void *opaque)
  */
 xattr_fidp->fs.xattr.len = size;
 xattr_fidp->fid_type = P9_FID_XATTR;
-xattr_fidp->fs.xattr.copied_len = -1;
+xattr_fidp->xattrwalk_fid  = true;
 if (size) {
 xattr_fidp->fs.xattr.value = g_malloc(size);
 err = v9fs_co_llistxattr(pdu, _fidp->path,
@@ -3214,7 +3214,7 @@ static void v9fs_xattrwalk(void *opaque)
  */
 xattr_fidp->fs.xattr.len = size;
 xattr_fidp->fid_type = P9_FID_XATTR;
-xattr_fidp->fs.xattr.copied_len = -1;
+xattr_fidp->xattrwalk_fid  = true;
 if (size) {
 xattr_fidp->fs.xattr.value = g_malloc(size);
 err = v9fs_co_lgetxattr(pdu, _fidp->path,
@@ -3269,6 +3269,7 @@ static void v9fs_xattrcreate(void *opaque)
 /* Make the file fid point to xattr */
 xattr_fidp = file_fidp;
 xattr_fidp->fid_type = P9_FID_XATTR;
+xattr_fidp->xattrwalk_fid  = false;
 xattr_fidp->fs.xattr.copied_len = 0;
 xattr_fidp->fs.xattr.len = size;
 xattr_fidp->fs.xattr.flags = flags;
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 22198f6..7e1e70b 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -214,6 +214,7 @@ struct V9fsFidState
 uid_t uid;
 int ref;
 int clunked;
+bool xattrwalk_fid;
 V9fsFidState *next;
 V9fsFidState *rclm_lst;
 };
-- 
1.8.3.1




[Qemu-devel] [PATCH v2] 9pfs: fix integer overflow issue in xattr read/write

2016-10-12 Thread Li Qiang
From: Li Qiang 

The 9pfs xattr read/write functions mix signed/unsigned and
32/64-bit integers. This causes out-of-bounds read/write issues. This
patch fixes that.

Signed-off-by: Li Qiang 
---
 hw/9pfs/9p.c | 34 +-
 hw/9pfs/9p.h |  4 ++--
 2 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index e4040dc..8c7488f 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -1642,20 +1642,17 @@ static int v9fs_xattr_read(V9fsState *s, V9fsPDU *pdu, 
V9fsFidState *fidp,
 {
 ssize_t err;
 size_t offset = 7;
-int read_count;
-int64_t xattr_len;
+uint64_t read_count;
 V9fsVirtioState *v = container_of(s, V9fsVirtioState, state);
 VirtQueueElement *elem = v->elems[pdu->idx];
 
-xattr_len = fidp->fs.xattr.len;
-read_count = xattr_len - off;
+if (fidp->fs.xattr.len < off) {
+read_count = 0;
+} else {
+   read_count = fidp->fs.xattr.len - off;
+}
 if (read_count > max_count) {
 read_count = max_count;
-} else if (read_count < 0) {
-/*
- * read beyond XATTR value
- */
-read_count = 0;
 }
 err = pdu_marshal(pdu, offset, "d", read_count);
 if (err < 0) {
@@ -1982,23 +1979,18 @@ static int v9fs_xattr_write(V9fsState *s, V9fsPDU *pdu, 
V9fsFidState *fidp,
 {
 int i, to_copy;
 ssize_t err = 0;
-int write_count;
-int64_t xattr_len;
+uint64_t write_count;
 size_t offset = 7;
 
 
-xattr_len = fidp->fs.xattr.len;
-write_count = xattr_len - off;
-if (write_count > count) {
-write_count = count;
-} else if (write_count < 0) {
-/*
- * write beyond XATTR value len specified in
- * xattrcreate
- */
+if (fidp->fs.xattr.len < off) {
 err = -ENOSPC;
 goto out;
 }
+write_count = fidp->fs.xattr.len - off;
+if (write_count > count) {
+write_count = count;
+}
 err = pdu_marshal(pdu, offset, "d", write_count);
 if (err < 0) {
 return err;
@@ -3254,7 +3246,7 @@ static void v9fs_xattrcreate(void *opaque)
 {
 int flags;
 int32_t fid;
-int64_t size;
+uint64_t size;
 ssize_t err = 0;
 V9fsString name;
 size_t offset = 7;
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index d539d2e..22198f6 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -159,8 +159,8 @@ typedef struct V9fsConf
 
 typedef struct V9fsXattr
 {
-int64_t copied_len;
-int64_t len;
+uint64_t copied_len;
+uint64_t len;
 void *value;
 V9fsString name;
 int flags;
-- 
1.8.3.1




[Qemu-devel] [PATCH] pci_aer_init: cleanup

2016-10-12 Thread Cao jin
log_max has no chance of being PCIE_AER_LOG_MAX_UNSET unless the user specifies it.

Bonus:
1. remove unnecessary local variable.
2. fix a typo.

Signed-off-by: Cao jin 
---
 hw/pci/pcie_aer.c | 10 +-
 include/hw/pci/pcie_aer.h |  2 +-
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c
index ac47f34..6cf088b 100644
--- a/hw/pci/pcie_aer.c
+++ b/hw/pci/pcie_aer.c
@@ -99,18 +99,10 @@ static void aer_log_clear_all_err(PCIEAERLog *aer_log)
 int pcie_aer_init(PCIDevice *dev, uint8_t cap_ver, uint16_t offset,
   uint16_t size)
 {
-PCIExpressDevice *exp;
-
 pcie_add_capability(dev, PCI_EXT_CAP_ID_ERR, cap_ver,
 offset, size);
-exp = >exp;
-exp->aer_cap = offset;
+dev->exp.aer_cap = offset;
 
-/* log_max is property */
-if (dev->exp.aer_log.log_max == PCIE_AER_LOG_MAX_UNSET) {
-dev->exp.aer_log.log_max = PCIE_AER_LOG_MAX_DEFAULT;
-}
-/* clip down the value to avoid unreasobale memory usage */
 if (dev->exp.aer_log.log_max > PCIE_AER_LOG_MAX_LIMIT) {
 return -EINVAL;
 }
diff --git a/include/hw/pci/pcie_aer.h b/include/hw/pci/pcie_aer.h
index c373591..31755ef 100644
--- a/include/hw/pci/pcie_aer.h
+++ b/include/hw/pci/pcie_aer.h
@@ -40,7 +40,7 @@ struct PCIEAERLog {
  * The specified value will be clipped down to PCIE_AER_LOG_MAX_LIMIT
  * to avoid unreasonable memory usage.
  * I bet that 128 log size would be big enough, otherwise too many errors
- * for system to function normaly. But could consecutive errors occur?
+ * for system to function normally. But could consecutive errors occur?
  */
 #define PCIE_AER_LOG_MAX_DEFAULT8
 #define PCIE_AER_LOG_MAX_LIMIT  128
-- 
1.8.3.1






[Qemu-devel] [PATCH] target-mips: Fix Loongson pandn instruction.

2016-10-12 Thread Heiher
From: Heiher 

pandn FD, FS, FT
Operation: FD = ((NOT FS) AND FT)

Signed-off-by: Heiher 
Signed-off-by: Fuxin Zhang 
---
 target-mips/translate.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/target-mips/translate.c b/target-mips/translate.c
index 55c2ca0..ed67180 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -3880,6 +3880,8 @@ static void gen_loongson_multimedia(DisasContext *ctx, 
int rd, int rs, int rt)
 case OPC_##UP: gen_helper_##LO(t0, t0); break
 #define LMI_DIRECT(UP, LO, OP) \
 case OPC_##UP: tcg_gen_##OP##_i64(t0, t0, t1); break
+#define LMI_DIRECT_SWAP(UP, LO, OP) \
+case OPC_##UP: tcg_gen_##OP##_i64(t0, t1, t0); break
 
 switch (opc) {
 LMI_HELPER(PADDSH, paddsh);
@@ -3945,8 +3947,8 @@ static void gen_loongson_multimedia(DisasContext *ctx, 
int rd, int rs, int rt)
 LMI_DIRECT(XOR_CP2, xor, xor);
 LMI_DIRECT(NOR_CP2, nor, nor);
 LMI_DIRECT(AND_CP2, and, and);
-LMI_DIRECT(PANDN, pandn, andc);
 LMI_DIRECT(OR, or, or);
+LMI_DIRECT_SWAP(PANDN, pandn, andc);
 
 case OPC_PINSRH_0:
 tcg_gen_deposit_i64(t0, t0, t1, 0, 16);
-- 
2.10.0




Re: [Qemu-devel] [PATCH v2 4/4] spapr: Introduce FWNMI KVM capability

2016-10-12 Thread Alexey Kardashevskiy
On 17/12/15 15:38, Aravinda Prasad wrote:
> 
> 
> On Thursday 17 December 2015 09:32 AM, David Gibson wrote:
>> On Wed, Dec 16, 2015 at 11:38:47AM +0530, Aravinda Prasad wrote:
>>> Introduce a new KVM capability to control how KVM
>>> behaves on machine check exception.
>>>
>>> Without this capability, KVM redirects machine check
>>> exceptions to guest's 0x200 vector if the address in
>>> error belongs to guest. With this capability KVM
>>> causes a guest exit with NMI exit reason.
>>>
>>> This is required to avoid problem if a new kernel/KVM
>>> is used with an old QEMU. As old QEMU might not
>>> understand the new NMI exit type and treat it as a
>>> fatal error, even though the guest could have actually
>>> handled the error if the exception was delivered to
>>> guest's 0x200 interrupt vector.
>>>
>>> PS: KVM_CAP_PPC_FWNMI is set to 121 as 119 and 120 are
>>> used by KVM_CAP_GUEST_DEBUG_HW_BPS and KVM_CAP_GUEST_DEBUG_HW_WPS
>>> in KVM code, but still not reflected in QEMU code.
>>
>> The commit message seems to be written as if this were the kernel
>> patch adding the capability there, rather than the qemu patch using
>> it.
>>
> 
> I will reword it.


Has there been any progress with regard to FWNMI?



> 
> Regards,
> Aravinda
> 
>>> Signed-off-by: Aravinda Prasad 
>>> ---
>>>  hw/ppc/spapr_rtas.c   |   17 +
>>>  linux-headers/linux/kvm.h |1 +
>>>  target-ppc/kvm.c  |2 ++
>>>  3 files changed, 20 insertions(+)
>>>
>>> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
>>> index 17c4672..53319da 100644
>>> --- a/hw/ppc/spapr_rtas.c
>>> +++ b/hw/ppc/spapr_rtas.c
>>> @@ -38,6 +38,8 @@
>>>  #include 
>>>  #include "hw/ppc/spapr_drc.h"
>>>  
>>> +extern int cap_fwnmi;
>>> +
>>>  /* #define DEBUG_SPAPR */
>>>  
>>>  #ifdef DEBUG_SPAPR
>>> @@ -603,9 +605,24 @@ static void rtas_ibm_nmi_register(PowerPCCPU *cpu,
>>>target_ulong args,
>>>uint32_t nret, target_ulong rets)
>>>  {
>>> +int ret;
>>> +CPUState *cs = CPU(cpu);
>>> +
>>> +if (!cap_fwnmi) {
>>> +rtas_st(rets, 0, RTAS_OUT_NOT_SUPPORTED);
>>> +return;
>>> +}
>>> +
>>>  spapr->mc_in_progress = false;
>>>  qemu_cond_init(>mc_delivery_cond);
>>>  spapr->guest_machine_check_addr = rtas_ld(args, 1);
>>> +
>>> +ret = kvm_vcpu_enable_cap(cs, KVM_CAP_PPC_FWNMI, 0);
>>> +if (ret < 0) {
>>> +rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
>>> +return;
>>> +}
>>> +
>>>  rtas_st(rets, 0, RTAS_OUT_SUCCESS);
>>>  }
>>>  
>>> diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
>>> index 683f713..2db1fba 100644
>>> --- a/linux-headers/linux/kvm.h
>>> +++ b/linux-headers/linux/kvm.h
>>> @@ -819,6 +819,7 @@ struct kvm_ppc_smmu_info {
>>>  #define KVM_CAP_DISABLE_QUIRKS 116
>>>  #define KVM_CAP_X86_SMM 117
>>>  #define KVM_CAP_MULTI_ADDRESS_SPACE 118
>>> +#define KVM_CAP_PPC_FWNMI 121
>>>  
>>>  #ifdef KVM_CAP_IRQ_ROUTING
>>>  
>>> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
>>> index 2bbb46d..5339c04 100644
>>> --- a/target-ppc/kvm.c
>>> +++ b/target-ppc/kvm.c
>>> @@ -74,6 +74,7 @@ static int cap_ppc_watchdog;
>>>  static int cap_papr;
>>>  static int cap_htab_fd;
>>>  static int cap_fixup_hcalls;
>>> +int cap_fwnmi;
>>>  
>>>  static uint32_t debug_inst_opcode;
>>>  
>>> @@ -116,6 +117,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>>>   * only activated after this by kvmppc_set_papr() */
>>>  cap_htab_fd = kvm_check_extension(s, KVM_CAP_PPC_HTAB_FD);
>>>  cap_fixup_hcalls = kvm_check_extension(s, KVM_CAP_PPC_FIXUP_HCALL);
>>> +cap_fwnmi = kvm_check_extension(s, KVM_CAP_PPC_FWNMI);
>>>  
>>>  if (!cap_interrupt_level) {
>>>  fprintf(stderr, "KVM: Couldn't find level irq capability. Expect 
>>> the "
>>>
>>
> 


-- 
Alexey



Re: [Qemu-devel] [PATCH 14/15] xen: Rename xen_be_del_xendev

2016-10-12 Thread Xuquan (Quan Xu)
On October 12, 2016 9:46 PM, Anthony PERARD < anthony.per...@citrix.com > wrote:
>On Tue, Oct 04, 2016 at 09:43:43AM +0300, Emil Condrea wrote:
>> Prepare xen_be_del_xendev to be shared with frontends:
>>  * xen_be_del_xendev -> xen_pv_del_xendev
>>
>> Signed-off-by: Emil Condrea 
>
>Acked-by: Anthony PERARD 
>

Reviewed-by: Quan Xu 

Quan



Re: [Qemu-devel] [PATCH 13/15] xen: Rename xen_be_find_xendev

2016-10-12 Thread Xuquan (Quan Xu)
On October 12, 2016 9:42 PM, Anthony PERARD < anthony.per...@citrix.com > wrote:
>On Tue, Oct 04, 2016 at 09:43:42AM +0300, Emil Condrea wrote:
>> Prepare xen_be_find_xendev to be shared with frontends:
>>  * xen_be_find_xendev -> xen_pv_find_xendev
>>
>> Signed-off-by: Emil Condrea 
>
>Acked-by: Anthony PERARD 
>

Reviewed-by: Quan Xu 

Quan



Re: [Qemu-devel] [PATCH 12/15] xen: Rename xen_be_evtchn_event

2016-10-12 Thread Xuquan (Quan Xu)
On October 12, 2016 9:41 PM, Anthony PERARD < anthony.per...@citrix.com > wrote:
>On Tue, Oct 04, 2016 at 09:43:41AM +0300, Emil Condrea wrote:
>> Prepare xen_be_evtchn_event to be shared with frontends:
>>  * xen_be_evtchn_event -> xen_pv_evtchn_event
>>
>> Signed-off-by: Emil Condrea 
>
>Acked-by: Anthony PERARD 
>

Reviewed-by: Quan Xu 

Quan



Re: [Qemu-devel] [PATCH 11/15] xen: Rename xen_be_send_notify

2016-10-12 Thread Xuquan (Quan Xu)
On October 12, 2016 9:41 PM, Anthony PERARD < anthony.per...@citrix.com > wrote:
>On Tue, Oct 04, 2016 at 09:43:40AM +0300, Emil Condrea wrote:
>> Prepare xen_be_send_notify to be shared with frontends:
>>  * xen_be_send_notify -> xen_pv_send_notify
>>
>> Signed-off-by: Emil Condrea 
>
>Acked-by: Anthony PERARD 
>

Reviewed-by: Quan Xu 

Quan



Re: [Qemu-devel] [PATCH 10/15] xen: Rename xen_be_unbind_evtchn

2016-10-12 Thread Xuquan (Quan Xu)
On October 12, 2016 9:37 PM, Anthony PERARD < anthony.per...@citrix.com > wrote:
>On Tue, Oct 04, 2016 at 09:43:39AM +0300, Emil Condrea wrote:
>> Prepare xen_be_unbind_evtchn to be shared with frontends:
>>  * xen_be_unbind_evtchn -> xen_pv_unbind_evtchn
>>
>> Signed-off-by: Emil Condrea 
>
>Acked-by: Anthony PERARD 
>

Reviewed-by: Quan Xu 

Quan



Re: [Qemu-devel] Async savevm using userfaultfd(2)

2016-10-12 Thread Hailiang Zhang

On 2016/10/12 22:21, Dr. David Alan Gilbert wrote:

* Stefan Hajnoczi (stefa...@gmail.com) wrote:

John and I recently discussed asynchronous savevm and I wanted to post
the ideas so they aren't forgotten.  (We're not actively working on this
feature.)

Asynchronous savevm has the same effect as the 'savevm' monitor command:
it saves RAM, device state, and a snapshot of all disks at the point in
time the command was issued.

The current 'savevm' monitor command is synchronous so the guest and
QEMU monitor are blocked while the operation runs (it can take a
while!).  Asynchronous savevm has the advantage of allowing the guest
and QEMU monitor to continue while the operation is running.

This sounds similar to live migration to file but remember that live
migration's consistency point is when the guest is paused at the end of
the iteration phase.  The user has no control over *when* live migration
captures the guest state.  Therefore it's not a useful command for
taking snapshots of guest state at a specific point in time - we need
asynchronous savevm for that.

Async savevm must copy-on-write guest RAM so the guest can continue
writing to memory while the snapshot is being saved.  Rik van Riel
suggested using userfaultfd(2) to do this on Linux.

Unlike post-copy live migration, we want to track memory writes (instead
of missing page faults).  The userfaultfd(2) flag
UFFDIO_REGISTER_MODE_WP provides these semantics.  Unfortunately I think
UFFDIO_REGISTER_MODE_WP is not yet implemented?


A prototype of this has already been written by Hailiang Zhang;
see https://lists.gnu.org/archive/html/qemu-devel/2016-08/msg03441.html



Yes, I have updated it to a 2nd version in private, but unfortunately
there are still some problems with the UFFDIO_REGISTER_MODE_WP API in the
kernel: it still can't support KVM (it only supports TCG mode).
I have given feedback to Andrea, but got no response ... :(
http://www.mail-archive.com/qemu-devel@nongnu.org/msg394897.html


Once UFFDIO_REGISTER_MODE_WP is available QEMU can catch writes to guest
RAM and copy the original pages to a buffer.  If memory is dirtied too
quickly then it's necessary to throttle the guest or fail the savevm
operation.


The only limit there is the size of the buffer, waiting for space will
do the throttling.



Yes, we can optimize it by extending the size of the buffer and using multiple
fds to handle the user faults.

Hailiang


Dave



Perhaps this approach can be prototyped with mprotect and a SIGSEGV
handler if anyone wants to get async savevm going.  I don't know if
there are any disadvantages to mprotecting guest RAM that the kvm kernel
module is using.  Hopefully in-kernel devices and vhost will continue to
work.

Stefan



--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK


.






Re: [Qemu-devel] [PATCH v2 04/11] blockjobs: Always use block_job_get_aio_context

2016-10-12 Thread John Snow
As context to everyone else as to why I'm going down the rabbit hole of 
trying to remove external references to AioContext at all, see 
https://lists.gnu.org/archive/html/qemu-devel/2016-10/msg00795.html


On 10/07/2016 03:49 AM, Paolo Bonzini wrote:



On 06/10/2016 22:22, John Snow wrote:

Calls .complete(), for which the only implementation is
mirror_complete. Uh, this actually seems messy. Looks like there's
nothing to prevent us from calling this after we've already told it to
complete once.


Yeah, it should have an

if (s->should_complete) {
return;
}

at the beginning.  I have other mirror.c patches in my queue so I can
take care of that.



Or something up the stack at block_job_complete so it's not up to job 
implementations? What if the next implementer "forgets"?



block_job_cancel and block_job_complete are different.

block_job_cancel is called in many places, but we can just add a similar
block_job_user_cancel if we wanted a version which takes care to acquire
context and one that does not. (Or we could just acquire the context
regardless, but Paolo warned me ominously that recursive locks are EVIL.
He sounded serious.)


Not that many places:

- block_job_finish_sync calls it, and you can just release/reacquire
around the call to "finish(job, &local_err)".



This makes me a little nervous because we went through the trouble of 
creating this callback, but we're going to assume we know that it's a 
public interface that will take the lock for itself (or otherwise does 
not require a lock.)


In practice it works, but it seems needlessly mystifying in terms of 
proving correctness.



- there are two callers in blockdev.c, and you can just remove the
acquire/release from blockdev.c if you push it in block_job_cancel.



Makes sense; I don't like the association of (bs :: job) here anyway. 
Again we're grabbing context for a job where that job may not even be 
running.



As to block_job_cancel_sync:



Which I didn't audit, because no callers use job->blk to get the 
AioContext before calling this; they use bs if bs->job is present.



- replication_stop is not acquiring s->secondary_disk->bs's AioContext.



Seems like a bug on their part. Would be fixed by having cancel acquire 
context for itself.



- there is no need to hold the AioContext between ->prepare and ->clean.
 My suggestion is to ref the blockjob after storing it in state->job
(you probably should do that anyway) and unref'ing it in ->clean.  Then
you can again let block_job_cancel_sync(bs->job) take the
AioContext, which it will do in block_job_finish_sync.


Yeah, I should be reffing it anyway.

The rest of this... What I think you mean is acquiring and releasing the 
context as needed for EACH of prepare, commit, abort, and clean as 
necessary, right?


And then in this case, it simply wouldn't be necessary for abort, as the 
sync cancel would do it for us.





block_job_complete has no direct callers outside of QMP, but it is also
used as a callback by block_job_complete_sync, used in qemu-img for
run_block_job. I can probably rewrite qemu_img to avoid this usage.


No need to: qemu-img is not acquiring the AioContext, so it's okay to
let block_job_complete do that (like block_job_cancel,
block_job_complete will be called by block_job_finish_sync without the
AioContext acquired).



Eh? Oh, you're right, it just gets it for the sake of aio_poll.


Paolo




Alright.


Say I *do* push the acquisitions down into blockjob.c. What benefit does 
that provide? Won't I still need the block_job_get_aio_context() 
function (At least internally) to know which context to acquire? This 
would preclude you from deleting it.


Plus... we remove some fairly simple locking mechanisms and then inflate 
it tenfold. I'm not convinced this is an improvement.


As context and a refresher (for me when I re-read this email in 12 
hours,) there are three places externally that are using an AioContext 
lock as acquired from *within* a BlockJob, excluding those that acquire 
a context separately from a Job and use that to reason that accesses to 
the job are safe (For example, blockdev_mark_auto_del.)


(1) QMP interface for job management
(2) bdrv_drain_all, in block/io.c


(1) AFAICT, the QMP interface is concerned with assuring itself it has 
unique access to the BlockJob structure itself, and it doesn't really 
authentically care about the AIOContext itself -- just race-free access 
to the Job.


This is not necessarily buggy today because, even though we grab the 
BlockBackend's context unconditionally, we already know the main/monitor 
thread is not accessing the blockjob. It's still silly, though.


(2) bdrv_drain_all appears to be worried about the same thing; we just 
need to safely deliver pause/resume messages.


I'm less sure about where this can run from, and suspect that if the job 
has deferred to main that this could be buggy. If bdrv_drain_all is 
called from context A and the job is running on context M having 

[Qemu-devel] [PATCHv3 6/7] spapr_pci: Add a 64-bit MMIO window

2016-10-12 Thread David Gibson
On real hardware, and under pHyp, the PCI host bridges on Power machines
typically advertise two outbound MMIO windows from the guest's physical
memory space to PCI memory space:
  - A 32-bit window which maps onto 2GiB..4GiB in the PCI address space
  - A 64-bit window which maps onto a large region somewhere high in PCI
address space (traditionally this used an identity mapping from guest
physical address to PCI address, but that's not always the case)

The qemu implementation in spapr-pci-host-bridge, however, only supports a
single outbound MMIO window.  At least some Linux versions expect
the two windows, though, so we arranged this window to map onto the PCI
memory space from 2 GiB..~64 GiB, then advertised it as two contiguous
windows, the "32-bit" window from 2G..4G and the "64-bit" window from
4G..~64G.

This approach means, however, that the 64G window is not naturally aligned.
In turn this limits the size of the largest BAR we can map (which does have
to be naturally aligned) to roughly half of the total window.  With some
large nVidia GPGPU cards which have huge memory BARs, this is starting to
be a problem.

This patch adds true support for separate 32-bit and 64-bit outbound MMIO
windows to the spapr-pci-host-bridge implementation, each of which can
be independently configured.  The 32-bit window always maps to 2G.. in PCI
space, but the PCI address of the 64-bit window can be configured (it
defaults to the same as the guest physical address).

So as not to break possible existing configurations, as long as a 64-bit
window is not specified, a large single window can be specified.  This
will appear the same way to the guest as the old approach, although it's
now implemented by two contiguous memory regions rather than a single one.

For now, this only adds the possibility of 64-bit windows.  The default
configuration still uses the legacy mode.

Signed-off-by: David Gibson 
---
 hw/ppc/spapr.c  | 10 +--
 hw/ppc/spapr_pci.c  | 70 -
 include/hw/pci-host/spapr.h |  8 --
 include/hw/ppc/spapr.h  |  3 +-
 4 files changed, 72 insertions(+), 19 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index e6b110d..8db3657 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2371,7 +2371,8 @@ static HotpluggableCPUList 
*spapr_query_hotpluggable_cpus(MachineState *machine)
 }
 
 static void spapr_phb_placement(sPAPRMachineState *spapr, uint32_t index,
-uint64_t *buid, hwaddr *pio, hwaddr *mmio,
+uint64_t *buid, hwaddr *pio,
+hwaddr *mmio32, hwaddr *mmio64,
 unsigned n_dma, uint32_t *liobns, Error **errp)
 {
 const uint64_t base_buid = 0x8002000ULL;
@@ -2409,7 +2410,12 @@ static void spapr_phb_placement(sPAPRMachineState 
*spapr, uint32_t index,
 
 phb_base = phb0_base + index * phb_spacing;
 *pio = phb_base + pio_offset;
-*mmio = phb_base + mmio_offset;
+*mmio32 = phb_base + mmio_offset;
+/*
+ * We don't set the 64-bit MMIO window, relying on the PHB's
+ * fallback behaviour of automatically splitting a large "32-bit"
+ * window into contiguous 32-bit and 64-bit windows
+ */
 }
 
 static void spapr_machine_class_init(ObjectClass *oc, void *data)
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 0e6cf4d..31ca6fa 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -1317,14 +1317,16 @@ static void spapr_phb_realize(DeviceState *dev, Error 
**errp)
 if ((sphb->buid != (uint64_t)-1) || (sphb->dma_liobn[0] != 
(uint32_t)-1)
 || (sphb->dma_liobn[1] != (uint32_t)-1 && windows_supported == 2)
 || (sphb->mem_win_addr != (hwaddr)-1)
+|| (sphb->mem64_win_addr != (hwaddr)-1)
 || (sphb->io_win_addr != (hwaddr)-1)) {
 error_setg(errp, "Either \"index\" or other parameters must"
" be specified for PAPR PHB, not both");
 return;
 }
 
-smc->phb_placement(spapr, sphb->index, >buid,
-   >io_win_addr, >mem_win_addr,
+smc->phb_placement(spapr, sphb->index,
+   >buid, >io_win_addr,
+   >mem_win_addr, >mem64_win_addr,
windows_supported, sphb->dma_liobn, _err);
 if (local_err) {
 error_propagate(errp, local_err);
@@ -1353,6 +1355,38 @@ static void spapr_phb_realize(DeviceState *dev, Error 
**errp)
 return;
 }
 
+if (sphb->mem64_win_size != 0) {
+if (sphb->mem64_win_addr == (hwaddr)-1) {
+error_setg(errp,
+   "64-bit memory window address not specified for PHB");
+return;
+}
+
+if (sphb->mem_win_size > SPAPR_PCI_MEM32_WIN_SIZE) {
+error_setg(errp, "32-bit memory window of size 

[Qemu-devel] [PATCHv3 7/7] spapr: Improved placement of PCI host bridges in guest memory map

2016-10-12 Thread David Gibson
Currently, the MMIO space for accessing PCI on pseries guests begins at
1 TiB in guest address space.  Each PCI host bridge (PHB) has a 64 GiB
chunk of address space in which it places its outbound PIO and 32-bit and
64-bit MMIO windows.

This scheme has several problems:
  - It limits guest RAM to 1 TiB (though we have a limited fix for this
now)
  - It limits the total MMIO window to 64 GiB.  This is not always enough
for some of the large nVidia GPGPU cards
  - Putting all the windows into a single 64 GiB area means that naturally
aligning things within there will waste more address space.
In addition there was a miscalculation in some of the defaults, which meant
that the MMIO windows for each PHB actually slightly overran the 64 GiB
region for that PHB.  We got away without nasty consequences because
the overrun fit within an unused area at the beginning of the next PHB's
region, but it's not pretty.

This patch implements a new scheme which addresses those problems, and is
also closer to what bare metal hardware and pHyp guests generally use.

Because some guest versions (including most current distro kernels) can't
access PCI MMIO above 64 TiB, we put all the PCI windows between 32 TiB and
64 TiB.  This is broken into 1 TiB chunks.  The first 1 TiB chunk contains
the PIO (64 kiB) and 32-bit MMIO (2 GiB) windows for all of the PHBs.  Each
subsequent TiB chunk contains a naturally aligned 64-bit MMIO window for
one PHB each.

This reduces the number of allowed PHBs (without full manual configuration
of all the windows) from 256 to 31, but this should still be plenty in
practice.

We also change some of the default window sizes for manually configured
PHBs to saner values.

Finally we adjust some tests and libqos so that they correctly use the new
default locations.  Ideally they would parse the device tree given to the
guest, but that's a more complex problem for another time.

Signed-off-by: David Gibson 
---
 hw/ppc/spapr.c  | 126 +---
 hw/ppc/spapr_pci.c  |   5 +-
 include/hw/pci-host/spapr.h |   8 ++-
 tests/endianness-test.c |   3 +-
 tests/libqos/pci-spapr.c|   9 ++--
 tests/spapr-phb-test.c  |   2 +-
 6 files changed, 113 insertions(+), 40 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 8db3657..2d952a8 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2375,31 +2375,42 @@ static void spapr_phb_placement(sPAPRMachineState 
*spapr, uint32_t index,
 hwaddr *mmio32, hwaddr *mmio64,
 unsigned n_dma, uint32_t *liobns, Error **errp)
 {
+/*
+ * New-style PHB window placement.
+ *
+ * Goals: Gives large (1TiB), naturally aligned 64-bit MMIO window
+ * for each PHB, in addition to 2GiB 32-bit MMIO and 64kiB PIO
+ * windows.
+ *
+ * Some guest kernels can't work with MMIO windows above 1<<46
+ * (64TiB), so we place up to 31 PHBs in the area 32TiB..64TiB
+ *
+ * 32TiB..33TiB contains the PIO and 32-bit MMIO windows for all
+ * PHBs.  33..34TiB has the 64-bit MMIO window for PHB0, 34..35
+ * has the 64-bit window for PHB1 and so forth.
+ */
 const uint64_t base_buid = 0x8002000ULL;
-const hwaddr phb_spacing = 0x10ULL; /* 64 GiB */
-const hwaddr mmio_offset = 0xa000; /* 2 GiB + 512 MiB */
-const hwaddr pio_offset = 0x8000; /* 2 GiB */
-const uint32_t max_index = 255;
-const hwaddr phb0_alignment = 0x100ULL; /* 1 TiB */
 
-uint64_t ram_top = MACHINE(spapr)->ram_size;
-hwaddr phb0_base, phb_base;
+int max_phbs =
+(SPAPR_PCI_LIMIT - SPAPR_PCI_BASE) / SPAPR_PCI_MEM64_WIN_SIZE - 1;
+hwaddr mmio32_base = SPAPR_PCI_BASE + SPAPR_PCI_MEM32_WIN_SIZE;
+hwaddr mmio64_base = SPAPR_PCI_BASE + SPAPR_PCI_MEM64_WIN_SIZE;
 int i;
 
-/* Do we have hotpluggable memory? */
-if (MACHINE(spapr)->maxram_size > ram_top) {
-/* Can't just use maxram_size, because there may be an
- * alignment gap between normal and hotpluggable memory
- * regions */
-ram_top = spapr->hotplug_memory.base +
-memory_region_size(>hotplug_memory.mr);
-}
-
-phb0_base = QEMU_ALIGN_UP(ram_top, phb0_alignment);
-
-if (index > max_index) {
+/* Sanity check natural alignments */
+assert((SPAPR_PCI_BASE % SPAPR_PCI_MEM64_WIN_SIZE) == 0);
+assert((SPAPR_PCI_LIMIT % SPAPR_PCI_MEM64_WIN_SIZE) == 0);
+assert((SPAPR_PCI_MEM64_WIN_SIZE % SPAPR_PCI_MEM32_WIN_SIZE) == 0);
+assert((SPAPR_PCI_MEM32_WIN_SIZE % SPAPR_PCI_IO_WIN_SIZE) == 0);
+/* Sanity check bounds */
+assert((SPAPR_PCI_BASE + max_phbs * SPAPR_PCI_IO_WIN_SIZE)
+   <= mmio32_base);
+assert(mmio32_base + max_phbs * SPAPR_PCI_MEM32_WIN_SIZE
+   <= mmio64_base);
+
+if (index >= max_phbs) {
 error_setg(errp, "\"index\" for PAPR PHB is too large (max %u)",
-

[Qemu-devel] [PATCHv3 1/7] libqos: Isolate knowledge of spapr memory map to qpci_init_spapr()

2016-10-12 Thread David Gibson
The libqos code for accessing PCI on the spapr machine type uses IOBASE()
and MMIOBASE() macros to determine the address in the CPU memory map of
the windows to PCI address space.

This is a detail of the implementation of PCI in the machine type, it's not
specified by the PAPR standard.  Real guests would get the addresses of the
PCI windows from the device tree.

Finding the device tree in libqos would be awkward, but we can at least
localize this knowledge of the implementation to the init function, saving
it in the QPCIBusSPAPR structure for use by the accessors.

That leaves only one place to fix if we alter the location of the PCI
windows, as we're planning to do.

Signed-off-by: David Gibson 
Reviewed-by: Laurent Vivier 
---
 tests/libqos/pci-spapr.c | 113 +++
 1 file changed, 64 insertions(+), 49 deletions(-)

diff --git a/tests/libqos/pci-spapr.c b/tests/libqos/pci-spapr.c
index 2f73bad..1765a54 100644
--- a/tests/libqos/pci-spapr.c
+++ b/tests/libqos/pci-spapr.c
@@ -18,30 +18,23 @@
 
 /* From include/hw/pci-host/spapr.h */
 
-#define SPAPR_PCI_BASE_BUID  0x8002000ULL
-
-#define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x8000ULL
-
-#define SPAPR_PCI_WINDOW_BASE0x100ULL
-#define SPAPR_PCI_WINDOW_SPACING 0x10ULL
-#define SPAPR_PCI_MMIO_WIN_OFF   0xA000
-#define SPAPR_PCI_MMIO_WIN_SIZE  (SPAPR_PCI_WINDOW_SPACING - \
- SPAPR_PCI_MEM_WIN_BUS_OFFSET)
-#define SPAPR_PCI_IO_WIN_OFF 0x8000
-#define SPAPR_PCI_IO_WIN_SIZE0x1
-
-/* index is the phb index */
-
-#define BUIDBASE(index)  (SPAPR_PCI_BASE_BUID + (index))
-#define PCIBASE(index)   (SPAPR_PCI_WINDOW_BASE + \
-  (index) * SPAPR_PCI_WINDOW_SPACING)
-#define IOBASE(index)(PCIBASE(index) + SPAPR_PCI_IO_WIN_OFF)
-#define MMIOBASE(index)  (PCIBASE(index) + SPAPR_PCI_MMIO_WIN_OFF)
+typedef struct QPCIWindow {
+uint64_t pci_base;/* window address in PCI space */
+uint64_t size;/* window size */
+} QPCIWindow;
 
 typedef struct QPCIBusSPAPR {
 QPCIBus bus;
 QGuestAllocator *alloc;
 
+uint64_t buid;
+
+uint64_t pio_cpu_base;
+QPCIWindow pio;
+
+uint64_t mmio_cpu_base;
+QPCIWindow mmio;
+
 uint64_t pci_hole_start;
 uint64_t pci_hole_size;
 uint64_t pci_hole_alloc;
@@ -59,69 +52,75 @@ typedef struct QPCIBusSPAPR {
 
 static uint8_t qpci_spapr_io_readb(QPCIBus *bus, void *addr)
 {
+QPCIBusSPAPR *s = container_of(bus, QPCIBusSPAPR, bus);
 uint64_t port = (uintptr_t)addr;
 uint8_t v;
-if (port < SPAPR_PCI_IO_WIN_SIZE) {
-v = readb(IOBASE(0) + port);
+if (port < s->pio.size) {
+v = readb(s->pio_cpu_base + port);
 } else {
-v = readb(MMIOBASE(0) + port);
+v = readb(s->mmio_cpu_base + port);
 }
 return v;
 }
 
 static uint16_t qpci_spapr_io_readw(QPCIBus *bus, void *addr)
 {
+QPCIBusSPAPR *s = container_of(bus, QPCIBusSPAPR, bus);
 uint64_t port = (uintptr_t)addr;
 uint16_t v;
-if (port < SPAPR_PCI_IO_WIN_SIZE) {
-v = readw(IOBASE(0) + port);
+if (port < s->pio.size) {
+v = readw(s->pio_cpu_base + port);
 } else {
-v = readw(MMIOBASE(0) + port);
+v = readw(s->mmio_cpu_base + port);
 }
 return bswap16(v);
 }
 
 static uint32_t qpci_spapr_io_readl(QPCIBus *bus, void *addr)
 {
+QPCIBusSPAPR *s = container_of(bus, QPCIBusSPAPR, bus);
 uint64_t port = (uintptr_t)addr;
 uint32_t v;
-if (port < SPAPR_PCI_IO_WIN_SIZE) {
-v = readl(IOBASE(0) + port);
+if (port < s->pio.size) {
+v = readl(s->pio_cpu_base + port);
 } else {
-v = readl(MMIOBASE(0) + port);
+v = readl(s->mmio_cpu_base + port);
 }
 return bswap32(v);
 }
 
 static void qpci_spapr_io_writeb(QPCIBus *bus, void *addr, uint8_t value)
 {
+QPCIBusSPAPR *s = container_of(bus, QPCIBusSPAPR, bus);
 uint64_t port = (uintptr_t)addr;
-if (port < SPAPR_PCI_IO_WIN_SIZE) {
-writeb(IOBASE(0) + port, value);
+if (port < s->pio.size) {
+writeb(s->pio_cpu_base + port, value);
 } else {
-writeb(MMIOBASE(0) + port, value);
+writeb(s->mmio_cpu_base + port, value);
 }
 }
 
 static void qpci_spapr_io_writew(QPCIBus *bus, void *addr, uint16_t value)
 {
+QPCIBusSPAPR *s = container_of(bus, QPCIBusSPAPR, bus);
 uint64_t port = (uintptr_t)addr;
 value = bswap16(value);
-if (port < SPAPR_PCI_IO_WIN_SIZE) {
-writew(IOBASE(0) + port, value);
+if (port < s->pio.size) {
+writew(s->pio_cpu_base + port, value);
 } else {
-writew(MMIOBASE(0) + port, value);
+writew(s->mmio_cpu_base + port, value);
 }
 }
 
 static void qpci_spapr_io_writel(QPCIBus *bus, void *addr, uint32_t value)
 {
+QPCIBusSPAPR *s = 

[Qemu-devel] [PATCHv3 4/7] spapr_pci: Delegate placement of PCI host bridges to machine type

2016-10-12 Thread David Gibson
The 'spapr-pci-host-bridge' represents the virtual PCI host bridge (PHB)
for a PAPR guest.  Unlike on x86, it's routine on Power (both bare metal
and PAPR guests) to have numerous independent PHBs, each controlling a
separate PCI domain.

There are two ways of configuring the spapr-pci-host-bridge device: first
it can be done fully manually, specifying the locations and sizes of all
the IO windows.  This gives the most control, but is very awkward with 6
mandatory parameters.  Alternatively just an "index" can be specified
which essentially selects from an array of predefined PHB locations.
The PHB at index 0 is automatically created as the default PHB.

The current set of default locations causes some problems for guests with
large RAM (> 1 TiB) or PCI devices with very large BARs (e.g. big nVidia
GPGPU cards via VFIO).  Obviously, for migration we can only change the
locations on a new machine type, however.

This is awkward, because the placement is currently decided within the
spapr-pci-host-bridge code, so it breaks abstraction to look inside the
machine type version.

So, this patch delegates the "default mode" PHB placement from the
spapr-pci-host-bridge device back to the machine type via a public method
in sPAPRMachineClass.  It's still a bit ugly, but it's about the best we
can do.

For now, this just changes where the calculation is done.  It doesn't
change the actual location of the host bridges, or any other behaviour.

Signed-off-by: David Gibson 
---
 hw/ppc/spapr.c  | 31 +++
 hw/ppc/spapr_pci.c  | 21 +++--
 include/hw/pci-host/spapr.h | 11 +--
 include/hw/ppc/spapr.h  |  3 +++
 4 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 03e3803..cb9da96 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2370,6 +2370,36 @@ static HotpluggableCPUList 
*spapr_query_hotpluggable_cpus(MachineState *machine)
 return head;
 }
 
+static void spapr_phb_placement(sPAPRMachineState *spapr, uint32_t index,
+uint64_t *buid, hwaddr *pio, hwaddr *mmio,
+unsigned n_dma, uint32_t *liobns, Error **errp)
+{
+const uint64_t base_buid = 0x800000020000000ULL;
+const hwaddr phb0_base = 0x10000000000ULL; /* 1 TiB */
+const hwaddr phb_spacing = 0x1000000000ULL; /* 64 GiB */
+const hwaddr mmio_offset = 0xa0000000; /* 2 GiB + 512 MiB */
+const hwaddr pio_offset = 0x80000000; /* 2 GiB */
+const uint32_t max_index = 255;
+
+hwaddr phb_base;
+int i;
+
+if (index > max_index) {
+error_setg(errp, "\"index\" for PAPR PHB is too large (max %u)",
+   max_index);
+return;
+}
+
+*buid = base_buid + index;
+for (i = 0; i < n_dma; ++i) {
+liobns[i] = SPAPR_PCI_LIOBN(index, i);
+}
+
+phb_base = phb0_base + index * phb_spacing;
+*pio = phb_base + pio_offset;
+*mmio = phb_base + mmio_offset;
+}
+
 static void spapr_machine_class_init(ObjectClass *oc, void *data)
 {
 MachineClass *mc = MACHINE_CLASS(oc);
@@ -2406,6 +2436,7 @@ static void spapr_machine_class_init(ObjectClass *oc, 
void *data)
 mc->query_hotpluggable_cpus = spapr_query_hotpluggable_cpus;
 fwc->get_dev_path = spapr_get_fw_dev_path;
 nc->nmi_monitor_handler = spapr_nmi;
+smc->phb_placement = spapr_phb_placement;
 }
 
 static const TypeInfo spapr_machine_info = {
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 4f00865..0e6cf4d 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -1311,7 +1311,8 @@ static void spapr_phb_realize(DeviceState *dev, Error 
**errp)
 sphb->ddw_enabled ? SPAPR_PCI_DMA_MAX_WINDOWS : 1;
 
 if (sphb->index != (uint32_t)-1) {
-hwaddr windows_base;
+sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
+Error *local_err = NULL;
 
 if ((sphb->buid != (uint64_t)-1) || (sphb->dma_liobn[0] != 
(uint32_t)-1)
 || (sphb->dma_liobn[1] != (uint32_t)-1 && windows_supported == 2)
@@ -1322,21 +1323,13 @@ static void spapr_phb_realize(DeviceState *dev, Error 
**errp)
 return;
 }
 
-if (sphb->index > SPAPR_PCI_MAX_INDEX) {
-error_setg(errp, "\"index\" for PAPR PHB is too large (max %u)",
-   SPAPR_PCI_MAX_INDEX);
+smc->phb_placement(spapr, sphb->index, &sphb->buid,
+   &sphb->io_win_addr, &sphb->mem_win_addr,
+   windows_supported, sphb->dma_liobn, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
 return;
 }
-
-sphb->buid = SPAPR_PCI_BASE_BUID + sphb->index;
-for (i = 0; i < windows_supported; ++i) {
-sphb->dma_liobn[i] = SPAPR_PCI_LIOBN(sphb->index, i);
-}
-
-windows_base = SPAPR_PCI_WINDOW_BASE
-+ sphb->index * SPAPR_PCI_WINDOW_SPACING;
-

[Qemu-devel] [PATCHv3 3/7] libqos: Limit spapr-pci to 32-bit MMIO for now

2016-10-12 Thread David Gibson
Currently the functions in pci-spapr.c (like pci-pc.c on which it's based)
don't distinguish between 32-bit and 64-bit PCI MMIO.  At the moment, the
qemu side implementation is a bit weird and has a single MMIO window
straddling 32-bit and 64-bit regions, but we're likely to change that in
future.

In any case, pci-pc.c - and therefore the testcases using PCI - only handle
32-bit MMIOs for now.  For spapr despite whatever changes might happen with
the MMIO windows, the 32-bit window is likely to remain at 2..4 GiB in PCI
space.

So, explicitly limit pci-spapr.c to 32-bit MMIOs for now, we can add 64-bit
MMIO support back in when and if we need it.

Signed-off-by: David Gibson 
Reviewed-by: Laurent Vivier 
---
 tests/libqos/pci-spapr.c | 32 +++-
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/tests/libqos/pci-spapr.c b/tests/libqos/pci-spapr.c
index 3192903..558dfc3 100644
--- a/tests/libqos/pci-spapr.c
+++ b/tests/libqos/pci-spapr.c
@@ -32,8 +32,8 @@ typedef struct QPCIBusSPAPR {
 uint64_t pio_cpu_base;
 QPCIWindow pio;
 
-uint64_t mmio_cpu_base;
-QPCIWindow mmio;
+uint64_t mmio32_cpu_base;
+QPCIWindow mmio32;
 
 uint64_t pci_hole_start;
 uint64_t pci_hole_size;
@@ -58,7 +58,7 @@ static uint8_t qpci_spapr_io_readb(QPCIBus *bus, void *addr)
 if (port < s->pio.size) {
 v = readb(s->pio_cpu_base + port);
 } else {
-v = readb(s->mmio_cpu_base + port);
+v = readb(s->mmio32_cpu_base + port);
 }
 return v;
 }
@@ -71,7 +71,7 @@ static uint16_t qpci_spapr_io_readw(QPCIBus *bus, void *addr)
 if (port < s->pio.size) {
 v = readw(s->pio_cpu_base + port);
 } else {
-v = readw(s->mmio_cpu_base + port);
+v = readw(s->mmio32_cpu_base + port);
 }
 return bswap16(v);
 }
@@ -84,7 +84,7 @@ static uint32_t qpci_spapr_io_readl(QPCIBus *bus, void *addr)
 if (port < s->pio.size) {
 v = readl(s->pio_cpu_base + port);
 } else {
-v = readl(s->mmio_cpu_base + port);
+v = readl(s->mmio32_cpu_base + port);
 }
 return bswap32(v);
 }
@@ -96,7 +96,7 @@ static void qpci_spapr_io_writeb(QPCIBus *bus, void *addr, 
uint8_t value)
 if (port < s->pio.size) {
 writeb(s->pio_cpu_base + port, value);
 } else {
-writeb(s->mmio_cpu_base + port, value);
+writeb(s->mmio32_cpu_base + port, value);
 }
 }
 
@@ -108,7 +108,7 @@ static void qpci_spapr_io_writew(QPCIBus *bus, void *addr, 
uint16_t value)
 if (port < s->pio.size) {
 writew(s->pio_cpu_base + port, value);
 } else {
-writew(s->mmio_cpu_base + port, value);
+writew(s->mmio32_cpu_base + port, value);
 }
 }
 
@@ -120,7 +120,7 @@ static void qpci_spapr_io_writel(QPCIBus *bus, void *addr, 
uint32_t value)
 if (port < s->pio.size) {
 writel(s->pio_cpu_base + port, value);
 } else {
-writel(s->mmio_cpu_base + port, value);
+writel(s->mmio32_cpu_base + port, value);
 }
 }
 
@@ -235,12 +235,9 @@ static void qpci_spapr_iounmap(QPCIBus *bus, void *data)
 /* FIXME */
 }
 
-#define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x8000ULL
 #define SPAPR_PCI_WINDOW_BASE0x100ULL
-#define SPAPR_PCI_WINDOW_SPACING 0x10ULL
-#define SPAPR_PCI_MMIO_WIN_OFF   0xA000
-#define SPAPR_PCI_MMIO_WIN_SIZE  (SPAPR_PCI_WINDOW_SPACING - \
- SPAPR_PCI_MEM_WIN_BUS_OFFSET)
+#define SPAPR_PCI_MMIO32_WIN_OFF 0xA000
+#define SPAPR_PCI_MMIO32_WIN_SIZE0x8000 /* 2 GiB */
 #define SPAPR_PCI_IO_WIN_OFF 0x8000
 #define SPAPR_PCI_IO_WIN_SIZE0x1
 
@@ -280,13 +277,14 @@ QPCIBus *qpci_init_spapr(QGuestAllocator *alloc)
 ret->pio.pci_base = 0;
 ret->pio.size = SPAPR_PCI_IO_WIN_SIZE;
 
-ret->mmio_cpu_base = SPAPR_PCI_WINDOW_BASE + SPAPR_PCI_MMIO_WIN_OFF;
-ret->mmio.pci_base = SPAPR_PCI_MEM_WIN_BUS_OFFSET;
-ret->mmio.size = SPAPR_PCI_MMIO_WIN_SIZE;
+/* 32-bit portion of the MMIO window is at PCI address 2..4 GiB */
+ret->mmio32_cpu_base = SPAPR_PCI_WINDOW_BASE + SPAPR_PCI_MMIO32_WIN_OFF;
+ret->mmio32.pci_base = 0x80000000; /* 2 GiB */
+ret->mmio32.size = SPAPR_PCI_MMIO32_WIN_SIZE;
 
 ret->pci_hole_start = 0xC000;
 ret->pci_hole_size =
-ret->mmio.pci_base + ret->mmio.size - ret->pci_hole_start;
+ret->mmio32.pci_base + ret->mmio32.size - ret->pci_hole_start;
 ret->pci_hole_alloc = 0;
 
 ret->pci_iohole_start = 0xc000;
-- 
2.7.4




[Qemu-devel] [PATCHv3 2/7] libqos: Correct error in PCI hole sizing for spapr

2016-10-12 Thread David Gibson
In pci-spapr.c (as in pci-pc.c from which it was derived), the
pci_hole_start/pci_hole_size and pci_iohole_start/pci_iohole_size pairs[1]
essentially define the region of PCI (not CPU) addresses in which MMIO
or PIO BARs respectively will be allocated.

The size value is relative to the start value.  But in pci-spapr.c it is
set to the entire size of the window supported by the (emulated) hardware,
but the start values are *not* at the beginning of the emulated windows.

That means if you tried to map enough PCI BARs, we'd messily overrun the
IO windows, instead of failing in iomap as we should.

This patch corrects this by calculating the hole sizes from the location
of the window in PCI space and the hole start.

[1] Those are bad names, but that's a problem for another time.

Signed-off-by: David Gibson 
Reviewed-by: Laurent Vivier 
---
 tests/libqos/pci-spapr.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/libqos/pci-spapr.c b/tests/libqos/pci-spapr.c
index 1765a54..3192903 100644
--- a/tests/libqos/pci-spapr.c
+++ b/tests/libqos/pci-spapr.c
@@ -285,11 +285,13 @@ QPCIBus *qpci_init_spapr(QGuestAllocator *alloc)
 ret->mmio.size = SPAPR_PCI_MMIO_WIN_SIZE;
 
 ret->pci_hole_start = 0xC000;
-ret->pci_hole_size = SPAPR_PCI_MMIO_WIN_SIZE;
+ret->pci_hole_size =
+ret->mmio.pci_base + ret->mmio.size - ret->pci_hole_start;
 ret->pci_hole_alloc = 0;
 
 ret->pci_iohole_start = 0xc000;
-ret->pci_iohole_size = SPAPR_PCI_IO_WIN_SIZE;
+ret->pci_iohole_size =
+ret->pio.pci_base + ret->pio.size - ret->pci_iohole_start;
 ret->pci_iohole_alloc = 0;
 
 return &ret->bus;
-- 
2.7.4




[Qemu-devel] [PATCHv3 5/7] spapr: Adjust placement of PCI host bridge to allow > 1TiB RAM

2016-10-12 Thread David Gibson
Currently the default PCI host bridge for the 'pseries' machine type is
constructed with its IO windows in the 1TiB..(1TiB + 64GiB) range in
guest memory space.  This means that if > 1TiB of guest RAM is specified,
the RAM will collide with the PCI IO windows, causing serious problems.

Problems won't be obvious until guest RAM goes a bit beyond 1TiB, because
there's a little unused space at the bottom of the area reserved for PCI,
but essentially this means that > 1TiB of RAM has never worked with the
pseries machine type.

This patch fixes this by altering the placement of PHBs on large-RAM VMs.
Instead of always placing the first PHB at 1TiB, it is placed at the next
1 TiB boundary after the maximum RAM address.

Technically, this changes behaviour in a migration-breaking way for
existing machines with > 1TiB maximum memory, but since having > 1 TiB
memory was broken anyway, this seems like a reasonable trade-off.

Signed-off-by: David Gibson 
---
 hw/ppc/spapr.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index cb9da96..e6b110d 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2375,15 +2375,27 @@ static void spapr_phb_placement(sPAPRMachineState 
*spapr, uint32_t index,
 unsigned n_dma, uint32_t *liobns, Error **errp)
 {
 const uint64_t base_buid = 0x8002000ULL;
-const hwaddr phb0_base = 0x100ULL; /* 1 TiB */
 const hwaddr phb_spacing = 0x10ULL; /* 64 GiB */
 const hwaddr mmio_offset = 0xa000; /* 2 GiB + 512 MiB */
 const hwaddr pio_offset = 0x8000; /* 2 GiB */
 const uint32_t max_index = 255;
+const hwaddr phb0_alignment = 0x100ULL; /* 1 TiB */
 
-hwaddr phb_base;
+uint64_t ram_top = MACHINE(spapr)->ram_size;
+hwaddr phb0_base, phb_base;
 int i;
 
+/* Do we have hotpluggable memory? */
+if (MACHINE(spapr)->maxram_size > ram_top) {
+/* Can't just use maxram_size, because there may be an
+ * alignment gap between normal and hotpluggable memory
+ * regions */
+ram_top = spapr->hotplug_memory.base +
+memory_region_size(&spapr->hotplug_memory.mr);
+}
+
+phb0_base = QEMU_ALIGN_UP(ram_top, phb0_alignment);
+
 if (index > max_index) {
 error_setg(errp, "\"index\" for PAPR PHB is too large (max %u)",
max_index);
-- 
2.7.4




[Qemu-devel] [PATCHv3 0/7] Improve PCI IO window organization for pseries

2016-10-12 Thread David Gibson
The current way we organize the IO windows into PCI space for the
pseries machine type has several problems.

  - It makes it difficult to create very large MMIO spaces which is
necessary for certain PCI devices with very large BARs.  This
problem has been known for a while.

  - More recently we discovered a more serious problem: it prevents
more than 1TiB of RAM being added to a pseries guest.

  - It doesn't make very efficient use of address space.

Fixing this is complicated by keeping migration from old versions
working and working out what things belong on which side of the
abstraction barrier between the machine type and the host bridge
device.

This series addresses all these problems.  Patches 1-3/7 perform
preliminary cleanups to the spapr specific PCI test code, which we'll
need to get the tests working with the changed implementation.  4-5/7
represent a minimal fix for the most serious problem (the 1 TiB limit)
- once polished, I'll consider submitting these for the stable branch.
6-7/7 complete a more comprehensive fix.

Changes since v2:
  * Removed window sizes from placement callback.  Having them in there
led to a small behavioural change that wasn't intended
  * Adjusted / added some comments for clarity.
Changes since v1:
  * Removed a debugging printf()
Changes since RFC:
  * Rebase
  * Fixed some bugs
  * Fixed up PCI testcases which were broken by the change (due to
test limitations)
  * Seriously contemplated, then rejected a completely different
approach

*** BLURB HERE ***

David Gibson (7):
  libqos: Isolate knowledge of spapr memory map to qpci_init_spapr()
  libqos: Correct error in PCI hole sizing for spapr
  libqos: Limit spapr-pci to 32-bit MMIO for now
  spapr_pci: Delegate placement of PCI host bridges to machine type
  spapr: Adjust placement of PCI host bridge to allow > 1TiB RAM
  spapr_pci: Add a 64-bit MMIO window
  spapr: Improved placement of PCI host bridges in guest memory map

 hw/ppc/spapr.c  | 119 +++-
 hw/ppc/spapr_pci.c  |  90 +++--
 include/hw/pci-host/spapr.h |  25 +-
 include/hw/ppc/spapr.h  |   4 ++
 tests/endianness-test.c |   3 +-
 tests/libqos/pci-spapr.c| 116 +++---
 tests/spapr-phb-test.c  |   2 +-
 7 files changed, 265 insertions(+), 94 deletions(-)

-- 
2.7.4




Re: [Qemu-devel] [PATCHv2 5/7] spapr: Adjust placement of PCI host bridge to allow > 1TiB RAM

2016-10-12 Thread David Gibson
On Wed, Oct 12, 2016 at 02:06:10PM +0200, Laurent Vivier wrote:
> 
> 
> On 12/10/2016 12:55, David Gibson wrote:
> > On Wed, Oct 12, 2016 at 12:07:50PM +0200, Laurent Vivier wrote:
> >>
> >>
> >> On 12/10/2016 06:44, David Gibson wrote:
> >>> Currently the default PCI host bridge for the 'pseries' machine type is
> >>> constructed with its IO windows in the 1TiB..(1TiB + 64GiB) range in
> >>> guest memory space.  This means that if > 1TiB of guest RAM is specified,
> >>> the RAM will collide with the PCI IO windows, causing serious problems.
> >>>
> >>> Problems won't be obvious until guest RAM goes a bit beyond 1TiB, because
> >>> there's a little unused space at the bottom of the area reserved for PCI,
> >>> but essentially this means that > 1TiB of RAM has never worked with the
> >>> pseries machine type.
> >>>
> >>> This patch fixes this by altering the placement of PHBs on large-RAM VMs.
> >>> Instead of always placing the first PHB at 1TiB, it is placed at the next
> >>> 1 TiB boundary after the maximum RAM address.
> >>>
> >>> Technically, this changes behaviour in a migration-breaking way for
> >>> existing machines with > 1TiB maximum memory, but since having > 1 TiB
> >>> memory was broken anyway, this seems like a reasonable trade-off.
> >>>
> >>> Signed-off-by: David Gibson 
> >>> ---
> >>>  hw/ppc/spapr.c | 12 ++--
> >>>  1 file changed, 10 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >>> index f6e9c2a..7cb167c 100644
> >>> --- a/hw/ppc/spapr.c
> >>> +++ b/hw/ppc/spapr.c
> >>> @@ -2376,15 +2376,23 @@ static void spapr_phb_placement(sPAPRMachineState 
> >>> *spapr, uint32_t index,
> >>>  unsigned n_dma, uint32_t *liobns, Error 
> >>> **errp)
> >>>  {
> >>>  const uint64_t base_buid = 0x8002000ULL;
> >>> -const hwaddr phb0_base = 0x100ULL; /* 1 TiB */
> >>>  const hwaddr phb_spacing = 0x10ULL; /* 64 GiB */
> >>>  const hwaddr mmio_offset = 0xa000; /* 2 GiB + 512 MiB */
> >>>  const hwaddr pio_offset = 0x8000; /* 2 GiB */
> >>>  const uint32_t max_index = 255;
> >>> +const hwaddr phb0_alignment = 0x100ULL; /* 1 TiB */
> >>>  
> >>> -hwaddr phb_base;
> >>> +uint64_t ram_top = MACHINE(spapr)->ram_size;
> >>> +hwaddr phb0_base, phb_base;
> >>>  int i;
> >>>  
> >>> +if (MACHINE(spapr)->maxram_size > ram_top) {
> >>> +ram_top = spapr->hotplug_memory.base +
> >>> +memory_region_size(&spapr->hotplug_memory.mr);
> >>> +}
> >>
> >> Why don't you set directly ram_top to maxram_size?
> > 
> > Because there may be an alignment gap between ram_size and the bottom
> > of the hotplug region.
> 
> Perhaps you could add a comment why we have this check:
> 
> when machine->ram_size == machine->maxram_size, there is no hotpluggable
> memory.

Good idea, I've added something.

> BTW, something strange in hotpluggable memory in hw/ppc/spapr.c:
> 
>  /* initialize hotplug memory address space */
>  if (machine->ram_size < machine->maxram_size) {
>  ram_addr_t hotplug_mem_size = machine->maxram_size -
> machine->ram_size;
> ...
>  spapr->hotplug_memory.base = ROUND_UP(machine->ram_size,
>SPAPR_HOTPLUG_MEM_ALIGN);
>  memory_region_init(&spapr->hotplug_memory.mr, OBJECT(spapr),
> "hotplug-memory", hotplug_mem_size);
> 
> So the end of hotpluggable memory can be beyond maxram_size, is that normal?

Yes.  maxram_size is the total possible amount of RAM not counting
gaps, rather than the maximum possible RAM address.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCHv2 4/7] spapr_pci: Delegate placement of PCI host bridges to machine type

2016-10-12 Thread David Gibson
On Wed, Oct 12, 2016 at 11:26:05AM +0200, Laurent Vivier wrote:
> 
> 
> On 12/10/2016 06:44, David Gibson wrote:
> > The 'spapr-pci-host-bridge' represents the virtual PCI host bridge (PHB)
> > for a PAPR guest.  Unlike on x86, it's routine on Power (both bare metal
> > and PAPR guests) to have numerous independent PHBs, each controlling a
> > separate PCI domain.
> > 
> > There are two ways of configuring the spapr-pci-host-bridge device: first
> > it can be done fully manually, specifying the locations and sizes of all
> > the IO windows.  This gives the most control, but is very awkward with 6
> > mandatory parameters.  Alternatively just an "index" can be specified
> > which essentially selects from an array of predefined PHB locations.
> > The PHB at index 0 is automatically created as the default PHB.
> > 
> > The current set of default locations causes some problems for guests with
> > large RAM (> 1 TiB) or PCI devices with very large BARs (e.g. big nVidia
> > GPGPU cards via VFIO).  Obviously, for migration we can only change the
> > locations on a new machine type, however.
> > 
> > This is awkward, because the placement is currently decided within the
> > spapr-pci-host-bridge code, so it breaks abstraction to look inside the
> > machine type version.
> > 
> > So, this patch delegates the "default mode" PHB placement from the
> > spapr-pci-host-bridge device back to the machine type via a public method
> > in sPAPRMachineClass.  It's still a bit ugly, but it's about the best we
> > can do.
> > 
> > For now, this just changes where the calculation is done.  It doesn't
> > change the actual location of the host bridges, or any other behaviour.
> > 
> > Signed-off-by: David Gibson 
> > ---
> >  hw/ppc/spapr.c  | 34 ++
> >  hw/ppc/spapr_pci.c  | 22 --
> >  include/hw/pci-host/spapr.h | 11 +--
> >  include/hw/ppc/spapr.h  |  4 
> >  4 files changed, 47 insertions(+), 24 deletions(-)
> > 
> > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > index 03e3803..f6e9c2a 100644
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -2370,6 +2370,39 @@ static HotpluggableCPUList 
> > *spapr_query_hotpluggable_cpus(MachineState *machine)
> >  return head;
> >  }
> >  
> > +static void spapr_phb_placement(sPAPRMachineState *spapr, uint32_t index,
> > +uint64_t *buid, hwaddr *pio, hwaddr 
> > *pio_size,
> > +hwaddr *mmio, hwaddr *mmio_size,
> > +unsigned n_dma, uint32_t *liobns, Error 
> > **errp)
> > +{
> > +const uint64_t base_buid = 0x8002000ULL;
> > +const hwaddr phb0_base = 0x100ULL; /* 1 TiB */
> > +const hwaddr phb_spacing = 0x10ULL; /* 64 GiB */
> > +const hwaddr mmio_offset = 0xa000; /* 2 GiB + 512 MiB */
> > +const hwaddr pio_offset = 0x8000; /* 2 GiB */
> > +const uint32_t max_index = 255;
> > +
> > +hwaddr phb_base;
> > +int i;
> > +
> > +if (index > max_index) {
> > +error_setg(errp, "\"index\" for PAPR PHB is too large (max %u)",
> > +   max_index);
> > +return;
> > +}
> > +
> > +*buid = base_buid + index;
> > +for (i = 0; i < n_dma; ++i) {
> > +liobns[i] = SPAPR_PCI_LIOBN(index, i);
> > +}
> > +
> > +phb_base = phb0_base + index * phb_spacing;
> > +*pio = phb_base + pio_offset;
> > +*pio_size = SPAPR_PCI_IO_WIN_SIZE;
> > +*mmio = phb_base + mmio_offset;
> > +*mmio_size = SPAPR_PCI_MMIO_WIN_SIZE;
> 
> sphb->io_win_size (*pio_size) and sphb->mem_win_size (*mmio_size) were
> previously initialized from spapr_phb_properties[], you overwrite these
> values now. Is this what you want?

Uuu... I guess not.

I put the sizes in here because I wanted to have all the configuration
for the common case in one place, rather than split between
phb_placement() and the default properties.

But you're right, it does introduce a real, if small, behaviour
change, which this patch isn't supposed to do.  I'll change it.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH 0/6] qdev class properties + abstract class support on device-list-properties

2016-10-12 Thread Eduardo Habkost
On Tue, Oct 11, 2016 at 02:01:19PM -0700, 
no-re...@ec2-52-6-146-230.compute-1.amazonaws.com wrote:
[...]
>   GTESTER tests/check-qom-proplist
> Found prop 'bv'
> **
> ERROR:/tmp/qemu-test/src/tests/check-qom-proplist.c:521:test_dummy_class_iterator:
>  code should not be reached
> GTester: last random seed: R02Sb911b265d311a4134ab5cd16c99088a0
>   GTESTER tests/test-qemu-opts

This happens because the series depends on the "tests: A few
check-qom-proplist fixes" series I have submitted earlier. If
that series is applied first, this error won't be triggered.

-- 
Eduardo



[Qemu-devel] [PATCH 03/11] spapr: add option vector handling in CAS-generated resets

2016-10-12 Thread Michael Roth
In some cases, ibm,client-architecture-support calls can fail. This
could happen in the current code for situations where the modified
device tree segment exceeds the buffer size provided by the guest
via the call parameters. In these cases, QEMU will reset, allowing
an opportunity to regenerate the device tree from scratch via
boot-time handling. There are potentially other scenarios as well,
not currently reachable in the current code, but possible in theory,
such as cases where device-tree properties or nodes need to be removed.

We currently don't handle either of these properly for option vector
capabilities however. Instead of carrying the negotiated capability
beyond the reset and creating the boot-time device tree accordingly,
we start from scratch, generating the same boot-time device tree as we
did prior to the CAS-generated reset, and the same device tree updates as we
did before. This could (in theory) cause us to get stuck in a reset
loop. This hasn't been observed, but depending on the extensiveness
of CAS-induced device tree updates in the future, could eventually
become an issue.

Address this by pulling capability-related device tree
updates resulting from CAS calls into a common routine,
spapr_populate_cas_updates(), and adding an sPAPROptionVector*
parameter that allows us to test for newly-negotiated capabilities.
We invoke it as follows:

1) When ibm,client-architecture-support gets called, we
   call spapr_populate_cas_updates() with the set of capabilities
   added since the previous call to ibm,client-architecture-support.
   For the initial boot, or a system reset generated by something
   other than the CAS call itself, this set will consist of *all*
   options supported by both the platform and the guest. For calls
   to ibm,client-architecture-support immediately after a CAS-induced
   reset, we call spapr_populate_cas_updates() with only the set
   of capabilities added since the previous call, since the other
   capabilities will have already been addressed by the boot-time
   device-tree this time around. In the unlikely event that
   capabilities are *removed* since the previous CAS, we will
   generate a CAS-induced reset. In the unlikely event that we
   cannot fit the device-tree updates into the buffer provided
   by the guest, we'll generate a CAS-induced reset.

2) When a CAS update results in the need to reset the machine and
   include the updates in the boot-time device tree, we call the
   spapr_populate_cas_updates() using the full set of negotiated
   capabilities as part of the reset path. At initial boot, or after
   a reset generated by something other than the CAS call itself,
   this set will be empty, resulting in what should be the same
   boot-time device-tree as we generated prior to this patch. For
   CAS-induced reset, this routine will be called with the full set of
   capabilities negotiated by the platform/guest in the previous
   CAS call, which should result in CAS updates from previous call
   being accounted for in the initial boot-time device tree.

Signed-off-by: Michael Roth 
---
 hw/ppc/spapr.c | 43 ++-
 hw/ppc/spapr_hcall.c   | 22 ++
 include/hw/ppc/spapr.h |  4 +++-
 3 files changed, 55 insertions(+), 14 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 934d6b2..460c7a8 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -854,13 +854,28 @@ out:
 return ret;
 }
 
+static int spapr_populate_cas_updates(sPAPRMachineState *spapr, void *fdt,
+  sPAPROptionVector *ov5_updates)
+{
+sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
+int ret = 0;
+
+/* Generate ibm,dynamic-reconfiguration-memory node if required */
+if (spapr_ovec_test(ov5_updates, OV5_DRCONF_MEMORY)) {
+g_assert(smc->dr_lmb_enabled);
+ret = spapr_populate_drconf_memory(spapr, fdt);
+}
+
+return ret;
+}
+
 int spapr_h_cas_compose_response(sPAPRMachineState *spapr,
  target_ulong addr, target_ulong size,
- bool cpu_update)
+ bool cpu_update,
+ sPAPROptionVector *ov5_updates)
 {
 void *fdt, *fdt_skel;
 sPAPRDeviceTreeUpdateHeader hdr = { .version_id = 1 };
-sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(qdev_get_machine());
 
 size -= sizeof(hdr);
 
@@ -879,11 +894,7 @@ int spapr_h_cas_compose_response(sPAPRMachineState *spapr,
 _FDT((spapr_fixup_cpu_dt(fdt, spapr)));
 }
 
-/* Generate ibm,dynamic-reconfiguration-memory node if required */
-if (spapr_ovec_test(spapr->ov5_cas, OV5_DRCONF_MEMORY)) {
-g_assert(smc->dr_lmb_enabled);
-_FDT((spapr_populate_drconf_memory(spapr, fdt)));
-}
+spapr_populate_cas_updates(spapr, fdt, ov5_updates);
 
 /* Pack resulting tree */
 _FDT((fdt_pack(fdt)));
@@ -904,7 +915,8 @@ 

[Qemu-devel] [PATCH 07/11] spapr: add hotplug interrupt machine options

2016-10-12 Thread Michael Roth
This adds machine options of the form:

  -machine pseries,legacy-hotplug-events=true
  -machine pseries,legacy-hotplug-events=false

to denote whether or not we wish to force the use of "legacy" style
hotplug events, which are surfaced through EPOW interrupts instead of
a dedicated interrupt source, and lack certain features necessary,
mainly, for memory unplug support.

If false, QEMU will default to "legacy" style unless the guest
advertises support for the newer events via
ibm,client-architecture-support hcall during early boot.

For pseries-2.7 and earlier we default to true, for newer machine
types we default to false.

Signed-off-by: Michael Roth 
---
 hw/ppc/spapr.c  | 31 +++
 include/hw/ppc/spapr.h  |  1 +
 include/hw/ppc/spapr_ovec.h |  1 +
 3 files changed, 33 insertions(+)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index f8cde92..d80a6fa 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1816,6 +1816,11 @@ static void ppc_spapr_init(MachineState *machine)
 
 spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);
 
+/* use dedicated HP event source if guest supports it */
+if (spapr->use_hotplug_event_source) {
+spapr_ovec_set(spapr->ov5, OV5_HP_EVT);
+}
+
 /* init CPUs */
 if (machine->cpu_model == NULL) {
 machine->cpu_model = kvm_enabled() ? "host" : smc->tcg_default_cpu;
@@ -2172,16 +2177,39 @@ static void spapr_set_kvm_type(Object *obj, const char 
*value, Error **errp)
 spapr->kvm_type = g_strdup(value);
 }
 
+static bool spapr_get_legacy_hotplug_events(Object *obj, Error **errp)
+{
+sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
+
+return !spapr->use_hotplug_event_source;
+}
+
+static void spapr_set_legacy_hotplug_events(Object *obj, bool value,
+Error **errp)
+{
+sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
+
+spapr->use_hotplug_event_source = !value;
+}
+
 static void spapr_machine_initfn(Object *obj)
 {
 sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
 
 spapr->htab_fd = -1;
+spapr->use_hotplug_event_source = true;
 object_property_add_str(obj, "kvm-type",
 spapr_get_kvm_type, spapr_set_kvm_type, NULL);
 object_property_set_description(obj, "kvm-type",
 "Specifies the KVM virtualization mode 
(HV, PR)",
 NULL);
+object_property_add_bool(obj, "legacy-hotplug-events",
+spapr_get_legacy_hotplug_events,
+spapr_set_legacy_hotplug_events,
+NULL);
+object_property_set_description(obj, "legacy-hotplug-events",
+"Use deprecated EPOW mechanism for hotplug 
events",
+NULL);
 }
 
 static void spapr_machine_finalizefn(Object *obj)
@@ -2518,6 +2546,9 @@ DEFINE_SPAPR_MACHINE(2_8, "2.8", true);
 
 static void spapr_machine_2_7_instance_options(MachineState *machine)
 {
+sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
+
+spapr->use_hotplug_event_source = false;
 }
 
 static void spapr_machine_2_7_class_options(MachineClass *mc)
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 27a3328..d1a4a14 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -74,6 +74,7 @@ struct sPAPRMachineState {
 uint32_t check_exception_irq;
 Notifier epow_notifier;
 QTAILQ_HEAD(, sPAPREventLogEntry) pending_events;
+bool use_hotplug_event_source;
 
 /* Migration state */
 int htab_save_index;
diff --git a/include/hw/ppc/spapr_ovec.h b/include/hw/ppc/spapr_ovec.h
index 47fa04c..92167c6 100644
--- a/include/hw/ppc/spapr_ovec.h
+++ b/include/hw/ppc/spapr_ovec.h
@@ -45,6 +45,7 @@ typedef struct sPAPROptionVector sPAPROptionVector;
 /* option vector 5 */
 #define OV5_DRCONF_MEMORY   OV_BIT(2, 2)
 #define OV5_FORM1_AFFINITY  OV_BIT(5, 0)
+#define OV5_HP_EVT  OV_BIT(6, 5)
 
 /* interfaces */
 sPAPROptionVector *spapr_ovec_new(void);
-- 
1.9.1




[Qemu-devel] [PATCH 09/11] spapr: Add DRC count indexed hotplug identifier type

2016-10-12 Thread Michael Roth
From: Bharata B Rao 

Add support for DRC count indexed hotplug ID type which is primarily
needed for memory hot unplug. This type allows for specifying the
number of DRs that should be plugged/unplugged starting from a given
DRC index.

Signed-off-by: Bharata B Rao 
* updated rtas_event_log_v6_hp to reflect count/index field ordering
  used in PAPR hotplug ACR
Signed-off-by: Michael Roth 
---
 hw/ppc/spapr_events.c  | 74 --
 include/hw/ppc/spapr.h |  4 +++
 2 files changed, 63 insertions(+), 15 deletions(-)

diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index f8bbec6..eeca800 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -175,6 +175,16 @@ struct epow_log_full {
 struct rtas_event_log_v6_epow epow;
 } QEMU_PACKED;
 
+union drc_identifier {
+uint32_t index;
+uint32_t count;
+struct {
+uint32_t count;
+uint32_t index;
+} count_indexed;
+char name[1];
+} QEMU_PACKED;
+
 struct rtas_event_log_v6_hp {
 #define RTAS_LOG_V6_SECTION_ID_HOTPLUG  0x4850 /* HP */
 struct rtas_event_log_v6_section_header hdr;
@@ -191,12 +201,9 @@ struct rtas_event_log_v6_hp {
 #define RTAS_LOG_V6_HP_ID_DRC_NAME   1
 #define RTAS_LOG_V6_HP_ID_DRC_INDEX  2
 #define RTAS_LOG_V6_HP_ID_DRC_COUNT  3
+#define RTAS_LOG_V6_HP_ID_DRC_COUNT_INDEXED  4
 uint8_t reserved;
-union {
-uint32_t index;
-uint32_t count;
-char name[1];
-} drc;
+union drc_identifier drc_id;
 } QEMU_PACKED;
 
 struct hp_log_full {
@@ -457,7 +464,7 @@ static void spapr_hotplug_set_signalled(uint32_t drc_index)
 
 static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
 sPAPRDRConnectorType drc_type,
-uint32_t drc)
+union drc_identifier *drc_id)
 {
 sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
 struct hp_log_full *new_hp;
@@ -502,7 +509,7 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t 
hp_action,
 case SPAPR_DR_CONNECTOR_TYPE_PCI:
 hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_PCI;
 if (hp->hotplug_action == RTAS_LOG_V6_HP_ACTION_ADD) {
-spapr_hotplug_set_signalled(drc);
+spapr_hotplug_set_signalled(drc_id->index);
 }
 break;
 case SPAPR_DR_CONNECTOR_TYPE_LMB:
@@ -520,9 +527,16 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t 
hp_action,
 }
 
 if (hp_id == RTAS_LOG_V6_HP_ID_DRC_COUNT) {
-hp->drc.count = cpu_to_be32(drc);
+hp->drc_id.count = cpu_to_be32(drc_id->count);
 } else if (hp_id == RTAS_LOG_V6_HP_ID_DRC_INDEX) {
-hp->drc.index = cpu_to_be32(drc);
+hp->drc_id.index = cpu_to_be32(drc_id->index);
+} else if (hp_id == RTAS_LOG_V6_HP_ID_DRC_COUNT_INDEXED) {
+/* we should not be using count_indexed value unless the guest
+ * supports dedicated hotplug event source
+ */
+g_assert(spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT));
+hp->drc_id.count_indexed.count = 
cpu_to_be32(drc_id->count_indexed.count);
+hp->drc_id.count_indexed.index = 
cpu_to_be32(drc_id->count_indexed.index);
 }
 
 rtas_event_log_queue(RTAS_LOG_TYPE_HOTPLUG, new_hp, true);
@@ -535,34 +549,64 @@ void spapr_hotplug_req_add_by_index(sPAPRDRConnector *drc)
 {
 sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
 sPAPRDRConnectorType drc_type = drck->get_type(drc);
-uint32_t index = drck->get_index(drc);
+union drc_identifier drc_id;
 
+drc_id.index = drck->get_index(drc);
 spapr_hotplug_req_event(RTAS_LOG_V6_HP_ID_DRC_INDEX,
-RTAS_LOG_V6_HP_ACTION_ADD, drc_type, index);
+RTAS_LOG_V6_HP_ACTION_ADD, drc_type, _id);
 }
 
 void spapr_hotplug_req_remove_by_index(sPAPRDRConnector *drc)
 {
 sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
 sPAPRDRConnectorType drc_type = drck->get_type(drc);
-uint32_t index = drck->get_index(drc);
+union drc_identifier drc_id;
 
+drc_id.index = drck->get_index(drc);
 spapr_hotplug_req_event(RTAS_LOG_V6_HP_ID_DRC_INDEX,
-RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, index);
+RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, _id);
 }
 
 void spapr_hotplug_req_add_by_count(sPAPRDRConnectorType drc_type,
uint32_t count)
 {
+union drc_identifier drc_id;
+
+drc_id.count = count;
 spapr_hotplug_req_event(RTAS_LOG_V6_HP_ID_DRC_COUNT,
-RTAS_LOG_V6_HP_ACTION_ADD, drc_type, count);
+RTAS_LOG_V6_HP_ACTION_ADD, drc_type, _id);
 }
 
 void 

[Qemu-devel] [PATCH 08/11] spapr_events: add support for dedicated hotplug event source

2016-10-12 Thread Michael Roth
Hotplug events were previously delivered using an EPOW interrupt
and were queued by linux guests into a circular buffer. For traditional
EPOW events like shutdown/resets, this isn't an issue, but for hotplug
events there are cases where this buffer can be exhausted, resulting
in the loss of hotplug events, resets, etc.

Newer-style hotplug events are delivered using a dedicated event source.
We enable this in supported guests by adding an additional standard
event source in the guest device-tree via /event-sources, and, if
the guest advertises support for the newer-style hotplug events,
using the corresponding interrupt to signal the availability of
hotplug/unplug events.

Signed-off-by: Michael Roth 
---
 hw/ppc/spapr.c |  10 ++--
 hw/ppc/spapr_events.c  | 148 ++---
 include/hw/ppc/spapr.h |   3 +-
 3 files changed, 120 insertions(+), 41 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index d80a6fa..2037222 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -275,8 +275,7 @@ static void *spapr_create_fdt_skel(hwaddr initrd_base,
hwaddr initrd_size,
hwaddr kernel_size,
bool little_endian,
-   const char *kernel_cmdline,
-   uint32_t epow_irq)
+   const char *kernel_cmdline)
 {
 void *fdt;
 uint32_t start_prop = cpu_to_be32(initrd_base);
@@ -437,7 +436,7 @@ static void *spapr_create_fdt_skel(hwaddr initrd_base,
 _FDT((fdt_end_node(fdt)));
 
 /* event-sources */
-spapr_events_fdt_skel(fdt, epow_irq);
+spapr_events_fdt_skel(fdt);
 
 /* /hypervisor node */
 if (kvm_enabled()) {
@@ -1944,7 +1943,7 @@ static void ppc_spapr_init(MachineState *machine)
 }
 g_free(filename);
 
-/* Set up EPOW events infrastructure */
+/* Set up RTAS event infrastructure */
 spapr_events_init(spapr);
 
 /* Set up the RTC RTAS interfaces */
@@ -2076,8 +2075,7 @@ static void ppc_spapr_init(MachineState *machine)
 /* Prepare the device tree */
 spapr->fdt_skel = spapr_create_fdt_skel(initrd_base, initrd_size,
 kernel_size, kernel_le,
-kernel_cmdline,
-spapr->check_exception_irq);
+kernel_cmdline);
 assert(spapr->fdt_skel != NULL);
 
 /* used by RTAS */
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index 4c7b6ae..f8bbec6 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -40,6 +40,7 @@
 #include "hw/ppc/spapr_drc.h"
 #include "qemu/help_option.h"
 #include "qemu/bcd.h"
+#include "hw/ppc/spapr_ovec.h"
 #include 
 
 struct rtas_error_log {
@@ -206,28 +207,104 @@ struct hp_log_full {
 struct rtas_event_log_v6_hp hp;
 } QEMU_PACKED;
 
-#define EVENT_MASK_INTERNAL_ERRORS   0x8000
-#define EVENT_MASK_EPOW  0x4000
-#define EVENT_MASK_HOTPLUG   0x1000
-#define EVENT_MASK_IO0x0800
+typedef enum EventClassIndex {
+EVENT_CLASS_INTERNAL_ERRORS = 0,
+EVENT_CLASS_EPOW= 1,
+EVENT_CLASS_RESERVED= 2,
+EVENT_CLASS_HOT_PLUG= 3,
+EVENT_CLASS_IO  = 4,
+EVENT_CLASS_MAX
+} EventClassIndex;
+
+#define EVENT_CLASS_MASK(index) (1 << (31 - index))
+
+typedef struct EventSource {
+const char *name;
+int irq;
+uint32_t mask;
+bool enabled;
+} EventSource;
+
+static EventSource event_source[EVENT_CLASS_MAX] = {
+[EVENT_CLASS_INTERNAL_ERRORS]   = { .name = "internal-errors", },
+[EVENT_CLASS_EPOW]  = { .name = "epow-events", },
+[EVENT_CLASS_HOT_PLUG]  = { .name = "hot-plug-events", },
+[EVENT_CLASS_IO]= { .name = "ibm,io-events", },
+};
+
+static void rtas_event_source_register(EventClassIndex index, int irq)
+{
+/* we only support 1 irq per event class at the moment */
+g_assert(!event_source[index].enabled);
+event_source[index].irq = irq;
+event_source[index].mask = EVENT_CLASS_MASK(index);
+event_source[index].enabled = true;
+}
 
-void spapr_events_fdt_skel(void *fdt, uint32_t check_exception_irq)
+void spapr_events_fdt_skel(void *fdt)
 {
-uint32_t irq_ranges[] = {cpu_to_be32(check_exception_irq), cpu_to_be32(1)};
-uint32_t interrupts[] = {cpu_to_be32(check_exception_irq), 0};
+uint32_t irq_ranges[EVENT_CLASS_MAX * 2];
+int i, count = 0;
 
 _FDT((fdt_begin_node(fdt, "event-sources")));
 
+for (i = 0, count = 0; i < EVENT_CLASS_MAX; i++) {
+/* TODO: what does 0 entail? */
+uint32_t interrupts[] = { cpu_to_be32(event_source[i].irq), 0 };
+
+if (!event_source[i].enabled) {
+

[Qemu-devel] [PATCH 06/11] spapr: update spapr hotplug documentation

2016-10-12 Thread Michael Roth
This updates the existing documentation to reflect recent updates to
the hotplug event structure, which are in draft form but slated
for inclusion in PAPR/LoPAPR.

Signed-off-by: Michael Roth 
---
 docs/specs/ppc-spapr-hotplug.txt | 55 +---
 1 file changed, 46 insertions(+), 9 deletions(-)

diff --git a/docs/specs/ppc-spapr-hotplug.txt b/docs/specs/ppc-spapr-hotplug.txt
index 631b0ca..f57e2a0 100644
--- a/docs/specs/ppc-spapr-hotplug.txt
+++ b/docs/specs/ppc-spapr-hotplug.txt
@@ -233,12 +233,27 @@ tools by host-level management such as an HMC. This level 
of management is not
 applicable to PowerKVM, hence the reason for extending the notification
 framework to support hotplug events.
 
-Note that these events are not yet formally part of the PAPR+ specification,
-but support for this format has already been implemented in DR-related
-guest tools such as powerpc-utils/librtas, as well as kernel patches that have
-been submitted to handle in-kernel processing of memory/cpu-related hotplug
-events[1], and is planned for formal inclusion is PAPR+ specification. The
-hotplug-specific payload is QEMU implemented as follows (with all values
+The format for these EPOW-signalled events is described below under
+"hotplug/unplug event structure". Note that these events are not
+formally part of the PAPR+ specification, and have been superseded by a
+newer format, also described below under "hotplug/unplug event structure",
+and so are now deemed a "legacy" format. The formats are similar, but the
+"modern" format contains additional fields/flags, which are denoted for the
+purposes of this documentation with "#ifdef GUEST_SUPPORTS_MODERN" guards.
+
+QEMU should assume support only for "legacy" fields/flags unless the guest
+advertises support for the "modern" format via ibm,client-architecture-support
+hcall by setting byte 5, bit 6 of its ibm,architecture-vec-5 option vector
+structure (as described by LoPAPR v11, B.6.2.3). As with "legacy" format 
events,
+"modern" format events are surfaced to the guest via check-exception RTAS 
calls,
+but use a dedicated event source to signal the guest. This event source is
+advertised to the guest by the addition of a "hot-plug-events" node under
+"/event-sources" node of the guest's device tree using the standard format
+described in LoPAPR v11, B.6.12.1.
+
+== hotplug/unplug event structure ==
+
+The hotplug-specific payload in QEMU is implemented as follows (with all values
 encoded in big-endian format):
 
 struct rtas_event_log_v6_hp {
@@ -263,14 +278,23 @@ struct rtas_event_log_v6_hp {
 #define RTAS_LOG_V6_HP_ACTION_ADD   1
 #define RTAS_LOG_V6_HP_ACTION_REMOVE2
 uint8_t hotplug_action; /* action (add/remove) */
-#define RTAS_LOG_V6_HP_ID_DRC_NAME  1
-#define RTAS_LOG_V6_HP_ID_DRC_INDEX 2
-#define RTAS_LOG_V6_HP_ID_DRC_COUNT 3
+#define RTAS_LOG_V6_HP_ID_DRC_NAME  1
+#define RTAS_LOG_V6_HP_ID_DRC_INDEX 2
+#define RTAS_LOG_V6_HP_ID_DRC_COUNT 3
+#ifdef GUEST_SUPPORTS_MODERN
+#define RTAS_LOG_V6_HP_ID_DRC_COUNT_INDEXED 4
+#endif
 uint8_t hotplug_identifier; /* type of the resource identifier,
  * which serves as the discriminator
  * for the 'drc' union field below
  */
+#ifdef GUEST_SUPPORTS_MODERN
+uint8_t capabilities;   /* capability flags, currently unused
+ * by QEMU
+ */
+#else
 uint8_t reserved;
+#endif
 union {
 uint32_t index; /* DRC index of resource to take action
  * on
@@ -278,6 +302,19 @@ struct rtas_event_log_v6_hp {
 uint32_t count; /* number of DR resources to take
  * action on (guest chooses which)
  */
+#ifdef GUEST_SUPPORTS_MODERN
+struct {
+uint32_t count; /* number of DR resources to take
+ * action on
+ */
+uint32_t index; /* DRC index of first resource to take
+ * action on. guest will take action
+ * on DRC index <index> through
+ * DRC index <index + count - 1> in
+ * sequential order
+ */
+} count_indexed;
+#endif
 char name[1];   /* string representing the name of the
  * DRC to take action on
  */
-- 
1.9.1




[Qemu-devel] [PATCH 05/11] spapr: fix inheritance chain for default machine options

2016-10-12 Thread Michael Roth
Rather than machine instances having backward-compatible option
defaults that need to be repeatedly re-enabled for every new machine
type we introduce, we set the defaults appropriate for newer machine
types, then add code to explicitly disable instance options as needed
to maintain compatibility with older machine types.

Currently pseries-2.5 does not inherit from pseries-2.6 in this
fashion, which is okay at the moment since we do not have any
instance compatibility options for pseries-2.6+ currently.

We will make use of this in future patches though, so fix it here.

Signed-off-by: Michael Roth 
---
 hw/ppc/spapr.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 3b2a459..f8cde92 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2544,6 +2544,7 @@ DEFINE_SPAPR_MACHINE(2_7, "2.7", false);
 
 static void spapr_machine_2_6_instance_options(MachineState *machine)
 {
+spapr_machine_2_7_instance_options(machine);
 }
 
 static void spapr_machine_2_6_class_options(MachineClass *mc)
@@ -2568,6 +2569,7 @@ DEFINE_SPAPR_MACHINE(2_6, "2.6", false);
 
 static void spapr_machine_2_5_instance_options(MachineState *machine)
 {
+spapr_machine_2_6_instance_options(machine);
 }
 
 static void spapr_machine_2_5_class_options(MachineClass *mc)
-- 
1.9.1




[Qemu-devel] [PATCH 04/11] spapr: improve ibm, architecture-vec-5 property handling

2016-10-12 Thread Michael Roth
ibm,architecture-vec-5 is supposed to encode all option vector 5 bits
negotiated between platform/guest. Currently we hardcode this property
in the boot-time device tree to advertise a single negotiated
capability, "Form 1" NUMA Affinity, regardless of whether or not CAS
has been invoked or that capability has actually been negotiated.

Improve this by generating ibm,architecture-vec-5 based on the full
set of option vector 5 capabilities negotiated via CAS.

Signed-off-by: Michael Roth 
---
 hw/ppc/spapr.c  | 22 +-
 include/hw/ppc/spapr_ovec.h |  1 +
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 460c7a8..3b2a459 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -285,7 +285,6 @@ static void *spapr_create_fdt_skel(hwaddr initrd_base,
 GString *qemu_hypertas = g_string_sized_new(256);
 uint32_t refpoints[] = {cpu_to_be32(0x4), cpu_to_be32(0x4)};
 uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(max_cpus)};
-unsigned char vec5[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x80};
 char *buf;
 
 add_str(hypertas, "hcall-pft");
@@ -351,9 +350,6 @@ static void *spapr_create_fdt_skel(hwaddr initrd_base,
 /* /chosen */
 _FDT((fdt_begin_node(fdt, "chosen")));
 
-/* Set Form1_affinity */
-_FDT((fdt_property(fdt, "ibm,architecture-vec-5", vec5, sizeof(vec5;
-
 _FDT((fdt_property_string(fdt, "bootargs", kernel_cmdline)));
 _FDT((fdt_property(fdt, "linux,initrd-start",
_prop, sizeof(start_prop;
@@ -858,14 +854,28 @@ static int spapr_populate_cas_updates(sPAPRMachineState 
*spapr, void *fdt,
   sPAPROptionVector *ov5_updates)
 {
 sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
-int ret = 0;
+int ret = 0, offset;
 
 /* Generate ibm,dynamic-reconfiguration-memory node if required */
 if (spapr_ovec_test(ov5_updates, OV5_DRCONF_MEMORY)) {
 g_assert(smc->dr_lmb_enabled);
 ret = spapr_populate_drconf_memory(spapr, fdt);
+if (ret) {
+goto out;
+}
 }
 
+offset = fdt_path_offset(fdt, "/chosen");
+if (offset < 0) {
+offset = fdt_add_subnode(fdt, 0, "chosen");
+if (offset < 0) {
+return offset;
+}
+}
+ret = spapr_ovec_populate_dt(fdt, offset, spapr->ov5_cas,
+ "ibm,architecture-vec-5");
+
+out:
 return ret;
 }
 
@@ -1804,6 +1814,8 @@ static void ppc_spapr_init(MachineState *machine)
 spapr_validate_node_memory(machine, _fatal);
 }
 
+spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);
+
 /* init CPUs */
 if (machine->cpu_model == NULL) {
 machine->cpu_model = kvm_enabled() ? "host" : smc->tcg_default_cpu;
diff --git a/include/hw/ppc/spapr_ovec.h b/include/hw/ppc/spapr_ovec.h
index 09afd59..47fa04c 100644
--- a/include/hw/ppc/spapr_ovec.h
+++ b/include/hw/ppc/spapr_ovec.h
@@ -44,6 +44,7 @@ typedef struct sPAPROptionVector sPAPROptionVector;
 
 /* option vector 5 */
 #define OV5_DRCONF_MEMORY   OV_BIT(2, 2)
+#define OV5_FORM1_AFFINITY  OV_BIT(5, 0)
 
 /* interfaces */
 sPAPROptionVector *spapr_ovec_new(void);
-- 
1.9.1




[Qemu-devel] [PATCH 01/11] spapr_ovec: initial implementation of option vector helpers

2016-10-12 Thread Michael Roth
PAPR guests advertise their capabilities to the platform by passing
an ibm,architecture-vec structure via an
ibm,client-architecture-support hcall as described by LoPAPR v11,
B.6.2.3. during early boot.

Using this information, the platform enables the capabilities it
supports, then encodes a subset of those enabled capabilities (the
5th option vector of the ibm,architecture-vec structure passed to
ibm,client-architecture-support) into the guest device tree via
"/chosen/ibm,architecture-vec-5".

The logical format of these option vectors is a bit-vector,
where individual bits are addressed/documented based on the byte-wise
offset from the beginning of the bit-vector, followed by the bit-wise
index starting from the byte-wise offset. Thus the bits of each of
these bytes are stored in reverse order. Additionally, the first
byte of each option vector encodes the length of the option vector,
so byte offsets begin at 1, and bit offsets at 0.

This is not very intuitive for the purposes of mapping these bits to
a particular documented capability, so this patch introduces a set
of abstractions that encapsulate the work of parsing/encoding these
options vectors and testing for individual capabilities.

Cc: Bharata B Rao 
Signed-off-by: Michael Roth 
---
 hw/ppc/Makefile.objs|   2 +-
 hw/ppc/spapr_ovec.c | 244 
 include/hw/ppc/spapr_ovec.h |  62 +++
 3 files changed, 307 insertions(+), 1 deletion(-)
 create mode 100644 hw/ppc/spapr_ovec.c
 create mode 100644 include/hw/ppc/spapr_ovec.h

diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index 99a0d4e..2e0b0c9 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -4,7 +4,7 @@ obj-y += ppc.o ppc_booke.o fdt.o
 obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o
 obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
 obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
-obj-$(CONFIG_PSERIES) += spapr_cpu_core.o
+obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o
 ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
 obj-y += spapr_pci_vfio.o
 endif
diff --git a/hw/ppc/spapr_ovec.c b/hw/ppc/spapr_ovec.c
new file mode 100644
index 000..ddc19f5
--- /dev/null
+++ b/hw/ppc/spapr_ovec.c
@@ -0,0 +1,244 @@
+/*
+ * QEMU SPAPR Architecture Option Vector Helper Functions
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Authors:
+ *  Bharata B Rao 
+ *  Michael Roth  
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/ppc/spapr_ovec.h"
+#include "qemu/bitmap.h"
+#include "exec/address-spaces.h"
+#include "qemu/error-report.h"
+#include 
+
+/* #define DEBUG_SPAPR_OVEC */
+
+#ifdef DEBUG_SPAPR_OVEC
+#define DPRINTFN(fmt, ...) \
+do { fprintf(stderr, fmt "\n", ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTFN(fmt, ...) \
+do { } while (0)
+#endif
+
+#define OV_MAXBYTES 256 /* not including length byte */
+#define OV_MAXBITS (OV_MAXBYTES * BITS_PER_BYTE)
+
+/* we *could* work with bitmaps directly, but handling the bitmap privately
+ * allows us to more safely make assumptions about the bitmap size and
+ * simplify the calling code somewhat
+ */
+struct sPAPROptionVector {
+unsigned long *bitmap;
+};
+
+static sPAPROptionVector *spapr_ovec_from_bitmap(unsigned long *bitmap)
+{
+sPAPROptionVector *ov;
+
+g_assert(bitmap);
+
+ov = g_new0(sPAPROptionVector, 1);
+ov->bitmap = bitmap;
+
+return ov;
+}
+
+sPAPROptionVector *spapr_ovec_new(void)
+{
+return spapr_ovec_from_bitmap(bitmap_new(OV_MAXBITS));
+}
+
+sPAPROptionVector *spapr_ovec_clone(sPAPROptionVector *ov_orig)
+{
+sPAPROptionVector *ov;
+
+g_assert(ov_orig);
+
+ov = spapr_ovec_new();
+bitmap_copy(ov->bitmap, ov_orig->bitmap, OV_MAXBITS);
+
+return ov;
+}
+
+void spapr_ovec_intersect(sPAPROptionVector *ov,
+  sPAPROptionVector *ov1,
+  sPAPROptionVector *ov2)
+{
+g_assert(ov);
+g_assert(ov1);
+g_assert(ov2);
+
+bitmap_and(ov->bitmap, ov1->bitmap, ov2->bitmap, OV_MAXBITS);
+}
+
+/* returns true if options bits were removed, false otherwise */
+bool spapr_ovec_diff(sPAPROptionVector *ov,
+ sPAPROptionVector *ov_old,
+ sPAPROptionVector *ov_new)
+{
+unsigned long *change_mask = bitmap_new(OV_MAXBITS);
+unsigned long *removed_bits = bitmap_new(OV_MAXBITS);
+bool bits_were_removed = false;
+
+g_assert(ov);
+g_assert(ov_old);
+g_assert(ov_new);
+
+bitmap_xor(change_mask, ov_old->bitmap, ov_new->bitmap, OV_MAXBITS);
+bitmap_and(ov->bitmap, ov_new->bitmap, change_mask, OV_MAXBITS);
+bitmap_and(removed_bits, ov_old->bitmap, change_mask, OV_MAXBITS);
+
+if 

[Qemu-devel] [PATCH 02/11] spapr_hcall: use spapr_ovec_* interfaces for CAS options

2016-10-12 Thread Michael Roth
Currently we access individual bytes of an option vector via
ldub_phys() to test for the presence of a particular capability
within that byte. Currently this is only done for the "dynamic
reconfiguration memory" capability bit. If that bit is present,
we pass a boolean value to spapr_h_cas_compose_response()
to generate a modified device tree segment with the additional
properties required to enable this functionality.

As more capability bits are added, we would need to modify the
code to add additional option vector accesses and extend the
param list for spapr_h_cas_compose_response() to include similar
boolean values for these parameters.

Avoid this by switching to spapr_ovec_* helpers so we can do all
the parsing in one shot and then test for these additional bits
within spapr_h_cas_compose_response() directly.

Cc: Bharata B Rao 
Signed-off-by: Michael Roth 
---
 hw/ppc/spapr.c  | 10 ++--
 hw/ppc/spapr_hcall.c| 56 -
 include/hw/ppc/spapr.h  |  5 +++-
 include/hw/ppc/spapr_ovec.h |  3 +++
 4 files changed, 30 insertions(+), 44 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 03e3803..934d6b2 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -856,7 +856,7 @@ out:
 
 int spapr_h_cas_compose_response(sPAPRMachineState *spapr,
  target_ulong addr, target_ulong size,
- bool cpu_update, bool memory_update)
+ bool cpu_update)
 {
 void *fdt, *fdt_skel;
 sPAPRDeviceTreeUpdateHeader hdr = { .version_id = 1 };
@@ -880,7 +880,8 @@ int spapr_h_cas_compose_response(sPAPRMachineState *spapr,
 }
 
 /* Generate ibm,dynamic-reconfiguration-memory node if required */
-if (memory_update && smc->dr_lmb_enabled) {
+if (spapr_ovec_test(spapr->ov5_cas, OV5_DRCONF_MEMORY)) {
+g_assert(smc->dr_lmb_enabled);
 _FDT((spapr_populate_drconf_memory(spapr, fdt)));
 }
 
@@ -1769,7 +1770,12 @@ static void ppc_spapr_init(MachineState *machine)
DIV_ROUND_UP(max_cpus * smt, smp_threads),
XICS_IRQS_SPAPR, _fatal);
 
+/* Set up containers for ibm,client-set-architecture negotiated options */
+spapr->ov5 = spapr_ovec_new();
+spapr->ov5_cas = spapr_ovec_new();
+
 if (smc->dr_lmb_enabled) {
+spapr_ovec_set(spapr->ov5, OV5_DRCONF_MEMORY);
 spapr_validate_node_memory(machine, _fatal);
 }
 
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index c5e7e8c..f1d081b 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -11,6 +11,7 @@
 #include "trace.h"
 #include "sysemu/kvm.h"
 #include "kvm_ppc.h"
+#include "hw/ppc/spapr_ovec.h"
 
 struct SPRSyncState {
 int spr;
@@ -880,32 +881,6 @@ static target_ulong h_set_mode(PowerPCCPU *cpu, 
sPAPRMachineState *spapr,
 return ret;
 }
 
-/*
- * Return the offset to the requested option vector @vector in the
- * option vector table @table.
- */
-static target_ulong cas_get_option_vector(int vector, target_ulong table)
-{
-int i;
-char nr_vectors, nr_entries;
-
-if (!table) {
-return 0;
-}
-
-nr_vectors = (ldl_phys(_space_memory, table) >> 24) + 1;
-if (!vector || vector > nr_vectors) {
-return 0;
-}
-table++; /* skip nr option vectors */
-
-for (i = 0; i < vector - 1; i++) {
-nr_entries = ldl_phys(_space_memory, table) >> 24;
-table += nr_entries + 2;
-}
-return table;
-}
-
 typedef struct {
 uint32_t cpu_version;
 Error *err;
@@ -961,23 +936,21 @@ static void cas_handle_compat_cpu(PowerPCCPUClass *pcc, 
uint32_t pvr,
 }
 }
 
-#define OV5_DRCONF_MEMORY 0x20
-
 static target_ulong h_client_architecture_support(PowerPCCPU *cpu_,
   sPAPRMachineState *spapr,
   target_ulong opcode,
   target_ulong *args)
 {
 target_ulong list = ppc64_phys_to_real(args[0]);
-target_ulong ov_table, ov5;
+target_ulong ov_table;
 PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu_);
 CPUState *cs;
-bool cpu_match = false, cpu_update = true, memory_update = false;
+bool cpu_match = false, cpu_update = true;
 unsigned old_cpu_version = cpu_->cpu_version;
 unsigned compat_lvl = 0, cpu_version = 0;
 unsigned max_lvl = get_compat_level(cpu_->max_compat);
 int counter;
-char ov5_byte2;
+sPAPROptionVector *ov5_guest;
 
 /* Parse PVR list */
 for (counter = 0; counter < 512; ++counter) {
@@ -1033,19 +1006,20 @@ static target_ulong 
h_client_architecture_support(PowerPCCPU *cpu_,
 /* For the future use: here @ov_table points to the first option vector */
 ov_table = list;
 
-ov5 = cas_get_option_vector(5, ov_table);
-if (!ov5) {
- 

[Qemu-devel] [PATCH 10/11] spapr: use count+index for memory hotplug

2016-10-12 Thread Michael Roth
Commit 0a417869:

spapr: Move memory hotplug to RTAS_LOG_V6_HP_ID_DRC_COUNT type

dropped per-DRC/per-LMB hotplug events in favor of a bulk add via a
single LMB count value. This was to avoid overrunning the guest EPOW
event queue with hotplug events. This works fine, but relies on the
guest exhaustively scanning for pluggable LMBs to satisfy the
requested count by issuing rtas-get-sensor(DR_ENTITY_SENSE, ...) calls
until all the LMBs associated with the DIMM are identified.

With newer support for dedicated hotplug event source, this queue
exhaustion is no longer as much of an issue due to implementation
details on the guest side, but we still try to avoid excessive hotplug
events by now supporting both a count and a starting index to avoid
unnecessary work. This patch makes use of that approach when the
capability is available.

Cc: bhar...@linux.vnet.ibm.com
Signed-off-by: Michael Roth 
---
 hw/ppc/spapr.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 2037222..9af4268 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2232,14 +2232,16 @@ static void spapr_nmi(NMIState *n, int cpu_index, Error 
**errp)
 }
 }
 
-static void spapr_add_lmbs(DeviceState *dev, uint64_t addr, uint64_t size,
-   uint32_t node, Error **errp)
+static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t 
size,
+   uint32_t node, bool dedicated_hp_event_source,
+   Error **errp)
 {
 sPAPRDRConnector *drc;
 sPAPRDRConnectorClass *drck;
 uint32_t nr_lmbs = size/SPAPR_MEMORY_BLOCK_SIZE;
 int i, fdt_offset, fdt_size;
 void *fdt;
+uint64_t addr = addr_start;
 
 for (i = 0; i < nr_lmbs; i++) {
 drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_LMB,
@@ -2258,7 +2260,16 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t 
addr, uint64_t size,
  * guest only in case of hotplugged memory
  */
 if (dev->hotplugged) {
-   spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB, nr_lmbs);
+if (dedicated_hp_event_source) {
+drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_LMB,
+   addr_start / 
SPAPR_MEMORY_BLOCK_SIZE);
+drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+spapr_hotplug_req_add_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
+   nr_lmbs,
+   drck->get_index(drc));
+} else {
+spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB, 
nr_lmbs);
+}
 }
 }
 
@@ -2291,7 +2302,9 @@ static void spapr_memory_plug(HotplugHandler 
*hotplug_dev, DeviceState *dev,
 goto out;
 }
 
-spapr_add_lmbs(dev, addr, size, node, _abort);
+spapr_add_lmbs(dev, addr, size, node,
+   spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
+   _abort);
 
 out:
 error_propagate(errp, local_err);
-- 
1.9.1




[Qemu-devel] [PATCH 11/11] spapr: Memory hot-unplug support

2016-10-12 Thread Michael Roth
From: Bharata B Rao 

Add support to hot remove pc-dimm memory devices.

Signed-off-by: Bharata B Rao 
* add hooks to CAS/cmdline enablement of hotplug ACR support
Signed-off-by: Michael Roth 
---
 hw/ppc/spapr.c | 106 -
 hw/ppc/spapr_drc.c |  17 +
 2 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 9af4268..180fa3d 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2310,6 +2310,90 @@ out:
 error_propagate(errp, local_err);
 }
 
+typedef struct sPAPRDIMMState {
+uint32_t nr_lmbs;
+} sPAPRDIMMState;
+
+static void spapr_lmb_release(DeviceState *dev, void *opaque)
+{
+sPAPRDIMMState *ds = (sPAPRDIMMState *)opaque;
+HotplugHandler *hotplug_ctrl = NULL;
+
+if (--ds->nr_lmbs) {
+return;
+}
+
+g_free(ds);
+
+/*
+ * Now that all the LMBs have been removed by the guest, call the
+ * pc-dimm unplug handler to cleanup up the pc-dimm device.
+ */
+hotplug_ctrl = qdev_get_hotplug_handler(dev);
+hotplug_handler_unplug(hotplug_ctrl, dev, _abort);
+}
+
+static void spapr_del_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t 
size,
+   Error **errp)
+{
+sPAPRDRConnector *drc;
+sPAPRDRConnectorClass *drck;
+uint32_t nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
+int i;
+sPAPRDIMMState *ds = g_malloc0(sizeof(sPAPRDIMMState));
+uint64_t addr = addr_start;
+
+ds->nr_lmbs = nr_lmbs;
+for (i = 0; i < nr_lmbs; i++) {
+drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_LMB,
+addr / SPAPR_MEMORY_BLOCK_SIZE);
+g_assert(drc);
+
+drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+drck->detach(drc, dev, spapr_lmb_release, ds, errp);
+addr += SPAPR_MEMORY_BLOCK_SIZE;
+}
+
+drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_LMB,
+   addr_start / SPAPR_MEMORY_BLOCK_SIZE);
+drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+spapr_hotplug_req_remove_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
+  nr_lmbs,
+  drck->get_index(drc));
+}
+
+static void spapr_memory_unplug(HotplugHandler *hotplug_dev, DeviceState *dev,
+Error **errp)
+{
+sPAPRMachineState *ms = SPAPR_MACHINE(hotplug_dev);
+PCDIMMDevice *dimm = PC_DIMM(dev);
+PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
+MemoryRegion *mr = ddc->get_memory_region(dimm);
+
+pc_dimm_memory_unplug(dev, >hotplug_memory, mr);
+object_unparent(OBJECT(dev));
+}
+
+static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev,
+DeviceState *dev, Error **errp)
+{
+Error *local_err = NULL;
+PCDIMMDevice *dimm = PC_DIMM(dev);
+PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
+MemoryRegion *mr = ddc->get_memory_region(dimm);
+uint64_t size = memory_region_size(mr);
+uint64_t addr;
+
+addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, 
_err);
+if (local_err) {
+goto out;
+}
+
+spapr_del_lmbs(dev, addr, size, _abort);
+out:
+error_propagate(errp, local_err);
+}
+
 void *spapr_populate_hotplug_cpu_dt(CPUState *cs, int *fdt_offset,
 sPAPRMachineState *spapr)
 {
@@ -2383,10 +2467,21 @@ static void spapr_machine_device_plug(HotplugHandler 
*hotplug_dev,
 static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev,
   DeviceState *dev, Error **errp)
 {
+sPAPRMachineState *sms = SPAPR_MACHINE(qdev_get_machine());
 MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine());
 
 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
-error_setg(errp, "Memory hot unplug not supported by sPAPR");
+if (spapr_ovec_test(sms->ov5_cas, OV5_HP_EVT)) {
+spapr_memory_unplug(hotplug_dev, dev, errp);
+} else {
+/* NOTE: this means there is a window after guest reset, prior to
+ * CAS negotiation, where unplug requests will fail due to the
+ * capability not being detected yet. This is a bit different than
+ * the case with PCI unplug, where the events will be queued and
+ * eventually handled by the guest after boot
+ */
+error_setg(errp, "Memory hot unplug not supported for this guest");
+}
 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
 if (!mc->query_hotpluggable_cpus) {
 error_setg(errp, "CPU hot unplug not supported on this machine");
@@ -2396,6 +2491,14 @@ static void spapr_machine_device_unplug(HotplugHandler 
*hotplug_dev,
 }
 }
 
+static void spapr_machine_device_unplug_request(HotplugHandler 

[Qemu-devel] [RFC PATCH 00/11] spapr: option vector re-work and memory unplug support

2016-10-12 Thread Michael Roth
This series is based on David's ppc-for-2.8 branch, and is also available from:

  https://github.com/mdroth/qemu/commits/spapr-hotplug-event-update

Patches 1-4 address various deficiencies in how we currently handle option
vectors via ibm,client-architecture-support. This is done here in preparation
for a new option vector bit introduced later in this series, as well as a
number of future option vector bits related to other features, but I can
break this out into a separate series if preferred.

Patches 5-8 add support for an updated event format for hotplug events,
which includes a new way to specify a range of DRCs/LMBs to hotplug/unplug
using a starting position and count, which is necessary for memory unplug.
The specification of this new event format is still in draft form, but slated
for inclusion in the PAPR/LoPAPR.

Patches 9-11 add support for memory unplug using the new event format.

In addition to kernel 4.8 or later, there are a number of patches required
to enable support on the guest kernel side. I've included the minimum set
of patches in my branch here:

   https://github.com/mdroth/linux/commits/spapr-hotplug-event-update

   *powerpc/pseries: advertise Hot Plug Event support to firmware
   powerpc/pseries: Implement indexed-count hotplug memory remove
   powerpc/pseries: Implement indexed-count hotplug memory add

Note that there is currently an issue that arises when attempting to
offline an LMB that was onlined using a guest kernel's auto-onlining
mechanism, which can prevent full completion of memory unplug requests.
This is being investigated, but for the purposes of testing this can
be worked around currently by disabling auto-onlining in guests via:

  "echo offline >/sys/devices/system/memory/auto_online_blocks"

and instead onlining the blocks manually or via udev.

 docs/specs/ppc-spapr-hotplug.txt |  55 ++---
 hw/ppc/Makefile.objs |   2 +-
 hw/ppc/spapr.c   | 237 
--
 hw/ppc/spapr_drc.c   |  17 
 hw/ppc/spapr_events.c| 222 
---
 hw/ppc/spapr_hcall.c |  70 +++-
 hw/ppc/spapr_ovec.c  | 244 

 include/hw/ppc/spapr.h   |  15 +++-
 include/hw/ppc/spapr_ovec.h  |  67 
 9 files changed, 804 insertions(+), 125 deletions(-)




Re: [Qemu-devel] [PATCH 0/5] More thread sanitizer fixes and atomic.h improvements

2016-10-12 Thread Emilio G. Cota
On Mon, Oct 10, 2016 at 15:59:02 +0200, Paolo Bonzini wrote:
> See each patch.  My attempt at fixing whatever I did when I obviously
> didn't know enough^W about the C11 memory model, and at setting a
> better example for future generations...

Just for context. Building on this patchset, is it now time to
phase out smp_(rw)mb in favour of C11's acq/rel, as you laid
out in your KVM Forum talk [*]?

What is the plan with smp_mb_(sg)et? It's not clear to me from
the slides, but given patch 5 I don't see a reason to keep them.

Thanks,

Emilio

[*] 
http://events.linuxfoundation.org/sites/events/files/slides/kvmforum16-atomic2.pdf




Re: [Qemu-devel] [PATCH 3/3] iotests: Skip test 162 if there is no SSH support

2016-10-12 Thread Eric Blake
On 10/12/2016 03:49 PM, Max Reitz wrote:
> Signed-off-by: Max Reitz 
> ---
>  tests/qemu-iotests/162 | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/tests/qemu-iotests/162 b/tests/qemu-iotests/162
> index f8eecb3..cad2bd7 100755
> --- a/tests/qemu-iotests/162
> +++ b/tests/qemu-iotests/162
> @@ -35,6 +35,9 @@ status=1# failure is the default!
>  _supported_fmt generic
>  _supported_os Linux
>  
> +test_ssh=$($QEMU_IMG --help | grep '^Supported formats:.* ssh\( \|$\)')
> +[ "$test_ssh" = "" ] && _notrun "ssh support required"
> +

Reviewed-by: Eric Blake 

>  echo
>  echo '=== NBD ==='
>  # NBD expects all of its arguments to be strings
> 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


[Qemu-devel] [PATCH 2/4] target-lm32: disable asm logging via LOG_DIS()

2016-10-12 Thread Michael Walle
The lm32 target already has a disassembler which logs the assembly
instructions with "-d in_asm". Therefore, turn off the LOG_DIS() macro to
prevent logging the assembly instructions twice. Also turn the macro into
one which is always compiled to catch any errors while the macro is turned
off.

Signed-off-by: Michael Walle 
---
 target-lm32/translate.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/target-lm32/translate.c b/target-lm32/translate.c
index fa8416a..792637f 100644
--- a/target-lm32/translate.c
+++ b/target-lm32/translate.c
@@ -33,12 +33,14 @@
 #include "exec/log.h"
 
 
-#define DISAS_LM32 1
-#if DISAS_LM32
-#  define LOG_DIS(...) qemu_log_mask(CPU_LOG_TB_IN_ASM, ## __VA_ARGS__)
-#else
-#  define LOG_DIS(...) do { } while (0)
-#endif
+#define DISAS_LM32 0
+
+#define LOG_DIS(...) \
+do { \
+if (DISAS_LM32) { \
+qemu_log_mask(CPU_LOG_TB_IN_ASM, ## __VA_ARGS__); \
+} \
+} while (0)
 
 #define EXTRACT_FIELD(src, start, end) \
 (((src) >> start) & ((1 << (end - start + 1)) - 1))
-- 
2.1.4




[Qemu-devel] [PATCH 4/4] target-lm32: rewrite gen_compare()

2016-10-12 Thread Michael Walle
Drop the rX, rY and rZ stuff and use dc->r{0,1,2} directly. This should
also fix the false positive in coverity CID 1005720.

Signed-off-by: Michael Walle 
---
 target-lm32/translate.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/target-lm32/translate.c b/target-lm32/translate.c
index 792637f..842af63 100644
--- a/target-lm32/translate.c
+++ b/target-lm32/translate.c
@@ -344,9 +344,6 @@ static void dec_calli(DisasContext *dc)
 
 static inline void gen_compare(DisasContext *dc, int cond)
 {
-int rX = (dc->format == OP_FMT_RR) ? dc->r2 : dc->r1;
-int rY = dc->r0;
-int rZ = (dc->format == OP_FMT_RR) ? dc->r1 : -1;
 int i;
 
 if (dc->format == OP_FMT_RI) {
@@ -360,9 +357,9 @@ static inline void gen_compare(DisasContext *dc, int cond)
 break;
 }
 
-tcg_gen_setcondi_tl(cond, cpu_R[rX], cpu_R[rY], i);
+tcg_gen_setcondi_tl(cond, cpu_R[dc->r1], cpu_R[dc->r0], i);
 } else {
-tcg_gen_setcond_tl(cond, cpu_R[rX], cpu_R[rY], cpu_R[rZ]);
+tcg_gen_setcond_tl(cond, cpu_R[dc->r2], cpu_R[dc->r0], cpu_R[dc->r1]);
 }
 }
 
-- 
2.1.4




Re: [Qemu-devel] [PATCH 2/3] block: Emit modules in bdrv_iterate_format()

2016-10-12 Thread Eric Blake
On 10/12/2016 03:49 PM, Max Reitz wrote:
> Some block drivers may not be loaded yet, but qemu supports them
> nonetheless. bdrv_iterate_format() should report them, too.
> 
> Signed-off-by: Max Reitz 
> ---
>  block.c | 18 ++
>  1 file changed, 18 insertions(+)
> 

Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


[Qemu-devel] [PATCH 3/4] lm32: milkymist-tmu2: fix integer overflow

2016-10-12 Thread Michael Walle
Don't truncate the multiplication and do a 64 bit one instead because
the result is stored in a 64 bit variable.

Spotted by coverity, CID 1167561.

Signed-off-by: Michael Walle 
---
 hw/display/milkymist-tmu2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/display/milkymist-tmu2.c b/hw/display/milkymist-tmu2.c
index 9c00184..5c666f9 100644
--- a/hw/display/milkymist-tmu2.c
+++ b/hw/display/milkymist-tmu2.c
@@ -213,7 +213,7 @@ static void tmu2_start(MilkymistTMU2State *s)
 /* Read the QEMU source framebuffer into an OpenGL texture */
 glGenTextures(1, );
 glBindTexture(GL_TEXTURE_2D, texture);
-fb_len = 2*s->regs[R_TEXHRES]*s->regs[R_TEXVRES];
+fb_len = 2ULL * s->regs[R_TEXHRES] * s->regs[R_TEXVRES];
 fb = cpu_physical_memory_map(s->regs[R_TEXFBUF], _len, 0);
 if (fb == NULL) {
 glDeleteTextures(1, );
-- 
2.1.4




[Qemu-devel] [PATCH 1/4] target-lm32: swap operand of wcsr in LOG_DIS()

2016-10-12 Thread Michael Walle
Be consistent with the reference manual.

Signed-off-by: Michael Walle 
---
 target-lm32/translate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target-lm32/translate.c b/target-lm32/translate.c
index dc64cc6..fa8416a 100644
--- a/target-lm32/translate.c
+++ b/target-lm32/translate.c
@@ -865,7 +865,7 @@ static void dec_wcsr(DisasContext *dc)
 {
 int no;
 
-LOG_DIS("wcsr r%d, %d\n", dc->r1, dc->csr);
+LOG_DIS("wcsr %d, r%d\n", dc->csr, dc->r1);
 
 switch (dc->csr) {
 case CSR_IE:
-- 
2.1.4




Re: [Qemu-devel] [PATCH 0/3] iotests: Skip 162 if there is no SSH support

2016-10-12 Thread no-reply
Hi,

Your series failed automatic build test. Please find the testing commands and
their output below. If you have docker installed, you can probably reproduce it
locally.

Message-id: 20161012204907.25941-1-mre...@redhat.com
Subject: [Qemu-devel] [PATCH 0/3] iotests: Skip 162 if there is no SSH support
Type: series

=== TEST SCRIPT BEGIN ===
#!/bin/bash
set -e
git submodule update --init dtc
# Let docker tests dump environment info
export SHOW_ENV=1
export J=16
make docker-test-quick@centos6
make docker-test-mingw@fedora
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
From https://github.com/patchew-project/qemu
 * [new tag] patchew/20161012204907.25941-1-mre...@redhat.com -> 
patchew/20161012204907.25941-1-mre...@redhat.com
Switched to a new branch 'test'
edffc08 iotests: Skip test 162 if there is no SSH support
de2a49f block: Emit modules in bdrv_iterate_format()
63e6b44 block: Fix bdrv_iterate_format() sorting

=== OUTPUT BEGIN ===
Submodule 'dtc' (git://git.qemu-project.org/dtc.git) registered for path 'dtc'
Cloning into 'dtc'...
Submodule path 'dtc': checked out '65cc4d2748a2c2e6f27f1cf39e07a5dbabd80ebf'
  BUILD   centos6
=== OUTPUT END ===

Abort: command timeout (>3600 seconds)


---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-de...@freelists.org

Re: [Qemu-devel] [Bug 1630723] [NEW] UART writes to netduino2/stm32f205-soc disappear

2016-10-12 Thread Seth K
It's a bare metal program so I don't really have anywhere to print to,
other than my custom function to output to the uart. I did double check all
the addresses to make sure they agreed with the documentation and the Qemu
source code. I tried changing around the destinations of the output just to
verify the order of the write or the destination somehow affected the
output. I tried being tricky, like instead of writing to usart 3 I wrote to
uart 4 - 0x400 (the same address, it didn't work). The code should be
simple enough that I don't have room for any crazy mistakes:

volatile unsigned char * const USART1_PTR = (unsigned char *)0x40011000;
volatile unsigned char * const USART2_PTR = (unsigned char *)0x40004400;
volatile unsigned char * const USART3_PTR = (unsigned char *)0x40004800;
volatile unsigned char * const UART4_PTR = (unsigned char *)0x40004c00;

void display(const char *string, volatile unsigned char * uart_addr){
  while(*string != '\0'){
*(uart_addr+4) = *string;
string++;
  }
}

int my_init(){
  display("Test 1/4\n", USART1_PTR);
  display("Test 2/4\n", USART2_PTR);
  display("Test 3/4\n", USART3_PTR);
  display("Test 4/4\n", UART4_PTR);
}


In the past I ran a really long test where I wrote to every possible
address just to see what happens. No unexpected output occurred. I can do
that test again, but it takes hours. I could also write code to convert the
address to something printable to verify the address isn't being changed,
but that seems unlikely.

Another thought I had is maybe there is some sort of interaction between
where I am setting the stack top - 0x20001000 - but that doesn't seem like
it should interfere. Maybe the linker or objcopy are doing something crazy?

I don't understand Qemu enough to know what should be calling the functions
that handle UART read/write. Is there something I should look at in Qemu
and try to intercept?

On Fri, Oct 7, 2016 at 6:27 PM, Alistair Francis 
wrote:

> On Fri, Oct 7, 2016 at 1:04 PM, Seth K  wrote:
> > I applied that patch, made qemu and ran my code, I didn't see a change.
> >
> > According to the STM32F20xxx memory map, the memory range seems to be
> 0x400
> > -- UART 1 is listed as 0x4001 - 0x400103FF. Should that memory
> region be
> > set to 0x400?
>
> I was hoping that would have fixed it.
>
> It sounds like it should be 0x400 then, although it doesn't sound like
> this is causing this issue.
>
> >
> > I tried that too, no change yet, but maybe I should look at the other
> memory
> > settings.
>
> Maybe, it is very strange that it's not reaching the read/write functions.
>
> Can you try putting print statements in the guest software to make
> sure it is writing to the locations you expect and then make sure
> there are no conditionals in QEMU that cause the print statements to
> not be printed. See what that uncovers.
>
> Thanks,
>
> Alistair
>
> >
> > I also tried making these changes in another branch where I made this
> chip
> > have 8 UARTS. That was unchanged: I can only output UARTS 1,4,5,6.
> >
> > On Fri, Oct 7, 2016 at 12:10 PM, Alistair Francis 
> > wrote:
> >>
> >> On Fri, Oct 7, 2016 at 9:03 AM, Alistair Francis 
> >> wrote:
> >> > On Fri, Oct 7, 2016 at 8:59 AM, Seth K  wrote:
> >> >> The only machine I saw listed in the help output is "netduino2." I
> >> >> pulled
> >> >> QEMU from github, was that the right thing to do?
> >> >>
> >> >> I found the specifications for the stm32f2xx and some similar chips
> and
> >> >> verified the addresses and interrupts are correct.
> >> >
> >> > Sorry my mistake. It is a the Netduino 2 Plus that we don't support.
> >> >
> >> > I think we should move this conversation to the bug report as well, I
> >> > was hoping that replying to the email would update the bug report but
> >> > it doesn't look like it.
> >> >
> >> >>
> >> >> The stm32f205 should support 6 UARTs, and the 6 addresses and IRQs
> are
> >> >> coded
> >> >> correctly. However there is a hard-coded value MAX_SERIAL_PORTS
> >> >> limiting
> >> >> serial_hds to 4, and I don't know why. I am considering submitting a
> >> >> patch.
> >> >
> >> > I'm not sure why we have that limit, you can submit a patch and see
> >> > what everyone says.
> >> >
> >> >>
> >> >> If I increase MAX_SERIAL_PORTS I can write to UARTs 1, 4, 5, and 6
> and
> >> >> output them to sockets. However writes to UARTs 2 and 3 just
> disappear.
> >> >> They
> >> >> don't even trigger my printf in stm32f2xx_usart_write. It seems like
> >> >> they
> >> >> are being intercepted somewhere, and unfortunately my knowledge of
> QEMU
> >> >> is
> >> >> too low to know where to look. Any pointers would be greatly
> >> >> appreciated.
> >> >
> >> > Strange. There could be something else addressed there. If you run
> >> > 'info mtree' at the QEMU prompt (Ctrl-a + c) you should be able to see
> >> > the memory map of the system.
> >>
> >> Hey Seth,
> >>
> >> 

Re: [Qemu-devel] MTTCG memory ordering

2016-10-12 Thread Emilio G. Cota
On Wed, Oct 12, 2016 at 10:58:43 +0200, Stefan Hajnoczi wrote:
> Hi Pranith,
> I was curious about the status of your MTTCG GSoC work:
> 
> I saw your fence series which implements the noop memory barrier/fence
> instructions on various architectures, but I wasn't sure if that also
> covers the case where a strong target is emulated on a weak host.
> 
> Did you make TCG automatically emit barriers so stronger targets (x86)
> run correctly on weaker targets (ARM)?

Here are some numbers on this. Chart:
  http://imgur.com/a/H9E9R

The overhead of emitting barriers all over the place is significant
(~3x for integer code), but that's the price of correctness as
the ArMOR paper shows[*] (this is the SYNC option in the chart).

A faster alternative also provided by ArMOR is to pretend the host
isn't multi-copy atomic (i.e. the iriw litmus test would fail); this
is the PowerA option in the chart.

An even better alternative is to let the hardware deal with this.
Unfortunately, so far only recent IBM processors support this
(the feature is called Strong Access Ordering--SAO in the chart).

[*] ArMOR: Defending Against Memory Consistency Model Mismatches
in Heterogeneous Architectures
  http://www.cs.princeton.edu/~ctrippel/dlustig_ISCA15.pdf

Emilio



[Qemu-devel] (no subject)

2016-10-12 Thread Neeraj Sharma
Dear Sir/Ma'am

I want to ‘annotate’ the translation buffers - (adding a mechanism in the
translation buffers where we can store how many times they were executed,
and, for each one, add some ‘amount’ could be power, could be anything). I
need help to understand the translation buffer code in qemu, starting from
cpu-exec.c.

Neeraj
Thanks


Re: [Qemu-devel] [PATCH v8 4/6] docs: Add Documentation for Mediated devices

2016-10-12 Thread Alex Williamson
On Thu, 13 Oct 2016 00:32:48 +0530
Kirti Wankhede  wrote:

> On 10/12/2016 9:29 PM, Alex Williamson wrote:
> > On Wed, 12 Oct 2016 20:43:48 +0530
> > Kirti Wankhede  wrote:
> >   
> >> On 10/12/2016 7:22 AM, Tian, Kevin wrote:  
>  From: Kirti Wankhede [mailto:kwankh...@nvidia.com]
>  Sent: Wednesday, October 12, 2016 4:45 AM
> >> +* mdev_supported_types:
> >> +List of current supported mediated device types and its details 
> >> are added
> >> +in this directory in following format:
> >> +
> >> +|- 
> >> +|--- Vendor-specific-attributes [optional]
> >> +|--- mdev_supported_types
> >> +| |--- 
> >> +| |   |--- create
> >> +| |   |--- name
> >> +| |   |--- available_instances
> >> +| |   |--- description /class
> >> +| |   |--- [devices]
> >> +| |--- 
> >> +| |   |--- create
> >> +| |   |--- name
> >> +| |   |--- available_instances
> >> +| |   |--- description /class
> >> +| |   |--- [devices]
> >> +| |--- 
> >> +|  |--- create
> >> +|  |--- name
> >> +|  |--- available_instances
> >> +|  |--- description /class
> >> +|  |--- [devices]
> >> +
> >> +[TBD : description or class is yet to be decided. This will change.]  
> >>   
> >
> > I thought that in previous discussions we had agreed to drop
> > the  concept and use the name as the unique identifier.
> > When reporting these types in libvirt we won't want to report
> > the type id values - we'll want the name strings to be unique.
> >
> 
>  The 'name' might not be unique but type_id will be. For example that Neo
>  pointed out in earlier discussion, virtual devices can come from two
>  different physical devices, end user would be presented with what they
>  had selected but there will be internal implementation differences. In
>  that case 'type_id' will be unique.
> 
> >>>
> >>> Hi, Kirti, my understanding is that Neo agreed to use an unique type
> >>> string (if you still called it ), and then no need of additional
> >>> 'name' field which can be put inside 'description' field. See below quote:
> >>> 
> >>
> >> We had internal discussions about this within NVIDIA and found that
> >> 'name' might not be unique where as 'type_id' would be unique. I'm
> >> refering to Neo's mail after that, where Neo do pointed that out.
> >>
> >> https://lists.gnu.org/archive/html/qemu-devel/2016-09/msg07714.html  
> > 
> > Everyone not privy to those internal discussions, including me, seems to
> > think we dropped type_id and that if a vendor does not have a stable
> > name, they can compose some sort of stable type description based on the
> > name+id, or even vendor+id, ex. NVIDIA-11.  So please share why we
> > haven't managed to kill off type_id yet.  No matter what internal
> > representation each vendor driver has of "type_id" it seems possible
> > for it to come up with stable string to define a given configuration.  
> 
> 
> The 'type_id' is unique and the 'name' are not, the name is just a
> virtual device name/ human readable name. Because at this moment Intel
> can't define a proper GPU class, we have to add a 'description' field
> there as well to represent the features of this virtual device, once we
> have all agreed with the GPU class and its mandatory attributes, the
> 'description' field can be removed. Here is an example,
> type_id/type_name = NVIDIA_11,
> name=M60-M0Q,
> description=2560x1600, 2 displays, 512MB"
> 
> Neo's previous comment only applies to the situation where we will have
> the GPU class or optional attributes defined and recognized by libvirt,
> since that is not going to happen any time soon, we will have to have
> the new 'description' field, and we don't want to have it mixed up with
> 'name' field.
> 
> We can definitely have something like name+id as Alex recommended to
> remove the 'name' field, but it will just require libvirt to have more
> logic to parse that string.

Let's use the mtty example driver provided in patch 5 so we can all
more clearly see how the interfaces work.  I'll start from the
beginning of my experience and work my way to the type/name thing.

(please add a modules_install target to the Makefile)

# modprobe mtty

Now what?  It seems like I need to have prior knowledge that this
driver supports mdev devices and I need to go hunt for them.  We need
to create a class (ex. /sys/class/mdev/) where a user can find all the
devices that participate in this mediated device infrastructure.  That
would point me to /sys/devices/mtty.

# tree /sys/devices/mtty
/sys/devices/mtty
|-- mdev_supported_types
|   `-- mtty1
|   |-- available_instances (1)
|   |-- create
|   |-- devices
|   `-- name ("Dual-port-serial")
|-- mtty_dev
|   `-- sample_mtty_dev ("This is phy device")

Re: [Qemu-devel] [PATCH 1/3] block: Fix bdrv_iterate_format() sorting

2016-10-12 Thread Eric Blake
On 10/12/2016 03:49 PM, Max Reitz wrote:
> bdrv_iterate_format() did not actually sort the formats by name but by
> "pointer interpreted as string". That is probably not what we intended
> to do, so fix it (by changing qsort_strcmp() so it matches the example
> from qsort()'s manual page).
> 
> Signed-off-by: Max Reitz 
> ---
>  block.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

I'm a bit surprised that code sanitizers like Coverity or ASAN aren't
(yet?) able to flag this.

Reviewed-by: Eric Blake 

> 
> diff --git a/block.c b/block.c
> index bb1f1ec..e46e4b2 100644
> --- a/block.c
> +++ b/block.c
> @@ -2789,7 +2789,7 @@ const char *bdrv_get_format_name(BlockDriverState *bs)
>  
>  static int qsort_strcmp(const void *a, const void *b)
>  {
> -return strcmp(a, b);
> +return strcmp(*(char *const *)a, *(char *const *)b);
>  }
>  
>  void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
> 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH 00/15] optimize Qemu RSS usage

2016-10-12 Thread Michael R. Hines

Peter,

Greetings from DigitalOcean. We're experiencing the same symptoms 
without this patch.
We have, collectively, many gigabytes of un-planned-for RSS being used 
per-hypervisor

that we would like to get rid of =).

Without explicitly trying this patch (will do that ASAP), we immediately 
noticed that the
192MB mentioned immediately melts away (Yay) when we disabled the 
coroutine thread pool explicitly,
with another ~100MB in additional stack usage that would likely also go 
away if we

applied the entirety of your patch.

Is there any chance you have revisited this or have a timeline for it?

- Michael

/*
 * Michael R. Hines
 * Senior Engineer, DigitalOcean.
 */

On 06/28/2016 04:01 AM, Peter Lieven wrote:

I recently found that Qemu is using several hundred megabytes of RSS memory
more than older versions such as Qemu 2.2.0. So I started tracing
memory allocation and found 2 major reasons for this.

1) We changed the qemu coroutine pool to have a per thread and a global release
pool. The chosen pool size and the changed algorithm could lead to up to
192 free coroutines with just a single iothread. Each of the coroutines
in the pool has 1MB of stack memory.

2) Between Qemu 2.2.0 and 2.3.0 RCU was introduced which led to delayed freeing
of memory. This led to higher heap allocations which could not effectively
be returned to kernel (most likely due to fragmentation).

The following series is what I came up with. Besides the coroutine patches I 
changed
some allocations to forcibly use mmap. All these allocations are not repeatedly 
made
during runtime so the impact of using mmap should be negligible.

There are still some big malloced allocations left which cannot be easily 
changed
(e.g. the pixman buffers in VNC). So it might be an idea to set a lower mmap 
threshold for
malloc since this threshold seems to be in the order of several Megabytes on 
modern systems.

Peter Lieven (15):
   coroutine-ucontext: mmap stack memory
   coroutine-ucontext: add a switch to monitor maximum stack size
   coroutine-ucontext: reduce stack size to 64kB
   coroutine: add a knob to disable the shared release pool
   util: add a helper to mmap private anonymous memory
   exec: use mmap for subpages
   qapi: use mmap for QmpInputVisitor
   virtio: use mmap for VirtQueue
   loader: use mmap for ROMs
   vmware_svga: use mmap for scratch pad
   qom: use mmap for bigger Objects
   util: add a function to realloc mmapped memory
   exec: use mmap for PhysPageMap->nodes
   vnc-tight: make the encoding palette static
   vnc: use mmap for VncState

  configure | 33 ++--
  exec.c| 11 ---
  hw/core/loader.c  | 16 +-
  hw/display/vmware_vga.c   |  3 +-
  hw/virtio/virtio.c|  5 +--
  include/qemu/mmap-alloc.h |  7 +
  include/qom/object.h  |  1 +
  qapi/qmp-input-visitor.c  |  5 +--
  qom/object.c  | 20 ++--
  ui/vnc-enc-tight.c| 21 ++---
  ui/vnc.c  |  5 +--
  ui/vnc.h  |  1 +
  util/coroutine-ucontext.c | 66 +--
  util/mmap-alloc.c | 27 
  util/qemu-coroutine.c | 79 ++-
  15 files changed, 225 insertions(+), 75 deletions(-)





Re: [Qemu-devel] error reporting in functions

2016-10-12 Thread Eric Blake
On 10/12/2016 10:47 AM, Vladimir Sementsov-Ogievskiy wrote:
> HI all!
> 
> My questions is: what are general recommendations in Qemu for return
> code, if we have Error **errp?
> What should I prefer: errp, duplicated by int return code, or void
> functions with errp?

Markus has already had several threads commenting on this very topic in
the past.  A lot of code uses void return, but some newer code is using
an int return for simplifying the call-sites (and glib prefers the int
rather than void return, as well).  Markus was playing with a Coccinelle
script to see how painful a bulk-conversion of the code base would be,
but it is probably stalled behind more pressing work at the moment.

> 
> void + errp seems good, just to not duplicate things. But it has a
> disadvantage of necessity of "local_err" and "error_propagate" in caller
> function, if its behaviour depends on callee function success...
> 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH 1/2] 9pfs: fix information leak in xattr read

2016-10-12 Thread Eric Blake
On 10/12/2016 08:23 AM, Greg Kurz wrote:
> 
> But in fact, I'm afraid we have a more serious problem here... size
> comes from the guest and could cause g_malloc() to abort if QEMU has
> reached some RLIMIT... we need to call g_try_malloc0() and return
> ENOMEM if the allocation fails.

Even if it does not cause an ENOMEM failure right away, the guest can
also use this to chew up lots of host resources. It may also be worth
putting a reasonable cap at the maximum the guest can allocate, rather
than just trying to malloc every possible size.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


[Qemu-devel] [PATCH 1/3] block: Fix bdrv_iterate_format() sorting

2016-10-12 Thread Max Reitz
bdrv_iterate_format() did not actually sort the formats by name but by
"pointer interpreted as string". That is probably not what we intended
to do, so fix it (by changing qsort_strcmp() so it matches the example
from qsort()'s manual page).

Signed-off-by: Max Reitz 
---
 block.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block.c b/block.c
index bb1f1ec..e46e4b2 100644
--- a/block.c
+++ b/block.c
@@ -2789,7 +2789,7 @@ const char *bdrv_get_format_name(BlockDriverState *bs)
 
 static int qsort_strcmp(const void *a, const void *b)
 {
-return strcmp(a, b);
+return strcmp(*(char *const *)a, *(char *const *)b);
 }
 
 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
-- 
2.10.0




[Qemu-devel] [PATCH 3/3] iotests: Skip test 162 if there is no SSH support

2016-10-12 Thread Max Reitz
Signed-off-by: Max Reitz 
---
 tests/qemu-iotests/162 | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/qemu-iotests/162 b/tests/qemu-iotests/162
index f8eecb3..cad2bd7 100755
--- a/tests/qemu-iotests/162
+++ b/tests/qemu-iotests/162
@@ -35,6 +35,9 @@ status=1  # failure is the default!
 _supported_fmt generic
 _supported_os Linux
 
+test_ssh=$($QEMU_IMG --help | grep '^Supported formats:.* ssh\( \|$\)')
+[ "$test_ssh" = "" ] && _notrun "ssh support required"
+
 echo
 echo '=== NBD ==='
 # NBD expects all of its arguments to be strings
-- 
2.10.0




[Qemu-devel] [PATCH 0/3] iotests: Skip 162 if there is no SSH support

2016-10-12 Thread Max Reitz
As reported by Hao QingFeng, iotest 162 is currently executed even if
qemu does not have any SSH support (which makes it fail, naturally).

Fixing that is not so trivial, because qemu-img currently does not
report modules, and SSH can be compiled as a module, so that needs to be
fixed first. While doing so, I noticed that bdrv_iterate_format() tries
to sort the list of formats, which is a bit contrary to my experience.
Turns out that needs to be fixed, too.


This series can either be applied on top of my series "iotests: Fix test
162" or just directly on master, both works (i.e. the patches in this
series do not interfere with those from that one). I still thought I'd
mention that series, if nothing else then only to get you to review that
other one. ;-)


Max Reitz (3):
  block: Fix bdrv_iterate_format() sorting
  block: Emit modules in bdrv_iterate_format()
  iotests: Skip test 162 if there is no SSH support

 block.c| 20 +++-
 tests/qemu-iotests/162 |  3 +++
 2 files changed, 22 insertions(+), 1 deletion(-)

-- 
2.10.0




[Qemu-devel] [PATCH 2/3] block: Emit modules in bdrv_iterate_format()

2016-10-12 Thread Max Reitz
Some block drivers may not be loaded yet, but qemu supports them
nonetheless. bdrv_iterate_format() should report them, too.

Signed-off-by: Max Reitz 
---
 block.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/block.c b/block.c
index e46e4b2..88a1ea5 100644
--- a/block.c
+++ b/block.c
@@ -2815,6 +2815,24 @@ void bdrv_iterate_format(void (*it)(void *opaque, const 
char *name),
 }
 }
 
+for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); i++) {
+const char *format_name = block_driver_modules[i].format_name;
+
+if (format_name) {
+bool found = false;
+int j = count;
+
+while (formats && j && !found) {
+found = !strcmp(formats[--j], format_name);
+}
+
+if (!found) {
+formats = g_renew(const char *, formats, count + 1);
+formats[count++] = format_name;
+}
+}
+}
+
 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
 
 for (i = 0; i < count; i++) {
-- 
2.10.0




Re: [Qemu-devel] chardev's and fd's in monitors

2016-10-12 Thread Marc-André Lureau
Hi

On Wed, Oct 12, 2016 at 11:15 PM Dr. David Alan Gilbert 
wrote:

> Hi,
>   I had a look at a couple of readline like libraries;
> editline and linenoise.  A difficulty with using them is that
> they both want fd's or FILE*'s; editline takes either but
> from a brief look I think it's expecting to extract the fd.
> That makes them tricky to integrate into qemu, where
> the chardev's hide a whole bunch of non-fd things; in particular
> tls, mux, ringbuffers etc.
>

We could restrict readline usage to chardev with fd? But even with that,
how would it be compatible with mux? It would have to somehow steal/restore
the chardev fd. Alternatively, we could have a "fd pipe"/socketpair chardev
frontend compatible with any chardev. Sounds contrived though, but it
should work, and probably not so much code. (qemu_chr_new_fd_fe?)


>
> If we could get away with just a FILE* then we could use fopencookie,
> but that's GNU only.
>
> Is there any sane way of shepherding all chardev's into having an
> fd?
>

Ah that would be nice! But I think the point is to stay in userspace (and
avoid extra copy, context switch, or extra fds). Otherwise, it feels like
the whole chr interface could be a socketpair + a thin layer for events,
that would simplify things indeed.


> Once you had those then you could also use them in a separate thread.
>
>
You can already use a chardev in a separate thread, but I don't know to what
extent (see add_handlers_full for a completely separate thread, and locking for
write for multi-writer; I suppose s->chr_read is called from the
dispatching context and it is the responsibility of the frontend callback to lock
properly).


-- 
Marc-André Lureau


[Qemu-devel] [PATCH v3 4/4] target-arm: Comments added to identify cases in a switch

2016-10-12 Thread Thomas Hanson
3 cases in a switch in disas_exc() require reference to the
ARM ARM spec in order to determine what case they're handling.

Signed-off-by: Thomas Hanson 
---
 target-arm/translate-a64.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index b4a4b72..eb63e2f 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -1688,12 +1688,12 @@ static void disas_exc(DisasContext *s, uint32_t insn)
  * instruction works properly.
  */
 switch (op2_ll) {
-case 1:
+case 1: /* SVC */
 gen_ss_advance(s);
 gen_exception_insn(s, 0, EXCP_SWI, syn_aa64_svc(imm16),
default_exception_el(s));
 break;
-case 2:
+case 2: /* HVC */
 if (s->current_el == 0) {
 unallocated_encoding(s);
 break;
@@ -1706,7 +1706,7 @@ static void disas_exc(DisasContext *s, uint32_t insn)
 gen_ss_advance(s);
 gen_exception_insn(s, 0, EXCP_HVC, syn_aa64_hvc(imm16), 2);
 break;
-case 3:
+case 3: /* SMC */
 if (s->current_el == 0) {
 unallocated_encoding(s);
 break;
-- 
1.9.1




Re: [Qemu-devel] [kvm-unit-tests PATCHv6 2/3] arm: pmu: Check cycle count increases

2016-10-12 Thread Wei Huang


On 10/12/2016 01:10 PM, Christopher Covington wrote:
> Hi Wei,
> 
> On 10/12/2016 11:49 AM, Wei Huang wrote:
>> On 10/11/2016 01:40 PM, Christopher Covington wrote:
>>> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing,
>>> even for the smallest delta of two subsequent reads.
>>>
>>> Signed-off-by: Christopher Covington 
>>> Reviewed-by: Andrew Jones 
>>> ---
>>>  arm/pmu.c | 60 
>>>  1 file changed, 60 insertions(+)
>>>
>>> diff --git a/arm/pmu.c b/arm/pmu.c
>>> index 42d0ee1..4334de4 100644
>>> --- a/arm/pmu.c
>>> +++ b/arm/pmu.c
>>> @@ -14,6 +14,8 @@
>>>   */
>>>  #include "libcflat.h"
>>>  
>>> +#define NR_SAMPLES 10
>>> +
>>>  #if defined(__arm__)
>>>  static inline uint32_t get_pmcr(void)
>>>  {
>>> @@ -22,6 +24,25 @@ static inline uint32_t get_pmcr(void)
>>> asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret));
>>> return ret;
>>>  }
>>> +
>>> +static inline void set_pmcr(uint32_t pmcr)
>>> +{
>>> +   asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (pmcr));
>>> +}
>>> +
>>> +/*
>>> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, 
>>> returning 64
>>> + * bits doesn't seem worth the trouble when differential usage of the 
>>> result is
>>> + * expected (with differences that can easily fit in 32 bits). So just 
>>> return
>>> + * the lower 32 bits of the cycle count in AArch32.
>>> + */
>>> +static inline unsigned long get_pmccntr(void)
>>> +{
>>> +   unsigned long cycles;
>>> +
>>> +   asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles));
>>> +   return cycles;
>>> +}
>>>  #elif defined(__aarch64__)
>>>  static inline uint32_t get_pmcr(void)
>>>  {
>>> @@ -30,6 +51,19 @@ static inline uint32_t get_pmcr(void)
>>> asm volatile("mrs %0, pmcr_el0" : "=r" (ret));
>>> return ret;
>>>  }
>>> +
>>> +static inline void set_pmcr(uint32_t pmcr)
>>> +{
>>> +   asm volatile("msr pmcr_el0, %0" : : "r" (pmcr));
>>> +}
>>> +
>>> +static inline unsigned long get_pmccntr(void)
>>> +{
>>> +   unsigned long cycles;
>>> +
>>> +   asm volatile("mrs %0, pmccntr_el0" : "=r" (cycles));
>>> +   return cycles;
>>> +}
>>>  #endif
>>>  
>>>  struct pmu_data {
>>> @@ -72,11 +106,37 @@ static bool check_pmcr(void)
>>> return pmu.implementer != 0;
>>>  }
>>>  
>>> +/*
>>> + * Ensure that the cycle counter progresses between back-to-back reads.
>>> + */
>>> +static bool check_cycles_increase(void)
>>> +{
>>> +   struct pmu_data pmu = {0};
>>
>> Compilation error on my machine:
>>
>> arm/pmu.c: In function ‘check_cycles_increase’:
>> arm/pmu.c:148:9: error: missing braces around initializer
>> [-Werror=missing-braces]
>>   struct pmu_data pmu = {0};
>>
>> Same for Patch 3.
> 
> "...So your compiler complains about {0}? Is there a problem besides the
> warning? If not, then I'm still a bit inclined to keep the code neat. The
> warnings will go away with compiler updates."

Indeed my stock GCC compiler is a bit old - 4.8.5; a newer version can fix
it. But note this is a compilation error which prevents the binary from
being built and will last a while. Could we use double-braces, pmu =
{{0}}, as a solution?

-Wei

> 
> https://lists.gnu.org/archive/html/qemu-devel/2015-10/msg06064.html
> 
> Thanks,
> Cov
> 



[Qemu-devel] [PATCH v3 2/4] target-arm: Code changes to implement overwrite of tag field on PC load

2016-10-12 Thread Thomas Hanson
For BR, BLR and RET instructions, if tagged addresses are enabled, the
tag field in the address must be cleared out prior to loading the
address into the PC.  Depending on the current EL, it will be set to
either all 0's or all 1's.

Signed-off-by: Thomas Hanson 
---
 target-arm/translate-a64.c | 91 +++---
 target-arm/translate.h |  1 +
 2 files changed, 87 insertions(+), 5 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 3b15d2c..8321218 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -41,6 +41,7 @@ static TCGv_i64 cpu_pc;
 
 /* Load/store exclusive handling */
 static TCGv_i64 cpu_exclusive_high;
+static TCGv_i64 cpu_reg(DisasContext *s, int reg);
 
 static const char *regnames[] = {
 "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
@@ -176,6 +177,85 @@ void gen_a64_set_pc_im(uint64_t val)
 tcg_gen_movi_i64(cpu_pc, val);
 }
 
+/* Load the PC from a generic TCG variable.
+ *
+ * If address tagging is enabled via the TCR TBI bits, then loading
+ * an address into the PC will clear out any tag in it:
+ *  + for EL2 and EL3 there is only one TBI bit, and if it is set
+ *then the address is zero-extended, clearing bits [63:56]
+ *  + for EL0 and EL1, TBI0 controls addresses with bit 55 == 0
+ *and TBI1 controls addresses with bit 55 == 1.
+ *If the appropriate TBI bit is set for the address then
+ *the address is sign-extended from bit 55 into bits [63:56]
+ *
+ * We can avoid doing this for relative-branches, because the
+ * PC + offset can never overflow into the tag bits (assuming
+ * that virtual addresses are less than 56 bits wide, as they
+ * are currently), but we must handle it for branch-to-register.
+ */
+static void gen_a64_set_pc_var(DisasContext *s, TCGv_i64 src)
+{
+
+if (s->current_el <= 1) {
+/* Test if NEITHER or BOTH TBI values are set.  If so, no need to
+ * examine bit 55 of address, can just generate code.
+ * If mixed, then test via generated code
+ */
+if (s->tbi0 && s->tbi1) {
+TCGv_i64 tmp_reg = tcg_temp_new_i64();
+/* Both bits set, sign extension from bit 55 into [63:56] will
+ * cover both cases
+ */
+tcg_gen_shli_i64(tmp_reg, src, 8);
+tcg_gen_sari_i64(cpu_pc, tmp_reg, 8);
+tcg_temp_free_i64(tmp_reg);
+} else if (!s->tbi0 && !s->tbi1) {
+/* Neither bit set, just load it as-is */
+tcg_gen_mov_i64(cpu_pc, src);
+} else {
+TCGv_i64 tcg_tmpval = tcg_temp_new_i64();
+TCGv_i64 tcg_bit55  = tcg_temp_new_i64();
+TCGv_i64 tcg_zero   = tcg_const_i64(0);
+
+tcg_gen_andi_i64(tcg_bit55, src, (1ull << 55));
+
+if (s->tbi0) {
+/* tbi0==1, tbi1==0, so 0-fill upper byte if bit 55 = 0 */
+tcg_gen_andi_i64(tcg_tmpval, src,
+ 0x00FFFFFFFFFFFFFFull);
+tcg_gen_movcond_i64(TCG_COND_EQ, cpu_pc, tcg_bit55, tcg_zero,
+tcg_tmpval, src);
+} else {
+/* tbi0==0, tbi1==1, so 1-fill upper byte if bit 55 = 1 */
+tcg_gen_ori_i64(tcg_tmpval, src,
+0xFF00000000000000ull);
+tcg_gen_movcond_i64(TCG_COND_NE, cpu_pc, tcg_bit55, tcg_zero,
+tcg_tmpval, src);
+}
+tcg_temp_free_i64(tcg_zero);
+tcg_temp_free_i64(tcg_bit55);
+tcg_temp_free_i64(tcg_tmpval);
+}
+} else {  /* EL > 1 */
+if (s->tbi0) {
+/* Force tag byte to all zero */
+tcg_gen_andi_i64(cpu_pc, src, 0x00FFFFFFFFFFFFFFull);
+} else {
+/* Load unmodified address */
+tcg_gen_mov_i64(cpu_pc, src);
+}
+ }
+}
+
+/* Load the PC from a register.
+ *
+ * Convert register into a TCG variable and call gen_a64_set_pc_var()
+ */
+void gen_a64_set_pc_reg(DisasContext *s, unsigned int rn)
+{
+gen_a64_set_pc_var(s, cpu_reg(s, rn));
+}
+
 typedef struct DisasCompare64 {
 TCGCond cond;
 TCGv_i64 value;
@@ -1704,12 +1784,13 @@ static void disas_uncond_b_reg(DisasContext *s, 
uint32_t insn)
 
 switch (opc) {
 case 0: /* BR */
-case 2: /* RET */
-tcg_gen_mov_i64(cpu_pc, cpu_reg(s, rn));
-break;
 case 1: /* BLR */
-tcg_gen_mov_i64(cpu_pc, cpu_reg(s, rn));
-tcg_gen_movi_i64(cpu_reg(s, 30), s->pc);
+case 2: /* RET */
+gen_a64_set_pc_reg(s, rn);
+/* BLR also needs to load return address */
+if (opc == 1) {
+tcg_gen_movi_i64(cpu_reg(s, 30), s->pc);
+}
 break;
 case 4: /* ERET */
 if (s->current_el == 0) {
diff --git a/target-arm/translate.h b/target-arm/translate.h
index a53f25a..49c042e 100644
--- 

Re: [Qemu-devel] [PATCH 2/3] target-arm: Code changes to implement overwrite of tag field on PC load

2016-10-12 Thread Tom Hanson
On 10/11/2016 10:12 AM, Peter Maydell wrote:
> On 11 October 2016 at 16:51, Thomas Hanson  wrote:
>> On 5 October 2016 at 16:01, Peter Maydell  wrote:
>>> It matches the style of the rest of the code which generally
>>> prefers to convert register numbers into TCGv earlier rather
>>> than later (at the level which is doing decode of instruction
>>> bits, rather than inside utility functions), and gives you a
>>> more flexible utility function, which can do a "write value to PC"
>>> for any value, not just something that happens to be in a CPU
>>> register. And as you say it avoids calling cpu_reg() multiple times
>>> as a side benefit.
> 
>> This approach seems counter to both structured and OO design principles
>> which would push common code (like type conversion) down into the lower
>> level function in order to increase re-use and minimize code duplication.
>> Those principles suggest that if we need a gen_a64_set_pc_value() function
>> that can load the PC from something other than a register or an immediate,
>> then it should be a lower level function than, and be called by,
>> gen_a64_set_pc_reg().  This also has the benefit of reducing clutter in the
>> caller, making it more readable and more maintainable.
> 
> The 'lower level' stuff here has a general pattern of taking either
> (1) a TCGv or (2) an integer immediate. We should follow that pattern.
> 
>> As a separate issue, we now have functions to load the PC from an immediate
>> value and from a register.  Where else could we legitimately load the PC
>> from?
> 
> Anything where we found ourselves wanting to do some preliminary
> manipulation of the value before writing it to the PC.
> 
> thanks
> -- PMM
> 

I split gen_a64_set_pc_reg() into 2 functions: an upper one that takes a
register and a lower one that takes a variable.  Patch v3 submitted.




[Qemu-devel] [PATCH v3 3/4] target-arm: Comments to mark location of pending work for 56 bit addresses

2016-10-12 Thread Thomas Hanson
Certain instructions which cannot directly load a tagged address value
may trigger a corner case when the address size is 56 bits.  This is
because incrementing or offsetting from the current PC can cause an
arithmetic roll-over into the tag bits.  Per the ARM ARM spec, these cases
should also be addressed by cleaning up the tag field.

Signed-off-by: Thomas Hanson 
---
 target-arm/translate-a64.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 8321218..b4a4b72 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -1232,6 +1232,9 @@ static inline AArch64DecodeFn *lookup_disas_fn(const 
AArch64DecodeTable *table,
  */
 static void disas_uncond_b_imm(DisasContext *s, uint32_t insn)
 {
+/*If/when address size is 56 bits, this could overflow into address tag
+ * byte, and that byte should be fixed per ARM ARM spec.
+ */
 uint64_t addr = s->pc + sextract32(insn, 0, 26) * 4 - 4;
 
 if (insn & (1U << 31)) {
@@ -1259,6 +1262,9 @@ static void disas_comp_b_imm(DisasContext *s, uint32_t 
insn)
 sf = extract32(insn, 31, 1);
 op = extract32(insn, 24, 1); /* 0: CBZ; 1: CBNZ */
 rt = extract32(insn, 0, 5);
+/*If/when address size is 56 bits, this could overflow into address tag
+ * byte, and that byte should be fixed per ARM ARM spec.
+ */
 addr = s->pc + sextract32(insn, 5, 19) * 4 - 4;
 
 tcg_cmp = read_cpu_reg(s, rt, sf);
@@ -1287,6 +1293,9 @@ static void disas_test_b_imm(DisasContext *s, uint32_t 
insn)
 
 bit_pos = (extract32(insn, 31, 1) << 5) | extract32(insn, 19, 5);
 op = extract32(insn, 24, 1); /* 0: TBZ; 1: TBNZ */
+/*If/when address size is 56 bits, this could overflow into address tag
+ * byte, and that byte should be fixed per ARM ARM spec.
+ */
 addr = s->pc + sextract32(insn, 5, 14) * 4 - 4;
 rt = extract32(insn, 0, 5);
 
@@ -1316,6 +1325,9 @@ static void disas_cond_b_imm(DisasContext *s, uint32_t 
insn)
 unallocated_encoding(s);
 return;
 }
+/*If/when address size is 56 bits, this could overflow into address tag
+ * byte, and that byte should be fixed per ARM ARM spec.
+ */
 addr = s->pc + sextract32(insn, 5, 19) * 4 - 4;
 cond = extract32(insn, 0, 4);
 
-- 
1.9.1




[Qemu-devel] [PATCH v3 0/4] target-arm: Handle tagged addresses when loading PC

2016-10-12 Thread Thomas Hanson
If tagged addresses are enabled, then addresses being loaded into the 
PC must be cleaned up by overwriting the tag bits with either all 0's 
or all 1's as specified in the ARM ARM spec.  The decision process is 
dependent on whether the code will be running in EL0/1 or in EL2/3 and 
is controlled by a combination of Top Byte Ignored (TBI) bits in the 
TCR and the value of bit 55 in the address being loaded. 

TBI values are extracted from the appropriate TCR and made available 
to TCG code generation routines by inserting them into the TB flags 
field and then transferring them to DisasContext structure in 
gen_intermediate_code_a64().

New function gen_a64_set_pc_var() encapsulates the logic required to 
determine whether clean up of the tag byte is required and then 
generating the code to correctly load the PC. New function 
gen_a64_set_pc_reg() accepts a register number and then calls
gen_a64_set_pc_var().
  
In addition to those instructions which can directly load a tagged 
address into the PC, there are others which increment or add a value to
the PC.  If 56 bit addressing is used, these instructions can cause an 
arithmetic roll-over into the tag bits.  The ARM ARM specification for 
handling tagged addresses requires that these cases also be addressed
by cleaning up the tag field.  This work has been deferred because 
there is currently no CPU model available for testing with 56 bit 
addresses.

v1->v2:
  - Updated patch descriptions per Peter's comments
  - Added function header and other comments as recommended
  - Change return type from long to uint32_t for arm_regime_tbi0() &
arm_regime_tbi1()
  - Moved prototype of gen_a64_set_pc_reg() from patch 1 to patch 2
  - Moved assignment of dc->tbi0 & dc->tbi1 from patch 2 to patch 1
  - Split out documentation comments into separate patch.

v2->v3
  - Split gen_a64_set_pc_reg() into 2 functions:
* gen_a64_set_pc_var() which takes a TCGv_i64 argument and 
* gen_a64_set_pc_reg() which takes a register number and calls 
  gen_a64_set_pc_var() after mapping the register to a variable

  Still looking into handling of tagged addresses for exceptions and
  exception returns.  Will handle that as a separate patch set.


Thomas Hanson (4):
  target-arm: Infrastucture changes to enable handling of tagged address
loading into PC
  target-arm: Code changes to implement overwrite of tag field on PC
load
  target-arm: Comments to mark location of pending work for 56 bit
addresses
  target-arm: Comments added to identify cases in a switch




[Qemu-devel] [PATCH v3 1/4] target-arm: Infrastucture changes to enable handling of tagged address loading into PC

2016-10-12 Thread Thomas Hanson
When capturing the current CPU state for the TB, extract the TBI0 and TBI1
values from the correct TCR for the current EL and then add them to the TB
flags field.

Then, at the start of code generation for the block, copy the TBI fields
into the DisasContext structure.

Signed-off-by: Thomas Hanson 
---
 target-arm/cpu.h   | 39 +--
 target-arm/helper.c| 46 ++
 target-arm/translate-a64.c |  2 ++
 target-arm/translate.h |  2 ++
 4 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 76d824d..699e6e5 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -2191,7 +2191,11 @@ static inline bool 
arm_cpu_data_is_big_endian(CPUARMState *env)
 #define ARM_TBFLAG_BE_DATA_SHIFT20
 #define ARM_TBFLAG_BE_DATA_MASK (1 << ARM_TBFLAG_BE_DATA_SHIFT)
 
-/* Bit usage when in AArch64 state: currently we have no A64 specific bits */
+/* Bit usage when in AArch64 state */
+#define ARM_TBFLAG_TBI0_SHIFT 0/* TBI0 for EL0/1 or TBI for EL2/3 */
+#define ARM_TBFLAG_TBI0_MASK (0x1ull << ARM_TBFLAG_TBI0_SHIFT)
+#define ARM_TBFLAG_TBI1_SHIFT 1/* TBI1 for EL0/1  */
+#define ARM_TBFLAG_TBI1_MASK (0x1ull << ARM_TBFLAG_TBI1_SHIFT)
 
 /* some convenience accessor macros */
 #define ARM_TBFLAG_AARCH64_STATE(F) \
@@ -2222,6 +2226,10 @@ static inline bool 
arm_cpu_data_is_big_endian(CPUARMState *env)
 (((F) & ARM_TBFLAG_NS_MASK) >> ARM_TBFLAG_NS_SHIFT)
 #define ARM_TBFLAG_BE_DATA(F) \
 (((F) & ARM_TBFLAG_BE_DATA_MASK) >> ARM_TBFLAG_BE_DATA_SHIFT)
+#define ARM_TBFLAG_TBI0(F) \
+(((F) & ARM_TBFLAG_TBI0_MASK) >> ARM_TBFLAG_TBI0_SHIFT)
+#define ARM_TBFLAG_TBI1(F) \
+(((F) & ARM_TBFLAG_TBI1_MASK) >> ARM_TBFLAG_TBI1_SHIFT)
 
 static inline bool bswap_code(bool sctlr_b)
 {
@@ -2319,12 +2327,38 @@ static inline bool arm_cpu_bswap_data(CPUARMState *env)
 }
 #endif
 
+/**
+ * arm_regime_tbi0:
+ * @env: CPUARMState
+ * @mmu_idx: MMU index indicating required translation regime
+ *
+ * Extracts the TBI0 value from the appropriate TCR for the current EL
+ *
+ * Returns: the TBI0 value.
+ */
+extern uint32_t arm_regime_tbi0(CPUARMState *env, ARMMMUIdx mmu_idx);
+
+/**
+ * arm_regime_tbi1:
+ * @env: CPUARMState
+ * @mmu_idx: MMU index indicating required translation regime
+ *
+ * Extracts the TBI1 value from the appropriate TCR for the current EL
+ *
+ * Returns: the TBI1 value.
+ */
+extern uint32_t arm_regime_tbi1(CPUARMState *env, ARMMMUIdx mmu_idx);
+
 static inline void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
 target_ulong *cs_base, uint32_t *flags)
 {
+ARMMMUIdx mmu_idx = cpu_mmu_index(env, false);
 if (is_a64(env)) {
 *pc = env->pc;
 *flags = ARM_TBFLAG_AARCH64_STATE_MASK;
+/* Get control bits for tagged addresses */
+*flags |= (arm_regime_tbi0(env, mmu_idx) << ARM_TBFLAG_TBI0_SHIFT);
+*flags |= (arm_regime_tbi1(env, mmu_idx) << ARM_TBFLAG_TBI1_SHIFT);
 } else {
 *pc = env->regs[15];
 *flags = (env->thumb << ARM_TBFLAG_THUMB_SHIFT)
@@ -2343,7 +2377,8 @@ static inline void cpu_get_tb_cpu_state(CPUARMState *env, 
target_ulong *pc,
<< ARM_TBFLAG_XSCALE_CPAR_SHIFT);
 }
 
-*flags |= (cpu_mmu_index(env, false) << ARM_TBFLAG_MMUIDX_SHIFT);
+*flags |= (mmu_idx << ARM_TBFLAG_MMUIDX_SHIFT);
+
 /* The SS_ACTIVE and PSTATE_SS bits correspond to the state machine
  * states defined in the ARM ARM for software singlestep:
  *  SS_ACTIVE   PSTATE.SS   State
diff --git a/target-arm/helper.c b/target-arm/helper.c
index 25f612d..70e2742 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -6720,6 +6720,52 @@ static inline TCR *regime_tcr(CPUARMState *env, 
ARMMMUIdx mmu_idx)
 return &env->cp15.tcr_el[regime_el(env, mmu_idx)];
 }
 
+/* Returns TBI0 value for current regime el */
+uint32_t arm_regime_tbi0(CPUARMState *env, ARMMMUIdx mmu_idx)
+{
+TCR *tcr;
+uint32_t el;
+
+/* For EL0 and EL1, TBI is controlled by stage 1's TCR, so convert
+   * a stage 1+2 mmu index into the appropriate stage 1 mmu index.
+   */
+if (mmu_idx == ARMMMUIdx_S12NSE0 || mmu_idx == ARMMMUIdx_S12NSE1) {
+mmu_idx += ARMMMUIdx_S1NSE0;
+}
+
+tcr = regime_tcr(env, mmu_idx);
+el = regime_el(env, mmu_idx);
+
+if (el > 1) {
+return extract64(tcr->raw_tcr, 20, 1);
+} else {
+return extract64(tcr->raw_tcr, 37, 1);
+}
+}
+
+/* Returns TBI1 value for current regime el */
+uint32_t arm_regime_tbi1(CPUARMState *env, ARMMMUIdx mmu_idx)
+{
+TCR *tcr;
+uint32_t el;
+
+/* For EL0 and EL1, TBI is controlled by stage 1's TCR, so convert
+   * a stage 1+2 mmu index into the appropriate stage 1 mmu index.
+   */
+if (mmu_idx == ARMMMUIdx_S12NSE0 || mmu_idx == ARMMMUIdx_S12NSE1) {
+mmu_idx += ARMMMUIdx_S1NSE0;
+}
+
+

Re: [Qemu-devel] [Qemu-block] [PATCH v4 0/3] iotests: Fix test 162

2016-10-12 Thread Max Reitz
On 12.10.2016 10:55, Hao QingFeng wrote:
> Max,
> 
> Just a common question for this case, if sshx block driver wasn't built
> into qemu-img, this case would fail as below:

Good point, and thanks for bringing it up, but it's not directly linked
to this series other than by its subject, of course, so I'd rather add a
fix on top.

> exec /home/haoqf/KVMonz/qemu/tests/qemu-iotests/../../qemu-img info
> --image-opts driver=ssh,host=localhost,port=0.42,path=/foo
> qemu-img: Could not open
> 'driver=ssh,host=localhost,port=0.42,path=/foo': Unknown driver 'ssh'
> 
> Adding 162.notrun can bypass this case but it would skip it even if
> qemu-img has sshx block driver, in which case I think it should be run.
> 
> So How about adding a script to dynamically check at runtime if the
> current env qemu-img can meet the requirement to run the test or not?

Unfortunately, the list of block drivers listed by qemu-img --help will not
contain ssh if ssh is built as a module, which is possible.

This is a bug that should be fixed, but I'd rather do so in a separate
series from this one.

In any case, once it is fixed I'd rather just take the approach quorum
tests take already (e.g. test 081), which is something like:

test_ssh=$($QEMU_IMG --help | grep '^Supported formats:.* ssh\( \|$\)')
[ "$test_ssh" = "" ] && _notrun "ssh support required"

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH v8 6/6] Add common functions for SET_IRQS and GET_REGION_INFO ioctls

2016-10-12 Thread Kirti Wankhede


On 10/12/2016 4:48 AM, Alex Williamson wrote:
> On Tue, 11 Oct 2016 01:58:37 +0530
> Kirti Wankhede  wrote:
> 
>> Add common functions for SET_IRQS and to add capability buffer for
>> GET_REGION_INFO ioctls
> 
> Clearly should be two (or more) separate patches since SET_IRQS and
> REGION_INFO are unrelated changes.  Each of the two capabilities handled
> could possibly be separate patches as well.
> 

Ok. I'll have the two separated.

>  
...

>> @@ -754,35 +742,22 @@ static long vfio_pci_ioctl(void *device_data,
>>  } else if (cmd == VFIO_DEVICE_SET_IRQS) {
>>  struct vfio_irq_set hdr;
>>  u8 *data = NULL;
>> -int ret = 0;
>> +int max, ret = 0, data_size = 0;
>>  
>>  minsz = offsetofend(struct vfio_irq_set, count);
>>  
>>  if (copy_from_user(, (void __user *)arg, minsz))
>>  return -EFAULT;
>>  
>> -if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
>> -hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
>> -  VFIO_IRQ_SET_ACTION_TYPE_MASK))
>> -return -EINVAL;
>> -
>> -if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
>> -size_t size;
>> -int max = vfio_pci_get_irq_count(vdev, hdr.index);
>> +max = vfio_pci_get_irq_count(vdev, hdr.index);
>>  
>> -if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
>> -size = sizeof(uint8_t);
>> -else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
>> -size = sizeof(int32_t);
>> -else
>> -return -EINVAL;
>> -
>> -if (hdr.argsz - minsz < hdr.count * size ||
>> -hdr.start >= max || hdr.start + hdr.count > max)
>> -return -EINVAL;
> 
> 
> vfio_platform has very similar code that would also need to be updated.
>

Ok. Thanks for pointing that out. I'll update that too.


>> +ret = vfio_set_irqs_validate_and_prepare(, max, _size);
>> +if (ret)
>> +return ret;
>>  
>> +if (data_size) {
>>  data = memdup_user((void __user *)(arg + minsz),
>> -   hdr.count * size);
>> +data_size);
>>  if (IS_ERR(data))
>>  return PTR_ERR(data);
>>  }
>> @@ -790,7 +765,7 @@ static long vfio_pci_ioctl(void *device_data,
>>  mutex_lock(>igate);
>>  
>>  ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
>> -  hdr.start, hdr.count, data);
>> +hdr.start, hdr.count, data);
> 
> White space bogosity.
> 
>>  
>>  mutex_unlock(>igate);
>>  kfree(data);
>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
>> index e3e342861e04..0185d5fb2c85 100644
>> --- a/drivers/vfio/vfio.c
>> +++ b/drivers/vfio/vfio.c
>> @@ -1782,6 +1782,122 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, 
>> size_t offset)
>>  }
>>  EXPORT_SYMBOL_GPL(vfio_info_cap_shift);
>>  
>> +static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
>> +{
>> +struct vfio_info_cap_header *header;
>> +struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
>> +size_t size;
>> +
>> +size = sizeof(*sparse) + sparse->nr_areas *  sizeof(*sparse->areas);
>> +header = vfio_info_cap_add(caps, size,
>> +   VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
>> +if (IS_ERR(header))
>> +return PTR_ERR(header);
>> +
>> +sparse_cap = container_of(header,
>> +struct vfio_region_info_cap_sparse_mmap, header);
>> +sparse_cap->nr_areas = sparse->nr_areas;
>> +memcpy(sparse_cap->areas, sparse->areas,
>> +   sparse->nr_areas * sizeof(*sparse->areas));
>> +return 0;
>> +}
>> +
>> +static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
>> +{
>> +struct vfio_info_cap_header *header;
>> +struct vfio_region_info_cap_type *type_cap, *cap = cap_type;
>> +
>> +header = vfio_info_cap_add(caps, sizeof(*cap),
>> +   VFIO_REGION_INFO_CAP_TYPE, 1);
>> +if (IS_ERR(header))
>> +return PTR_ERR(header);
>> +
>> +type_cap = container_of(header, struct vfio_region_info_cap_type,
>> +header);
>> +type_cap->type = cap->type;
>> +type_cap->subtype = cap->subtype;
>> +return 0;
>> +}
> 
> Why can't we just do a memcpy of all the data past the header?  Do we
> need separate functions for these?
> 

In case of sparse_cap, data past header is variable, depends on
nr_areas. For region_type_cap, data is fixed. For both capabilities,
structures are different and id are 

[Qemu-devel] chardev's and fd's in monitors

2016-10-12 Thread Dr. David Alan Gilbert
Hi,
  I had a look at a couple of readline like libraries;
editline and linenoise.  A difficulty with using them is that
they both want fd's or FILE*'s; editline takes either but
from a brief look I think it's expecting to extract the fd.
That makes them tricky to integrate into qemu, where
the chardev's hide a whole bunch of non-fd things; in particular
tls, mux, ringbuffers etc.

If we could get away with just a FILE* then we could use fopencookie,
but that's GNU only.

Is there any sane way of shepherding all chardev's into having an
fd?

Once you had those then you could also use them in a separate thread.

Dave
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK



Re: [Qemu-devel] [PATCH v8 4/6] docs: Add Documentation for Mediated devices

2016-10-12 Thread Kirti Wankhede


On 10/12/2016 9:29 PM, Alex Williamson wrote:
> On Wed, 12 Oct 2016 20:43:48 +0530
> Kirti Wankhede  wrote:
> 
>> On 10/12/2016 7:22 AM, Tian, Kevin wrote:
 From: Kirti Wankhede [mailto:kwankh...@nvidia.com]
 Sent: Wednesday, October 12, 2016 4:45 AM  
>> +* mdev_supported_types:
>> +List of current supported mediated device types and its details are 
>> added
>> +in this directory in following format:
>> +
>> +|- 
>> +|--- Vendor-specific-attributes [optional]
>> +|--- mdev_supported_types
>> +| |--- 
>> +| |   |--- create
>> +| |   |--- name
>> +| |   |--- available_instances
>> +| |   |--- description /class
>> +| |   |--- [devices]
>> +| |--- 
>> +| |   |--- create
>> +| |   |--- name
>> +| |   |--- available_instances
>> +| |   |--- description /class
>> +| |   |--- [devices]
>> +| |--- 
>> +|  |--- create
>> +|  |--- name
>> +|  |--- available_instances
>> +|  |--- description /class
>> +|  |--- [devices]
>> +
>> +[TBD : description or class is yet to be decided. This will change.]  
>
> I thought that in previous discussions we had agreed to drop
> the  concept and use the name as the unique identifier.
> When reporting these types in libvirt we won't want to report
> the type id values - we'll want the name strings to be unique.
>  

 The 'name' might not be unique but type_id will be. For example that Neo
 pointed out in earlier discussion, virtual devices can come from two
 different physical devices, end user would be presented with what they
 had selected but there will be internal implementation differences. In
 that case 'type_id' will be unique.
  
>>>
>>> Hi, Kirti, my understanding is that Neo agreed to use an unique type
>>> string (if you still called it ), and then no need of additional
>>> 'name' field which can be put inside 'description' field. See below quote:
>>>   
>>
>> We had internal discussions about this within NVIDIA and found that
>> 'name' might not be unique where as 'type_id' would be unique. I'm
>> refering to Neo's mail after that, where Neo do pointed that out.
>>
>> https://lists.gnu.org/archive/html/qemu-devel/2016-09/msg07714.html
> 
> Everyone not privy to those internal discussions, including me, seems to
> think we dropped type_id and that if a vendor does not have a stable
> name, they can compose some sort of stable type description based on the
> name+id, or even vendor+id, ex. NVIDIA-11.  So please share why we
> haven't managed to kill off type_id yet.  No matter what internal
> representation each vendor driver has of "type_id" it seems possible
> for it to come up with stable string to define a given configuration.


The 'type_id' is unique and the 'name' is not; the name is just a
virtual device name/ human readable name. Because at this moment Intel
can't define a proper GPU class, we have to add a 'description' field
there as well to represent the features of this virtual device, once we
have all agreed with the GPU class and its mandatory attributes, the
'description' field can be removed. Here is an example,
type_id/type_name = NVIDIA_11,
name=M60-M0Q,
description=2560x1600, 2 displays, 512MB"

Neo's previous comment only applies to the situation where we will have
the GPU class or optional attributes defined and recognized by libvirt,
since that is not going to happen any time soon, we will have to have
the new 'description' field, and we don't want to have it mixed up with
'name' field.

We can definitely have something like name+id as Alex recommended to
remove the 'name' field, but it will just require libvirt to have more
logic to parse that string.

Thanks,
Kirti



[Qemu-devel] [PATCH] script/clean-includes: added duplicate #include check

2016-10-12 Thread Anand J
Added script to check duplicate #include entries. This check will scan and
print the files in which duplicate #include entries are present.

Script might output false positive entries as well. Such entries should
not be removed. So if it finds any duplicate entries script will terminate
with an exit status 1. Then each and every file should be checked manually
and corrected if necessary.

Added an additional option --ignore-duphead in the clean-includes
script to disable this check if all the duplicate #includes are
genuine. The check is enabled by default.

NOTE: Removed some of the genuine duplicate entries in the code base.

Signed-off-by: Anand J 
---
 accel.c |  1 -
 cputlb.c|  1 -
 gdbstub.c   |  1 -
 hw/i386/acpi-build.c|  1 -
 hw/microblaze/boot.c|  1 -
 hw/mips/mips_malta.c|  1 -
 hw/nvram/fw_cfg.c   |  1 -
 hw/pci-bridge/pci_expander_bridge.c |  1 -
 hw/ppc/ppc405_boards.c  |  1 -
 hw/ppc/spapr.c  |  1 -
 hw/timer/grlib_gptimer.c|  1 -
 hw/tpm/tpm_tis.c|  1 -
 hw/unicore32/puv3.c |  1 -
 hw/usb/dev-mtp.c|  1 -
 include/hw/i386/pc.h|  1 -
 monitor.c   |  2 --
 qemu-io-cmds.c  |  1 -
 qmp.c   |  1 -
 scripts/clean-includes  | 50 +++--
 target-i386/machine.c   |  3 ---
 target-mips/machine.c   |  1 -
 target-ppc/machine.c|  1 -
 target-ppc/mem_helper.c |  1 -
 target-sparc/machine.c  |  3 ---
 target-xtensa/translate.c   |  1 -
 tests/crypto-tls-x509-helpers.h |  3 ---
 tests/vhost-user-test.c |  2 --
 util/oslib-posix.c  |  1 -
 vl.c|  1 -
 29 files changed, 37 insertions(+), 49 deletions(-)

diff --git a/accel.c b/accel.c
index 403eb5e..b5a4210 100644
--- a/accel.c
+++ b/accel.c
@@ -25,7 +25,6 @@
 
 #include "qemu/osdep.h"
 #include "sysemu/accel.h"
-#include "hw/boards.h"
 #include "qemu-common.h"
 #include "sysemu/arch_init.h"
 #include "sysemu/sysemu.h"
diff --git a/cputlb.c b/cputlb.c
index 3c99c34..59b3969 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -19,7 +19,6 @@
 
 #include "qemu/osdep.h"
 #include "cpu.h"
-#include "exec/exec-all.h"
 #include "exec/memory.h"
 #include "exec/address-spaces.h"
 #include "exec/cpu_ldst.h"
diff --git a/gdbstub.c b/gdbstub.c
index ecea8c4..67eb028 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -31,7 +31,6 @@
 
 #define MAX_PACKET_LENGTH 4096
 
-#include "cpu.h"
 #include "qemu/sockets.h"
 #include "sysemu/kvm.h"
 #include "exec/semihost.h"
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index e999654..b2baa60 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -30,7 +30,6 @@
 #include "qom/cpu.h"
 #include "hw/i386/pc.h"
 #include "target-i386/cpu.h"
-#include "hw/timer/hpet.h"
 #include "hw/acpi/acpi-defs.h"
 #include "hw/acpi/acpi.h"
 #include "hw/acpi/cpu.h"
diff --git a/hw/microblaze/boot.c b/hw/microblaze/boot.c
index 9eebb1a..1834d22 100644
--- a/hw/microblaze/boot.c
+++ b/hw/microblaze/boot.c
@@ -30,7 +30,6 @@
 #include "qemu/option.h"
 #include "qemu/config-file.h"
 #include "qemu/error-report.h"
-#include "qemu-common.h"
 #include "sysemu/device_tree.h"
 #include "sysemu/sysemu.h"
 #include "hw/loader.h"
diff --git a/hw/mips/mips_malta.c b/hw/mips/mips_malta.c
index e90857e..61aa8eb 100644
--- a/hw/mips/mips_malta.c
+++ b/hw/mips/mips_malta.c
@@ -47,7 +47,6 @@
 #include "elf.h"
 #include "hw/timer/mc146818rtc.h"
 #include "hw/timer/i8254.h"
-#include "sysemu/block-backend.h"
 #include "sysemu/blockdev.h"
 #include "exec/address-spaces.h"
 #include "hw/sysbus.h" /* SysBusDevice */
diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c
index 92aa563..1f0c3e9 100644
--- a/hw/nvram/fw_cfg.c
+++ b/hw/nvram/fw_cfg.c
@@ -29,7 +29,6 @@
 #include "hw/isa/isa.h"
 #include "hw/nvram/fw_cfg.h"
 #include "hw/sysbus.h"
-#include "hw/boards.h"
 #include "trace.h"
 #include "qemu/error-report.h"
 #include "qemu/config-file.h"
diff --git a/hw/pci-bridge/pci_expander_bridge.c 
b/hw/pci-bridge/pci_expander_bridge.c
index 1cc598f..6ac187f 100644
--- a/hw/pci-bridge/pci_expander_bridge.c
+++ b/hw/pci-bridge/pci_expander_bridge.c
@@ -15,7 +15,6 @@
 #include "hw/pci/pci.h"
 #include "hw/pci/pci_bus.h"
 #include "hw/pci/pci_host.h"
-#include "hw/pci/pci_bus.h"
 #include "hw/pci/pci_bridge.h"
 #include "hw/i386/pc.h"
 #include "qemu/range.h"
diff --git a/hw/ppc/ppc405_boards.c b/hw/ppc/ppc405_boards.c
index 4b2f07a..d01798f 100644
--- a/hw/ppc/ppc405_boards.c
+++ b/hw/ppc/ppc405_boards.c
@@ -37,7 +37,6 @@
 #include "qemu/log.h"
 #include "qemu/error-report.h"
 #include "hw/loader.h"
-#include "sysemu/block-backend.h"
 #include 

Re: [Qemu-devel] [PATCH v1 2/2] gdbstub: Fix vCont behaviour

2016-10-12 Thread David Hildenbrand
On Wed, Oct 12, 2016 at 03:55:18PM +0200, Claudio Imbrenda wrote:
> On 12/10/16 15:15, David Hildenbrand wrote:
> >>> +for (cx = 0; ccpus && ccpus[cx]; cx++) {
> >>> +cpu_single_step(cpu, 0);
> > 
> > This looks suspicious
> 
> why? we set all cpus to single step, since that is the default, and then
> we clear the single-step property from all CPUs that should be restarted
> in normal mode, then we restart all CPUs. Those in single-step will
> indeed only perform one single step, the others will run freely (at
> least until the first single-step CPU stops again).

actually I was more concerned about calling it on "cpu" in a loop.

GDB will:
- single step one thread only (stopping all other)
- use vCont

as default. So this means quite some ioctls on every step with some VCPUs.
I doubt that it will really be a problem (e.g. for GDB single stepping
instead of setting breakpoints when returning from a function), but still I
want to have it said. (we actually only need 1 ioctl but call quite a lot).

> 
> >>> +}
> >>> +CPU_FOREACH(cpu) {
> >>> +cpu_resume(cpu);
> >>> +}
> > 
> > Claudio, did you have a look at how s->c_cpu is used later on? I remember 
> > that we
> > have to take care of some query reply packages.
> 
> yes, that's set by the H packet and used by the c,s,m,etc packets. vCont
> ignores it and doesn't change it
> (see here https://sourceware.org/gdb/onlinedocs/gdb/Packets.html )

I remember something different (also having to do with clients detaching and
re-attaching). Will have a look at the code when I have time.



Re: [Qemu-devel] [PATCH 05/22] qcow2-bitmap: structs and consts

2016-10-12 Thread Max Reitz
On 11.10.2016 13:50, Vladimir Sementsov-Ogievskiy wrote:
> On 01.10.2016 17:34, Max Reitz wrote:
>> On 30.09.2016 12:53, Vladimir Sementsov-Ogievskiy wrote:
>>> Create block/qcow2-bitmap.c
>>> Add data structures and constraints accordingly to docs/specs/qcow2.txt
>>>
>>> Signed-off-by: Vladimir Sementsov-Ogievskiy 
>>> ---
>>>   block/Makefile.objs  |  2 +-
>>>   block/qcow2-bitmap.c | 47
>>> +++
>>>   block/qcow2.h| 29 +
>>>   3 files changed, 77 insertions(+), 1 deletion(-)
>>>   create mode 100644 block/qcow2-bitmap.c
>>>
>>> diff --git a/block/Makefile.objs b/block/Makefile.objs
>>> index fa4d8b8..0f661bb 100644
>>> --- a/block/Makefile.objs
>>> +++ b/block/Makefile.objs
>>> @@ -1,5 +1,5 @@
>>>   block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o
>>> vvfat.o dmg.o
>>> -block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o
>>> qcow2-snapshot.o qcow2-cache.o
>>> +block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o
>>> qcow2-snapshot.o qcow2-cache.o qcow2-bitmap.o
>>>   block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o
>>> qed-cluster.o
>>>   block-obj-y += qed-check.o
>>>   block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o
>>> diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c
>>> new file mode 100644
>>> index 000..cd18b07
>>> --- /dev/null
>>> +++ b/block/qcow2-bitmap.c
>>> @@ -0,0 +1,47 @@
>>> +/*
>>> + * Bitmaps for the QCOW version 2 format
>>> + *
>>> + * Copyright (c) 2014-2016 Vladimir Sementsov-Ogievskiy
>>> + *
>>> + * This file is derived from qcow2-snapshot.c, original copyright:
>>> + * Copyright (c) 2004-2006 Fabrice Bellard
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person
>>> obtaining a copy
>>> + * of this software and associated documentation files (the
>>> "Software"), to deal
>>> + * in the Software without restriction, including without limitation
>>> the rights
>>> + * to use, copy, modify, merge, publish, distribute, sublicense,
>>> and/or sell
>>> + * copies of the Software, and to permit persons to whom the
>>> Software is
>>> + * furnished to do so, subject to the following conditions:
>>> + *
>>> + * The above copyright notice and this permission notice shall be
>>> included in
>>> + * all copies or substantial portions of the Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>>> EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>>> MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
>>> SHALL
>>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
>>> OR OTHER
>>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>>> ARISING FROM,
>>> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
>>> DEALINGS IN
>>> + * THE SOFTWARE.
>>> + */
>>> +
>>> +/* NOTICE: BME here means Bitmaps Extension and used as a namespace for
>>> + * _internal_ constants. Please do not use this _internal_
>>> abbreviation for
>>> + * other needs and/or outside of this file. */
>>> +
>>> +/* Bitmap directory entry constraints */
>>> +#define BME_MAX_TABLE_SIZE 0x800
>>> +#define BME_MAX_PHYS_SIZE 0x2000 /* 512 mb */
>> I suppose BME_MAX_TABLE_SIZE (8M) is greater than BME_MAX_PHYS_SIZE (512
>> MB) divided by the cluster size (>= 512; 512 MB / cluster_size <= 1 MB)
>> because fully zero or one clusters do not require any physical space?
>>
>> Makes some sense, but I can see that this might make give some trouble
>> when trying to serialize overly large bitmaps. But I guess that comes
>> later in this series, so I'll wait for that point.
>>
>> Another thing is that 512 MB is rather big. It gets worse: The bitmap
>> may only require 512 MB on disk, but with a maximum table size of 8 MB,
>> it can require up to 8M * cluster_size in memory (with just 64 MB of
>> disk space!) by using the "read as all zeroes" or "read as all ones"
>> flags. With the default cluster size of 64 kB, this would be 512 GB in
>> RAM. That sounds bad to me.
>>
>> Well, it is probably fine as long as the bitmap is not auto-loaded...
>> But we do have a flag for exactly that. So it seems to me that a
>> manipulated image can easily consume huge amounts of RAM on the host.
>>
>> So I think we also need some sane limitation on the in-RAM size of a
>> bitmap (which is BME_MAX_TABLE_SIZE * cluster_size, as far as I
>> understand). The question of course is, what is sane? For a server
>> system with no image manipulation possible from the outside, 1 GB may be
>> completely fine. But imagine you download some qcow2 image to your
>> laptop. Then, 1 GB may not be fine, actually.
>>
>> Maybe it would make sense to use a runtime-adjustable limit here?
> 
> Actualy BME_MAX_PHYS_SIZE is this limit:
> in check_constraints we have
> 
> uint64_t phys_bitmap_bytes =
> (uint64_t)h->bitmap_table_size * 

Re: [Qemu-devel] [PATCH 09/22] block: introduce persistent dirty bitmaps

2016-10-12 Thread Max Reitz
On 12.10.2016 14:30, Vladimir Sementsov-Ogievskiy wrote:
> On 12.10.2016 14:38, Vladimir Sementsov-Ogievskiy wrote:
>> On 07.10.2016 22:28, Max Reitz wrote:
>>> On 30.09.2016 12:53, Vladimir Sementsov-Ogievskiy wrote:
 New field BdrvDirtyBitmap.persistent means, that bitmap should be saved
 on bdrv_close, using format driver. Format driver should maintain
 bitmap
 storing.

 Signed-off-by: Vladimir Sementsov-Ogievskiy 
 ---
   block.c  | 30 ++
   block/dirty-bitmap.c | 27 +++
   block/qcow2-bitmap.c |  1 +
   include/block/block.h|  2 ++
   include/block/block_int.h|  2 ++
   include/block/dirty-bitmap.h |  6 ++
   6 files changed, 68 insertions(+)
>>> [...]
>>>
 diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
 index 623e1d1..0314581 100644
 --- a/block/dirty-bitmap.c
 +++ b/block/dirty-bitmap.c
>>> [...]
>>>
 @@ -555,3 +559,26 @@ bool bdrv_dirty_bitmap_get_autoload(const
 BdrvDirtyBitmap *bitmap)
   {
   return bitmap->autoload;
   }
 +
 +void bdrv_dirty_bitmap_set_persistance(BdrvDirtyBitmap *bitmap,
 +bool persistent)
 +{
 +bitmap->persistent = persistent;
>>> After some thinking, I think this function should be more complex: It
>>> should check whether the node the bitmap is attached to actually can
>>> handle persistent bitmaps and whether it would actually support storing
>>> *this* bitmap.
>>>
>>> For instance, a qcow2 node would not support writing overly large
>>> bitmaps (limited by BME_MAX_TABLE_SIZE and BME_MAX_PHYS_SIZE) or bitmaps
>>> with overly large granularities (BME_MAX_GRANULARITY_BITS) or bitmaps
>>> whose name is already occupied by some bitmap that is already stored in
>>> the file but has not been loaded.
>>>
>>> Checking this here will trivially prevent users from creating such
>>> bitmaps and will also preempt detection of such failures during
>>> bdrv_close() when they cannot be handled gracefully.
>>>
>>> Max
>>
>> Good point, but I can't do it exactly as you say, because I call this
>> function from qcow2_read_bitmaps, for just created bitmap and it
>> should not be checked and of course it's name is occupied..
> 
> So, I'll add an additional checking function, to call it from
> qmp_block_dirty_bitmap_add, if persistent parameter is set to true.

That would work just as well, yes. Thanks!

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH 09/22] block: introduce persistent dirty bitmaps

2016-10-12 Thread Max Reitz
On 11.10.2016 15:11, Vladimir Sementsov-Ogievskiy wrote:
> On 07.10.2016 20:54, Max Reitz wrote:
>> On 30.09.2016 12:53, Vladimir Sementsov-Ogievskiy wrote:
>>> New field BdrvDirtyBitmap.persistent means, that bitmap should be saved
>>> on bdrv_close, using format driver. Format driver should maintain bitmap
>>> storing.
>>>
>>> Signed-off-by: Vladimir Sementsov-Ogievskiy 
>>> ---
>>>   block.c  | 30 ++
>>>   block/dirty-bitmap.c | 27 +++
>>>   block/qcow2-bitmap.c |  1 +
>>>   include/block/block.h|  2 ++
>>>   include/block/block_int.h|  2 ++
>>>   include/block/dirty-bitmap.h |  6 ++
>>>   6 files changed, 68 insertions(+)
>>>
>>> diff --git a/block.c b/block.c
>>> index 804e3d4..1cde03a 100644
>>> --- a/block.c
>>> +++ b/block.c
>>> @@ -2196,6 +2196,7 @@ void bdrv_reopen_abort(BDRVReopenState
>>> *reopen_state)
>>>   static void bdrv_close(BlockDriverState *bs)
>>>   {
>>>   BdrvAioNotifier *ban, *ban_next;
>>> +Error *local_err = NULL;
>>> assert(!bs->job);
>>>   assert(!bs->refcnt);
>>> @@ -2204,6 +2205,10 @@ static void bdrv_close(BlockDriverState *bs)
>>>   bdrv_flush(bs);
>>>   bdrv_drain(bs); /* in case flush left pending I/O */
>>>   +bdrv_store_persistent_bitmaps(bs, _err);
>>> +if (local_err != NULL) {
>>> +error_report_err(local_err);
>>> +}
>> That seems pretty wrong to me. If the persistent bitmaps cannot be
>> stored, the node should not be closed to avoid loss of data.
>>
>>>   bdrv_release_named_dirty_bitmaps(bs);
>> Especially since the next function will just drop all the dirty bitmaps.
>>
>> I see the issue that bdrv_close() is only called by bdrv_delete() which
>> in turn is only called by bdrv_unref(); and how are you supposed to
>> react to bdrv_unref() failing?
>>
>> So I'm not sure how this issue should be addressed, but this is most
>> certainly not ideal. You should not just drop supposedly persistent
>> dirty bitmaps if they cannot be saved.
>>
>> We really should to have some way to keep the bitmap around if it cannot
>> be saved, but I don't know how to do that either.
>>
>> In any case, we should make sure that the node supports saving
>> persistent dirty bitmaps, because having persistent dirty bitmaps at a
>> node that does not support them is something we can and must prevent
>> beforehand.
>>
>> But I don't know how to handle failure if writing the dirty bitmap
>> fails. I guess one could argue that it's the same as bdrv_flush()
>> failing and thus can be handled in the same way, i.e. ignore it. I'm not
>> happy with that, but I'd accept it if there's no other way.
> 
> For now, the only usage of these bitmaps is incremental backup and
> bitmaps are not critical data. If we lost them we will just do full
> backup. If there will be some critical persistent bdrv dirty bitmaps in
> future, we can introduce a callback BdrvDirtyBitmap.store_failed for
> them, which will somehow handle that case.. Detach bitmap from bs and
> save it in memory, add qmp commands to raw-dump them, etc.. I

Yes, fine with me. Still, we should make an effort to detect the case
that some block driver will not be able to store a certain persistent
bitmap attached to one of its nodes as early as possible, ideally
already when the bitmap is created.

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH 06/22] qcow2: add dirty bitmaps extension

2016-10-12 Thread Max Reitz
On 11.10.2016 14:09, Vladimir Sementsov-Ogievskiy wrote:
> On 01.10.2016 17:46, Max Reitz wrote:
>> On 30.09.2016 12:53, Vladimir Sementsov-Ogievskiy wrote:
>>> Add dirty bitmap extension as specified in docs/specs/qcow2.txt.
>>> For now, just mirror extension header into Qcow2 state and check
>>> constraints.
>>>
>>> For now, disable image resize if it has bitmaps. It will be fixed later.
>>>
>>> Signed-off-by: Vladimir Sementsov-Ogievskiy 
>>> ---
>>>   block/qcow2.c | 83
>>> +++
>>>   block/qcow2.h |  4 +++
>>>   2 files changed, 87 insertions(+)
>>>
>>> diff --git a/block/qcow2.c b/block/qcow2.c
>>> index c079aa8..08c4ef9 100644
>>> --- a/block/qcow2.c
>>> +++ b/block/qcow2.c
>> [...]
>>
>>> @@ -162,6 +164,62 @@ static int
>>> qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
>>>   }
>>>   break;
>>>   +case QCOW2_EXT_MAGIC_DIRTY_BITMAPS:
>>> +ret = bdrv_pread(bs->file, offset, _ext, ext.len);
>> Overflows with ext.len > sizeof(bitmaps_ext).
>>
>> (ext.len < sizeof(bitmaps_ext) is also wrong, but less dramatically so.)
>>
>>> +if (ret < 0) {
>>> +error_setg_errno(errp, -ret, "ERROR: bitmaps_ext: "
>>> + "Could not read ext header");
>>> +return ret;
>>> +}
>>> +
>>> +if (bitmaps_ext.reserved32 != 0) {
>>> +error_setg_errno(errp, -ret, "ERROR: bitmaps_ext: "
>>> + "Reserved field is not zero.");
>> Please drop the full stop at the end.
> 
> what do you mean? goto to fail: here? or not stop at all, just print error?

The "." at the end of the message. :-)

(https://en.wikipedia.org/wiki/Full_stop)

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH] target-lm32: fix LOG_DIS operand order

2016-10-12 Thread Peter Maydell
On 12 October 2016 at 18:15, Michael Walle  wrote:
> The order of most opcodes with immediates was wrong (according to the
> reference manual) in the (debug) logging. Additionally, one operand for the
> andhi instruction was completely wrong. Fix these.
>
> Signed-off-by: Michael Walle 

Reviewed-by: Peter Maydell 

PS: the wcsr disassembly also looks to be wrong:
LOG_DIS("wcsr r%d, %d\n", dc->r1, dc->csr);
where the manual says this is "wcsr csr, rX"
(separate bug though really).

thanks
-- PMM



Re: [Qemu-devel] [Qemu-block] [PATCH v14 11/21] qapi: add integer range support for QObjectInputVisitor

2016-10-12 Thread Markus Armbruster
Kevin Wolf  writes:

> Am 12.10.2016 um 17:50 hat Markus Armbruster geschrieben:
>> "Daniel P. Berrange"  writes:
>> 
>> > The traditional CLI arg syntax allows two ways to specify
>> > integer lists, either one value per key, or a range of
>> > values per key. eg the following are identical:
>> >
>> >   -arg foo=5,foo=6,foo=7
>> >   -arg foo=5-7
>> >
>> > This extends the QObjectInputVisitor so that it is able
>> > to parse ranges and turn them into distinct list entries.
>> >
>> > This means that
>> >
>> >   -arg foo=5-7
>> >
>> > is treated as equivalent to
>> >
>> >   -arg foo.0=5,foo.1=6,foo.2=7
>> >
>> > Edge case tests are copied from test-opts-visitor to
>> > ensure identical behaviour when parsing.
>> >
>> > Signed-off-by: Daniel P. Berrange 
>
>> > @@ -329,21 +335,87 @@ static void 
>> > qobject_input_type_int64_autocast(Visitor *v, const char *name,
>> >int64_t *obj, Error **errp)
>> >  {
>> >  QObjectInputVisitor *qiv = to_qiv(v);
>> > -QString *qstr = qobject_to_qstring(qobject_input_get_object(qiv, name,
>> > -true));
>> > +QString *qstr;
>> >  int64_t ret;
>> > +const char *end = NULL;
>> > +StackObject *tos;
>> > +bool inlist = false;
>> > +
>> > +/* Preferentially generate values from a range, before
>> > + * trying to consume another QList element */
>> > +tos = QSLIST_FIRST(>stack);
>> > +if (tos) {
>> > +if ((int64_t)tos->range_val < (int64_t)tos->range_limit) {
>> > +*obj = tos->range_val + 1;
>> > +tos->range_val++;
>> 
>> Roundabout way to write
>> 
>>*obj = tos->range_val++;
>
> *obj = ++tos->range_val, actually.

Of course, thanks.



Re: [Qemu-devel] [PATCH] vfio: Fix vfio_rtl8168_quirk_data_read address offset

2016-10-12 Thread Thorsten Kohfeldt


Am 10.10.2016 um 17:18 schrieb Alex Williamson:

On Sun, 9 Oct 2016 19:56:03 +0200
Thorsten Kohfeldt  wrote:


From: Thorsten Kohfeldt 
Date: Sat, 24 Sep 2016 20:43:20 +0200
Subject: [PATCH] vfio: Fix vfio_rtl8168_quirk_data_read address offset

Introductory comment for rtl8168 VFIO MSI-X quirk states:
At BAR2 offset 0x70 there is a dword data register,
 offset 0x74 is a dword address register.
vfio: vfio_bar_read(:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data

Thus, correct offset for data read is 0x70,
but function vfio_rtl8168_quirk_data_read() wrongfully uses offset 0x74.

Signed-off-by: Thorsten Kohfeldt 


I need a real email address for these, can I replace this with your
gmx.de email?  Thanks,

Alex


Yes, thank you for taking over from here.

Regards,

Thorsten


---
  hw/vfio/pci-quirks.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
index bec694c..1e97bc4 100644
--- a/hw/vfio/pci-quirks.c
+++ b/hw/vfio/pci-quirks.c
@@ -898,7 +898,7 @@ static uint64_t vfio_rtl8168_quirk_data_read(void *opaque,
  {
  VFIOrtl8168Quirk *rtl = opaque;
  VFIOPCIDevice *vdev = rtl->vdev;
-uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size);
+uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x70, size);

  if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
  hwaddr offset = rtl->addr & 0xfff;







Re: [Qemu-devel] [kvm-unit-tests PATCHv6 2/3] arm: pmu: Check cycle count increases

2016-10-12 Thread Christopher Covington
Hi Wei,

On 10/12/2016 11:49 AM, Wei Huang wrote:
> On 10/11/2016 01:40 PM, Christopher Covington wrote:
>> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing,
>> even for the smallest delta of two subsequent reads.
>>
>> Signed-off-by: Christopher Covington 
>> Reviewed-by: Andrew Jones 
>> ---
>>  arm/pmu.c | 60 
>>  1 file changed, 60 insertions(+)
>>
>> diff --git a/arm/pmu.c b/arm/pmu.c
>> index 42d0ee1..4334de4 100644
>> --- a/arm/pmu.c
>> +++ b/arm/pmu.c
>> @@ -14,6 +14,8 @@
>>   */
>>  #include "libcflat.h"
>>  
>> +#define NR_SAMPLES 10
>> +
>>  #if defined(__arm__)
>>  static inline uint32_t get_pmcr(void)
>>  {
>> @@ -22,6 +24,25 @@ static inline uint32_t get_pmcr(void)
>>  asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret));
>>  return ret;
>>  }
>> +
>> +static inline void set_pmcr(uint32_t pmcr)
>> +{
>> +asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (pmcr));
>> +}
>> +
>> +/*
>> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, 
>> returning 64
>> + * bits doesn't seem worth the trouble when differential usage of the 
>> result is
>> + * expected (with differences that can easily fit in 32 bits). So just 
>> return
>> + * the lower 32 bits of the cycle count in AArch32.
>> + */
>> +static inline unsigned long get_pmccntr(void)
>> +{
>> +unsigned long cycles;
>> +
>> +asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles));
>> +return cycles;
>> +}
>>  #elif defined(__aarch64__)
>>  static inline uint32_t get_pmcr(void)
>>  {
>> @@ -30,6 +51,19 @@ static inline uint32_t get_pmcr(void)
>>  asm volatile("mrs %0, pmcr_el0" : "=r" (ret));
>>  return ret;
>>  }
>> +
>> +static inline void set_pmcr(uint32_t pmcr)
>> +{
>> +asm volatile("msr pmcr_el0, %0" : : "r" (pmcr));
>> +}
>> +
>> +static inline unsigned long get_pmccntr(void)
>> +{
>> +unsigned long cycles;
>> +
>> +asm volatile("mrs %0, pmccntr_el0" : "=r" (cycles));
>> +return cycles;
>> +}
>>  #endif
>>  
>>  struct pmu_data {
>> @@ -72,11 +106,37 @@ static bool check_pmcr(void)
>>  return pmu.implementer != 0;
>>  }
>>  
>> +/*
>> + * Ensure that the cycle counter progresses between back-to-back reads.
>> + */
>> +static bool check_cycles_increase(void)
>> +{
>> +struct pmu_data pmu = {0};
> 
> Compilation error on my machine:
> 
> arm/pmu.c: In function ‘check_cycles_increase’:
> arm/pmu.c:148:9: error: missing braces around initializer
> [-Werror=missing-braces]
>   struct pmu_data pmu = {0};
> 
> Same for Patch 3.

"...So your compiler complains about {0}? Is there a problem besides the
warning? If not, then I'm still a bit inclined to keep the code neat. The
warnings will go away with compiler updates."

https://lists.gnu.org/archive/html/qemu-devel/2015-10/msg06064.html

Thanks,
Cov
-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc. Qualcomm Technologies, Inc. is a member of the Code
Aurora Forum, a Linux Foundation Collaborative Project.



[Qemu-devel] [PATCH v2 1/4] target-arm: Implement dummy MDCCINT_EL1

2016-10-12 Thread Peter Maydell
MDCCINT_EL1 is part of the DCC debugger communication
channel between the CPU and an attached external debugger.
QEMU doesn't implement this, but since Linux may try
to access this register we need to provide at least
a dummy implementation.

Signed-off-by: Peter Maydell 
Reviewed-by: Edgar E. Iglesias 
---
 target-arm/helper.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/target-arm/helper.c b/target-arm/helper.c
index 25f612d..23792ab 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -4060,6 +4060,14 @@ static const ARMCPRegInfo debug_cp_reginfo[] = {
   .cp = 14, .opc1 = 0, .crn = 0, .crm = 7, .opc2 = 0,
   .access = PL1_RW, .accessfn = access_tda,
   .type = ARM_CP_NOP },
+/* Dummy MDCCINT_EL1, since we don't implement the Debug Communications
+ * Channel but Linux may try to access this register. The 32-bit
+ * alias is DBGDCCINT.
+ */
+{ .name = "MDCCINT_EL1", .state = ARM_CP_STATE_BOTH,
+  .cp = 14, .opc0 = 2, .opc1 = 0, .crn = 0, .crm = 2, .opc2 = 0,
+  .access = PL1_RW, .accessfn = access_tda,
+  .type = ARM_CP_NOP },
 REGINFO_SENTINEL
 };
 
-- 
2.7.4




[Qemu-devel] [PATCH v2 3/4] hw/intc/arm_gicv3: Fix ICC register tracepoints

2016-10-12 Thread Peter Maydell
Fix some problems with the tracepoints for ICC register reads
and writes:
 * tracepoints for ICC_BPR, ICC_APR, ICC_IGRPEN,
   ICC_EOIR were not printing the <n> that indicated whether
   the access was to the group 0 or 1 register
 * the ICC_IGRPEN1_EL3 read function was not actually calling
   the associated tracepoint
 * the ICC_BPR write function was incorrectly calling the
   tracepoint for ICC_PMR writes

Signed-off-by: Peter Maydell 
Acked-by: Edgar E. Iglesias 
---
 hw/intc/arm_gicv3_cpuif.c | 23 +++
 hw/intc/trace-events  | 14 +++---
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
index 4633172..bca30c4 100644
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@@ -454,7 +454,8 @@ static void icc_eoir_write(CPUARMState *env, const 
ARMCPRegInfo *ri,
 int irq = value & 0xff;
 int grp;
 
-trace_gicv3_icc_eoir_write(gicv3_redist_affid(cs), value);
+trace_gicv3_icc_eoir_write(ri->crm == 8 ? 0 : 1,
+   gicv3_redist_affid(cs), value);
 
 if (ri->crm == 8) {
 /* EOIR0 */
@@ -542,7 +543,7 @@ static uint64_t icc_bpr_read(CPUARMState *env, const 
ARMCPRegInfo *ri)
 bpr = MIN(bpr, 7);
 }
 
-trace_gicv3_icc_bpr_read(gicv3_redist_affid(cs), bpr);
+trace_gicv3_icc_bpr_read(ri->crm == 8 ? 0 : 1, gicv3_redist_affid(cs), 
bpr);
 
 return bpr;
 }
@@ -553,7 +554,8 @@ static void icc_bpr_write(CPUARMState *env, const 
ARMCPRegInfo *ri,
 GICv3CPUState *cs = icc_cs_from_env(env);
 int grp = (ri->crm == 8) ? GICV3_G0 : GICV3_G1;
 
-trace_gicv3_icc_pmr_write(gicv3_redist_affid(cs), value);
+trace_gicv3_icc_bpr_write(ri->crm == 8 ? 0 : 1,
+  gicv3_redist_affid(cs), value);
 
 if (grp == GICV3_G1 && gicv3_use_ns_bank(env)) {
 grp = GICV3_G1NS;
@@ -591,7 +593,7 @@ static uint64_t icc_ap_read(CPUARMState *env, const 
ARMCPRegInfo *ri)
 
 value = cs->icc_apr[grp][regno];
 
-trace_gicv3_icc_ap_read(regno, gicv3_redist_affid(cs), value);
+trace_gicv3_icc_ap_read(ri->crm & 1, regno, gicv3_redist_affid(cs), value);
 return value;
 }
 
@@ -603,7 +605,7 @@ static void icc_ap_write(CPUARMState *env, const 
ARMCPRegInfo *ri,
 int regno = ri->opc2 & 3;
 int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1;
 
-trace_gicv3_icc_ap_write(regno, gicv3_redist_affid(cs), value);
+trace_gicv3_icc_ap_write(ri->crm & 1, regno, gicv3_redist_affid(cs), 
value);
 
 if (grp == GICV3_G1 && gicv3_use_ns_bank(env)) {
 grp = GICV3_G1NS;
@@ -820,7 +822,8 @@ static uint64_t icc_igrpen_read(CPUARMState *env, const 
ARMCPRegInfo *ri)
 }
 
 value = cs->icc_igrpen[grp];
-trace_gicv3_icc_igrpen_read(gicv3_redist_affid(cs), value);
+trace_gicv3_icc_igrpen_read(ri->opc2 & 1 ? 1 : 0,
+gicv3_redist_affid(cs), value);
 return value;
 }
 
@@ -830,7 +833,8 @@ static void icc_igrpen_write(CPUARMState *env, const 
ARMCPRegInfo *ri,
 GICv3CPUState *cs = icc_cs_from_env(env);
 int grp = ri->opc2 & 1 ? GICV3_G1 : GICV3_G0;
 
-trace_gicv3_icc_igrpen_write(gicv3_redist_affid(cs), value);
+trace_gicv3_icc_igrpen_write(ri->opc2 & 1 ? 1 : 0,
+ gicv3_redist_affid(cs), value);
 
 if (grp == GICV3_G1 && gicv3_use_ns_bank(env)) {
 grp = GICV3_G1NS;
@@ -843,9 +847,12 @@ static void icc_igrpen_write(CPUARMState *env, const 
ARMCPRegInfo *ri,
 static uint64_t icc_igrpen1_el3_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
 GICv3CPUState *cs = icc_cs_from_env(env);
+uint64_t value;
 
 /* IGRPEN1_EL3 bits 0 and 1 are r/w aliases into IGRPEN1_EL1 NS and S */
-return cs->icc_igrpen[GICV3_G1NS] | (cs->icc_igrpen[GICV3_G1] << 1);
+value = cs->icc_igrpen[GICV3_G1NS] | (cs->icc_igrpen[GICV3_G1] << 1);
+trace_gicv3_icc_igrpen1_el3_read(gicv3_redist_affid(cs), value);
+return value;
 }
 
 static void icc_igrpen1_el3_write(CPUARMState *env, const ARMCPRegInfo *ri,
diff --git a/hw/intc/trace-events b/hw/intc/trace-events
index f12192c..4a23848 100644
--- a/hw/intc/trace-events
+++ b/hw/intc/trace-events
@@ -84,12 +84,12 @@ gic_acknowledge_irq(int cpu, int irq) "cpu %d acknowledged 
irq %d"
 # hw/intc/arm_gicv3_cpuif.c
 gicv3_icc_pmr_read(uint32_t cpu, uint64_t val) "GICv3 ICC_PMR read cpu %x 
value 0x%" PRIx64
 gicv3_icc_pmr_write(uint32_t cpu, uint64_t val) "GICv3 ICC_PMR write cpu %x 
value 0x%" PRIx64
-gicv3_icc_bpr_read(uint32_t cpu, uint64_t val) "GICv3 ICC_BPR read cpu %x 
value 0x%" PRIx64
-gicv3_icc_bpr_write(uint32_t cpu, uint64_t val) "GICv3 ICC_BPR write cpu %x 
value 0x%" PRIx64
-gicv3_icc_ap_read(int regno, uint32_t cpu, uint64_t val) "GICv3 ICC_AP%dR read 
cpu %x value 0x%" PRIx64
-gicv3_icc_ap_write(int regno, uint32_t cpu, uint64_t val) "GICv3 ICC_AP%dR 
write cpu %x value 0x%" PRIx64

[Qemu-devel] [PATCH v2 4/4] hw/char/pl011: Add trace events

2016-10-12 Thread Peter Maydell
Add some trace events for the pl011 UART model.

Signed-off-by: Peter Maydell 
---
 hw/char/pl011.c  | 71 
 hw/char/trace-events |  9 +++
 2 files changed, 59 insertions(+), 21 deletions(-)

diff --git a/hw/char/pl011.c b/hw/char/pl011.c
index 786e605..1a7911f 100644
--- a/hw/char/pl011.c
+++ b/hw/char/pl011.c
@@ -11,6 +11,7 @@
 #include "hw/sysbus.h"
 #include "sysemu/char.h"
 #include "qemu/log.h"
+#include "trace.h"
 
 #define TYPE_PL011 "pl011"
 #define PL011(obj) OBJECT_CHECK(PL011State, (obj), TYPE_PL011)
@@ -58,6 +59,7 @@ static void pl011_update(PL011State *s)
 uint32_t flags;
 
 flags = s->int_level & s->int_enabled;
+trace_pl011_irq_state(flags != 0);
 qemu_set_irq(s->irq, flags != 0);
 }
 
@@ -66,10 +68,8 @@ static uint64_t pl011_read(void *opaque, hwaddr offset,
 {
 PL011State *s = (PL011State *)opaque;
 uint32_t c;
+uint64_t r;
 
-if (offset >= 0xfe0 && offset < 0x1000) {
-return s->id[(offset - 0xfe0) >> 2];
-}
 switch (offset >> 2) {
 case 0: /* UARTDR */
 s->flags &= ~PL011_FLAG_RXFF;
@@ -84,41 +84,62 @@ static uint64_t pl011_read(void *opaque, hwaddr offset,
 }
 if (s->read_count == s->read_trigger - 1)
 s->int_level &= ~ PL011_INT_RX;
+trace_pl011_read_fifo(s->read_count);
 s->rsr = c >> 8;
 pl011_update(s);
 if (s->chr) {
 qemu_chr_accept_input(s->chr);
 }
-return c;
+r = c;
+break;
 case 1: /* UARTRSR */
-return s->rsr;
+r = s->rsr;
+break;
 case 6: /* UARTFR */
-return s->flags;
+r = s->flags;
+break;
 case 8: /* UARTILPR */
-return s->ilpr;
+r = s->ilpr;
+break;
 case 9: /* UARTIBRD */
-return s->ibrd;
+r = s->ibrd;
+break;
 case 10: /* UARTFBRD */
-return s->fbrd;
+r = s->fbrd;
+break;
 case 11: /* UARTLCR_H */
-return s->lcr;
+r = s->lcr;
+break;
 case 12: /* UARTCR */
-return s->cr;
+r = s->cr;
+break;
 case 13: /* UARTIFLS */
-return s->ifl;
+r = s->ifl;
+break;
 case 14: /* UARTIMSC */
-return s->int_enabled;
+r = s->int_enabled;
+break;
 case 15: /* UARTRIS */
-return s->int_level;
+r = s->int_level;
+break;
 case 16: /* UARTMIS */
-return s->int_level & s->int_enabled;
+r = s->int_level & s->int_enabled;
+break;
 case 18: /* UARTDMACR */
-return s->dmacr;
+r = s->dmacr;
+break;
+case 0x3f8 ... 0x400:
+r = s->id[(offset - 0xfe0) >> 2];
+break;
 default:
 qemu_log_mask(LOG_GUEST_ERROR,
   "pl011_read: Bad offset %x\n", (int)offset);
-return 0;
+r = 0;
+break;
 }
+
+trace_pl011_read(offset, r);
+return r;
 }
 
 static void pl011_set_read_trigger(PL011State *s)
@@ -141,6 +162,8 @@ static void pl011_write(void *opaque, hwaddr offset,
 PL011State *s = (PL011State *)opaque;
 unsigned char ch;
 
+trace_pl011_write(offset, value);
+
 switch (offset >> 2) {
 case 0: /* UARTDR */
 /* ??? Check if transmitter is enabled.  */
@@ -207,11 +230,15 @@ static void pl011_write(void *opaque, hwaddr offset,
 static int pl011_can_receive(void *opaque)
 {
 PL011State *s = (PL011State *)opaque;
+int r;
 
-if (s->lcr & 0x10)
-return s->read_count < 16;
-else
-return s->read_count < 1;
+if (s->lcr & 0x10) {
+r = s->read_count < 16;
+} else {
+r = s->read_count < 1;
+}
+trace_pl011_can_receive(s->lcr, s->read_count, r);
+return r;
 }
 
 static void pl011_put_fifo(void *opaque, uint32_t value)
@@ -225,7 +252,9 @@ static void pl011_put_fifo(void *opaque, uint32_t value)
 s->read_fifo[slot] = value;
 s->read_count++;
 s->flags &= ~PL011_FLAG_RXFE;
+trace_pl011_put_fifo(value, s->read_count);
 if (!(s->lcr & 0x10) || s->read_count == 16) {
+trace_pl011_put_fifo_full();
 s->flags |= PL011_FLAG_RXFF;
 }
 if (s->read_count == s->read_trigger) {
diff --git a/hw/char/trace-events b/hw/char/trace-events
index d53577c..7fd48bb 100644
--- a/hw/char/trace-events
+++ b/hw/char/trace-events
@@ -47,3 +47,12 @@ escc_sunkbd_event_in(int ch, const char *name, int down) 
"QKeyCode 0x%2.2x [%s],
 escc_sunkbd_event_out(int ch) "Translated keycode 0x%2.2x"
 escc_kbd_command(int val) "Command %d"
 escc_sunmouse_event(int dx, int dy, int buttons_state) "dx=%d dy=%d 
buttons=%01x"
+
+# hw/char/pl011.c
+pl011_irq_state(int level) "irq state %d"
+pl011_read(uint32_t addr, uint32_t value) "addr 0x%08x value 0x%08x"
+pl011_read_fifo(int read_count) "FIFO read, read_count now %d"
+pl011_write(uint32_t addr, uint32_t value) "addr 0x%08x 

  1   2   3   4   >