date:20161024

[Qemu-devel] [PATCH v3 07/13] xen: Move xenstore cleanup and mkdir functions

2016-10-24 Thread Emil Condrea

The names of the functions moved to xen_pvdev.c:
 * xenstore_cleanup_dir
 * xen_config_cleanup
 * xenstore_mkdir

Signed-off-by: Emil Condrea 
Acked-by: Anthony PERARD 
---
 hw/xen/xen_backend.c | 49 -
 hw/xen/xen_pvdev.c   | 51 +++
 2 files changed, 51 insertions(+), 49 deletions(-)

diff --git a/hw/xen/xen_backend.c b/hw/xen/xen_backend.c
index 2875e7c..216072d 100644
--- a/hw/xen/xen_backend.c
+++ b/hw/xen/xen_backend.c
@@ -47,57 +47,8 @@ struct xs_handle *xenstore = NULL;
 const char *xen_protocol;
 
 /* private */
-struct xs_dirs {
-char *xs_dir;
-QTAILQ_ENTRY(xs_dirs) list;
-};
-static QTAILQ_HEAD(xs_dirs_head, xs_dirs) xs_cleanup =
-QTAILQ_HEAD_INITIALIZER(xs_cleanup);
-
 static int debug;
 
-static void xenstore_cleanup_dir(char *dir)
-{
-struct xs_dirs *d;
-
-d = g_malloc(sizeof(*d));
-d->xs_dir = dir;
-QTAILQ_INSERT_TAIL(_cleanup, d, list);
-}
-
-void xen_config_cleanup(void)
-{
-struct xs_dirs *d;
-
-QTAILQ_FOREACH(d, _cleanup, list) {
-xs_rm(xenstore, 0, d->xs_dir);
-}
-}
-
-int xenstore_mkdir(char *path, int p)
-{
-struct xs_permissions perms[2] = {
-{
-.id= 0, /* set owner: dom0 */
-}, {
-.id= xen_domid,
-.perms = p,
-}
-};
-
-if (!xs_mkdir(xenstore, 0, path)) {
-xen_be_printf(NULL, 0, "xs_mkdir %s: failed\n", path);
-return -1;
-}
-xenstore_cleanup_dir(g_strdup(path));
-
-if (!xs_set_permissions(xenstore, 0, path, perms, 2)) {
-xen_be_printf(NULL, 0, "xs_set_permissions %s: failed\n", path);
-return -1;
-}
-return 0;
-}
-
 int xenstore_write_be_str(struct XenDevice *xendev, const char *node, const 
char *val)
 {
 return xenstore_write_str(xendev->be, node, val);
diff --git a/hw/xen/xen_pvdev.c b/hw/xen/xen_pvdev.c
index 96ed2a3..e432d30 100644
--- a/hw/xen/xen_pvdev.c
+++ b/hw/xen/xen_pvdev.c
@@ -24,11 +24,62 @@
 
 /* private */
 static int debug;
+
+struct xs_dirs {
+char *xs_dir;
+QTAILQ_ENTRY(xs_dirs) list;
+};
+
+static QTAILQ_HEAD(xs_dirs_head, xs_dirs) xs_cleanup =
+QTAILQ_HEAD_INITIALIZER(xs_cleanup);
+
 static QTAILQ_HEAD(XenDeviceHead, XenDevice) xendevs =
 QTAILQ_HEAD_INITIALIZER(xendevs);
 
 /* - */
 
+static void xenstore_cleanup_dir(char *dir)
+{
+struct xs_dirs *d;
+
+d = g_malloc(sizeof(*d));
+d->xs_dir = dir;
+QTAILQ_INSERT_TAIL(_cleanup, d, list);
+}
+
+void xen_config_cleanup(void)
+{
+struct xs_dirs *d;
+
+QTAILQ_FOREACH(d, _cleanup, list) {
+xs_rm(xenstore, 0, d->xs_dir);
+}
+}
+
+int xenstore_mkdir(char *path, int p)
+{
+struct xs_permissions perms[2] = {
+{
+.id= 0, /* set owner: dom0 */
+}, {
+.id= xen_domid,
+.perms = p,
+}
+};
+
+if (!xs_mkdir(xenstore, 0, path)) {
+xen_be_printf(NULL, 0, "xs_mkdir %s: failed\n", path);
+return -1;
+}
+xenstore_cleanup_dir(g_strdup(path));
+
+if (!xs_set_permissions(xenstore, 0, path, perms, 2)) {
+xen_be_printf(NULL, 0, "xs_set_permissions %s: failed\n", path);
+return -1;
+}
+return 0;
+}
+
 int xenstore_write_str(const char *base, const char *node, const char *val)
 {
 char abspath[XEN_BUFSIZE];
-- 
1.9.1

Re: [Qemu-devel] [PATCH] hw/arm/pxa2xx: Set value default values for CCCR and CKEN on PXA255

2016-10-24 Thread no-reply

Hi,

Your series seems to have some coding style problems. See output below for
more information:

Type: series
Message-id: 1477361273-1-1-git-send-email-li...@roeck-us.net
Subject: [Qemu-devel] [PATCH] hw/arm/pxa2xx: Set value default values for CCCR 
and CKEN on PXA255

=== TEST SCRIPT BEGIN ===
#!/bin/bash

BASE=base
n=1
total=$(git log --oneline $BASE.. | wc -l)
failed=0

# Useful git options
git config --local diff.renamelimit 0
git config --local diff.renames True

commits="$(git log --format=%H --reverse $BASE..)"
for c in $commits; do
echo "Checking PATCH $n/$total: $(git show --no-patch --format=%s $c)..."
if ! git show $c --format=email | ./scripts/checkpatch.pl --mailback -; then
failed=1
echo
fi
n=$((n+1))
done

exit $failed
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
Switched to a new branch 'test'
0140108 hw/arm/pxa2xx: Set value default values for CCCR and CKEN on PXA255

=== OUTPUT BEGIN ===
Checking PATCH 1/1: hw/arm/pxa2xx: Set value default values for CCCR and CKEN 
on PXA255...
ERROR: code indent should never use tabs
#24: FILE: hw/arm/pxa2xx.c:2270:
+s->cm_regs[CCCR >> 2] = 0x0121;^I/* from datasheet */$

ERROR: code indent should never use tabs
#25: FILE: hw/arm/pxa2xx.c:2271:
+s->cm_regs[CKEN >> 2] = 0x00017def;^I/* from datasheet */$

total: 2 errors, 0 warnings, 10 lines checked

Your patch has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

=== OUTPUT END ===

Test command exited with code: 1


---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-de...@freelists.org

[Qemu-devel] [PATCH v3 02/13] xen: Fix coding style warnings

2016-10-24 Thread Emil Condrea

Fixes:
 * WARNING: line over 80 characters

Signed-off-by: Emil Condrea 
---
 hw/block/xen_disk.c  |  3 ++-
 hw/char/xen_console.c|  3 ++-
 hw/display/xenfb.c   |  6 --
 hw/net/xen_nic.c | 12 
 hw/xen/xen_backend.c | 15 ++-
 include/hw/xen/xen_backend.h |  8 +---
 6 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
index 1292a4b..5b037e7 100644
--- a/hw/block/xen_disk.c
+++ b/hw/block/xen_disk.c
@@ -1068,7 +1068,8 @@ static int blk_connect(struct XenDevice *xendev)
 blk_set_enable_write_cache(blkdev->blk, !writethrough);
 } else {
 /* setup via qemu cmdline -> already setup for us */
-xen_be_printf(>xendev, 2, "get configured bdrv (cmdline 
setup)\n");
+xen_be_printf(>xendev, 2,
+  "get configured bdrv (cmdline setup)\n");
 blkdev->blk = blk_by_legacy_dinfo(blkdev->dinfo);
 if (blk_is_read_only(blkdev->blk) && !readonly) {
 xen_be_printf(>xendev, 0, "Unexpected read-only drive");
diff --git a/hw/char/xen_console.c b/hw/char/xen_console.c
index d236a46..9ae9558 100644
--- a/hw/char/xen_console.c
+++ b/hw/char/xen_console.c
@@ -248,7 +248,8 @@ static int con_initialise(struct XenDevice *xendev)
 qemu_chr_fe_set_handlers(>chr, xencons_can_receive,
  xencons_receive, NULL, con, NULL, true);
 
-xen_be_printf(xendev, 1, "ring mfn %d, remote port %d, local port %d, 
limit %zd\n",
+xen_be_printf(xendev, 1,
+  "ring mfn %d, remote port %d, local port %d, limit %zd\n",
  con->ring_ref,
  con->xendev.remote_port,
  con->xendev.local_port,
diff --git a/hw/display/xenfb.c b/hw/display/xenfb.c
index eaa1fce..d458fc1 100644
--- a/hw/display/xenfb.c
+++ b/hw/display/xenfb.c
@@ -561,7 +561,8 @@ static int xenfb_configure_fb(struct XenFB *xenfb, size_t 
fb_len_lim,
 xenfb->offset = offset;
 xenfb->up_fullscreen = 1;
 xenfb->do_resize = 1;
-xen_be_printf(>c.xendev, 1, "framebuffer %dx%dx%d offset %d stride 
%d\n",
+xen_be_printf(>c.xendev, 1,
+  "framebuffer %dx%dx%d offset %d stride %d\n",
   width, height, depth, offset, row_stride);
 return 0;
 }
@@ -729,7 +730,8 @@ static void xenfb_update(void *opaque)
 break;
 }
 dpy_gfx_replace_surface(xenfb->c.con, surface);
-xen_be_printf(>c.xendev, 1, "update: resizing: %dx%d @ %d 
bpp%s\n",
+xen_be_printf(>c.xendev, 1,
+  "update: resizing: %dx%d @ %d bpp%s\n",
   xenfb->width, xenfb->height, xenfb->depth,
   is_buffer_shared(surface) ? " (shared)" : "");
 xenfb->up_fullscreen = 1;
diff --git a/hw/net/xen_nic.c b/hw/net/xen_nic.c
index 9d93466..dbf3a89 100644
--- a/hw/net/xen_nic.c
+++ b/hw/net/xen_nic.c
@@ -140,7 +140,8 @@ static void net_tx_packets(struct XenNetDev *netdev)
 #endif
 
 if (txreq.size < 14) {
-xen_be_printf(>xendev, 0, "bad packet size: %d\n", 
txreq.size);
+xen_be_printf(>xendev, 0, "bad packet size: %d\n",
+  txreq.size);
 net_tx_error(netdev, , rc);
 continue;
 }
@@ -213,7 +214,8 @@ static void net_rx_response(struct XenNetDev *netdev,
 resp->status = (int16_t)st;
 }
 
-xen_be_printf(>xendev, 3, "rx response: idx %d, status %d, flags 
0x%x\n",
+xen_be_printf(>xendev, 3,
+  "rx response: idx %d, status %d, flags 0x%x\n",
   i, resp->status, resp->flags);
 
 netdev->rx_ring.rsp_prod_pvt = ++i;
@@ -256,7 +258,8 @@ static ssize_t net_rx_packet(NetClientState *nc, const 
uint8_t *buf, size_t size
netdev->xendev.dom,
rxreq.gref, PROT_WRITE);
 if (page == NULL) {
-xen_be_printf(>xendev, 0, "error: rx gref dereference failed 
(%d)\n",
+xen_be_printf(>xendev, 0,
+  "error: rx gref dereference failed (%d)\n",
   rxreq.gref);
 net_rx_response(netdev, , NETIF_RSP_ERROR, 0, 0, 0);
 return -1;
@@ -330,7 +333,8 @@ static int net_connect(struct XenDevice *xendev)
 rx_copy = 0;
 }
 if (rx_copy == 0) {
-xen_be_printf(>xendev, 0, "frontend doesn't support 
rx-copy.\n");
+xen_be_printf(>xendev, 0,
+  "frontend doesn't support rx-copy.\n");
 return -1;
 }
 
diff --git a/hw/xen/xen_backend.c b/hw/xen/xen_backend.c
index 545ee47..0e95880 100644
--- a/hw/xen/xen_backend.c
+++ b/hw/xen/xen_backend.c
@@ -53,7 +53,8 @@ struct xs_dirs {
 static QTAILQ_HEAD(xs_dirs_head, xs_dirs) xs_cleanup =
 QTAILQ_HEAD_INITIALIZER(xs_cleanup);
 
-static QTAILQ_HEAD(XenDeviceHead, XenDevice) xendevs =

[Qemu-devel] [PATCH v3 03/13] xen: Create a new file xen_pvdev.c

2016-10-24 Thread Emil Condrea

The purpose of the new file is to store generic functions shared by frontends
and backends, such as xenstore operations and xendevs handling.

Signed-off-by: Quan Xu 
Signed-off-by: Emil Condrea 
---
 hw/xen/Makefile.objs |   2 +-
 hw/xen/xen_backend.c | 126 +---
 hw/xen/xen_pvdev.c   | 150 +++
 include/hw/xen/xen_backend.h |  64 +-
 include/hw/xen/xen_pvdev.h   |  69 
 5 files changed, 222 insertions(+), 189 deletions(-)
 create mode 100644 hw/xen/xen_pvdev.c
 create mode 100644 include/hw/xen/xen_pvdev.h

diff --git a/hw/xen/Makefile.objs b/hw/xen/Makefile.objs
index d367094..591cdc2 100644
--- a/hw/xen/Makefile.objs
+++ b/hw/xen/Makefile.objs
@@ -1,5 +1,5 @@
 # xen backend driver support
-common-obj-$(CONFIG_XEN_BACKEND) += xen_backend.o xen_devconfig.o
+common-obj-$(CONFIG_XEN_BACKEND) += xen_backend.o xen_devconfig.o xen_pvdev.o
 
 obj-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen-host-pci-device.o
 obj-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pt.o xen_pt_config_init.o 
xen_pt_graphics.o xen_pt_msi.o
diff --git a/hw/xen/xen_backend.c b/hw/xen/xen_backend.c
index 0e95880..b32b0dd 100644
--- a/hw/xen/xen_backend.c
+++ b/hw/xen/xen_backend.c
@@ -30,6 +30,7 @@
 #include "sysemu/char.h"
 #include "qemu/log.h"
 #include "hw/xen/xen_backend.h"
+#include "hw/xen/xen_pvdev.h"
 
 #include 
 
@@ -57,8 +58,6 @@ static QTAILQ_HEAD(XenDeviceHead, XenDevice) xendevs =
 QTAILQ_HEAD_INITIALIZER(xendevs);
 static int debug;
 
-/* - */
-
 static void xenstore_cleanup_dir(char *dir)
 {
 struct xs_dirs *d;
@@ -77,34 +76,6 @@ void xen_config_cleanup(void)
 }
 }
 
-int xenstore_write_str(const char *base, const char *node, const char *val)
-{
-char abspath[XEN_BUFSIZE];
-
-snprintf(abspath, sizeof(abspath), "%s/%s", base, node);
-if (!xs_write(xenstore, 0, abspath, val, strlen(val))) {
-return -1;
-}
-return 0;
-}
-
-char *xenstore_read_str(const char *base, const char *node)
-{
-char abspath[XEN_BUFSIZE];
-unsigned int len;
-char *str, *ret = NULL;
-
-snprintf(abspath, sizeof(abspath), "%s/%s", base, node);
-str = xs_read(xenstore, 0, abspath, );
-if (str != NULL) {
-/* move to qemu-allocated memory to make sure
- * callers can savely g_free() stuff. */
-ret = g_strdup(str);
-free(str);
-}
-return ret;
-}
-
 int xenstore_mkdir(char *path, int p)
 {
 struct xs_permissions perms[2] = {
@@ -129,48 +100,6 @@ int xenstore_mkdir(char *path, int p)
 return 0;
 }
 
-int xenstore_write_int(const char *base, const char *node, int ival)
-{
-char val[12];
-
-snprintf(val, sizeof(val), "%d", ival);
-return xenstore_write_str(base, node, val);
-}
-
-int xenstore_write_int64(const char *base, const char *node, int64_t ival)
-{
-char val[21];
-
-snprintf(val, sizeof(val), "%"PRId64, ival);
-return xenstore_write_str(base, node, val);
-}
-
-int xenstore_read_int(const char *base, const char *node, int *ival)
-{
-char *val;
-int rc = -1;
-
-val = xenstore_read_str(base, node);
-if (val && 1 == sscanf(val, "%d", ival)) {
-rc = 0;
-}
-g_free(val);
-return rc;
-}
-
-int xenstore_read_uint64(const char *base, const char *node, uint64_t *uval)
-{
-char *val;
-int rc = -1;
-
-val = xenstore_read_str(base, node);
-if (val && 1 == sscanf(val, "%"SCNu64, uval)) {
-rc = 0;
-}
-g_free(val);
-return rc;
-}
-
 int xenstore_write_be_str(struct XenDevice *xendev, const char *node, const 
char *val)
 {
 return xenstore_write_str(xendev->be, node, val);
@@ -214,20 +143,6 @@ int xenstore_read_fe_uint64(struct XenDevice *xendev, 
const char *node,
 
 /* - */
 
-const char *xenbus_strstate(enum xenbus_state state)
-{
-static const char *const name[] = {
-[XenbusStateUnknown]   = "Unknown",
-[XenbusStateInitialising]  = "Initialising",
-[XenbusStateInitWait]  = "InitWait",
-[XenbusStateInitialised]   = "Initialised",
-[XenbusStateConnected] = "Connected",
-[XenbusStateClosing]   = "Closing",
-[XenbusStateClosed]= "Closed",
-};
-return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
-}
-
 int xen_be_set_state(struct XenDevice *xendev, enum xenbus_state state)
 {
 int rc;
@@ -827,45 +742,6 @@ int xen_be_send_notify(struct XenDevice *xendev)
 return xenevtchn_notify(xendev->evtchndev, xendev->local_port);
 }
 
-/*
- * msg_level:
- *  0 == errors (stderr + logfile).
- *  1 == informative debug messages (logfile only).
- *  2 == noisy debug messages (logfile only).
- *  3 == will flood your log (logfile only).
- */
-void xen_be_printf(struct XenDevice *xendev, int msg_level,
-

Re: [Qemu-devel] [PATCH v5 15/17] ppc/pnv: Add cut down PSI bridge model and hookup external interrupt

2016-10-24 Thread David Gibson

On Sat, Oct 22, 2016 at 11:46:48AM +0200, Cédric Le Goater wrote:
> From: Benjamin Herrenschmidt 
> 
> The PSI (Processor Service Interface) is one of the engines of the
> "Bridge" unit which connects the different interfaces to the Power
> Processor.
> 
> This adds just enough of the PSI bridge to handle various on-chip and
> the one external interrupt. The rest of PSI has to do with the link to
> the IBM FSP service processor which we don't plan to emulate (not used
> on OpenPower machines).
> 
> Signed-off-by: Benjamin Herrenschmidt 
> [clg: - updated for qemu-2.7
>   - changed the XSCOM interface to fit new model
>   - QOMified the model
>   - reworked set_xive ]
> Signed-off-by: Cédric Le Goater 
> ---
> 
>  When skiboot initializes PSIHB, it fills the xives with server=0,
>  prio=0xff, which is fine, but for some reason the last two xive
>  settings reach the qemu MMIO region with a bogus value :
>  
>   pnv_psi_mmio_write: MMIO write 0x30 val 0x00ff
>   pnv_psi_mmio_write: MMIO write 0x60 val 0x00ff2000
>   pnv_psi_mmio_write: MMIO write 0x68 val 0x00ff4000
>   pnv_psi_mmio_write: MMIO write 0x70 val 0x00ff6000
>   pnv_psi_mmio_write: MMIO write 0x78 val 0x8000
>   pnv_psi_mmio_write: MMIO write 0x80 val 0xa000
> 
>  It looks like a badly initialized temp variable in the call
>  stack. The memory regions look fine, maybe in stdcix ? For the
>  moment, I have added a logging error to catch non zero values as the
>  guest should not do that in any case.

Just to clarify, I think you're saying that you believe this to be a
skiboot (guest side) bug rather than a qemu bug.  Is that right?

> 
> 
>  Changes since v4:
> 
>  - used the helpers for the XSCOM region
>  - introduced pnv->cpld_irqstate to remove a static 
>  - reworked pnv_psi_set_xivr() to use xics_get_cpu_index_by_pir().
>This is similar to rtas_set_xive but skiboot does a first
>initialization with empty servers so we need to check for that
>case.
> 
>  hw/ppc/Makefile.objs   |   2 +-
>  hw/ppc/pnv.c   |  31 ++-
>  hw/ppc/pnv_psi.c   | 615 
> +
>  include/hw/ppc/pnv.h   |   7 +
>  include/hw/ppc/pnv_psi.h   |  64 +
>  include/hw/ppc/pnv_xscom.h |   3 +
>  6 files changed, 715 insertions(+), 7 deletions(-)
>  create mode 100644 hw/ppc/pnv_psi.c
>  create mode 100644 include/hw/ppc/pnv_psi.h
> 
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index ebc72af0a7c6..4feb15b360c8 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -6,7 +6,7 @@ obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o 
> spapr_rtas.o
>  obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
>  obj-$(CONFIG_PSERIES) += spapr_cpu_core.o
>  # IBM PowerNV
> -obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o
> +obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o pnv_psi.o
>  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
>  obj-y += spapr_pci_vfio.o
>  endif
> diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
> index 16d7baf0da71..ec1a17699023 100644
> --- a/hw/ppc/pnv.c
> +++ b/hw/ppc/pnv.c
> @@ -316,15 +316,22 @@ static void ppc_powernv_reset(void)
>   * have a CPLD that will collect the SerIRQ and shoot them as a
>   * single level interrupt to the P8 chip. So let's setup a hook
>   * for doing just that.
> - *
> - * Note: The actual interrupt input isn't emulated yet, this will
> - * come with the PSI bridge model.
>   */
>  static void pnv_lpc_isa_irq_handler_cpld(void *opaque, int n, int level)
>  {
> -/* We don't yet emulate the PSI bridge which provides the external
> - * interrupt, so just drop interrupts on the floor
> - */
> +PnvMachineState *pnv = POWERNV_MACHINE(qdev_get_machine());
> +uint32_t old_state = pnv->cpld_irqstate;
> +PnvChip *chip = opaque;
> +
> +if (level) {
> +pnv->cpld_irqstate |= 1u << n;
> +} else {
> +pnv->cpld_irqstate &= ~(1u << n);
> +}
> +if (pnv->cpld_irqstate != old_state) {
> +pnv_psi_irq_set(>psi, PSIHB_IRQ_EXTERNAL,
> +pnv->cpld_irqstate != 0);
> +}
>  }
>  
>  static void pnv_lpc_isa_irq_handler(void *opaque, int n, int level)
> @@ -644,6 +651,9 @@ static void pnv_chip_init(Object *obj)
>  
>  object_initialize(>xics, sizeof(chip->xics), TYPE_XICS_NATIVE);
>  object_property_add_child(obj, "xics", OBJECT(>xics), NULL);
> +
> +object_initialize(>psi, sizeof(chip->psi), TYPE_PNV_PSI);
> +object_property_add_child(obj, "psi", OBJECT(>psi), NULL);
>  }
>  
>  static void pnv_chip_realize(DeviceState *dev, Error **errp)
> @@ -728,6 +738,15 @@ static void pnv_chip_realize(DeviceState *dev, Error 
> **errp)
>  }
>  g_free(typename);
>  
> +
> +/* Processor Service Interface (PSI) Host Bridge */
> +

Re: [Qemu-devel] [PATCH v8 03/36] block: Introduce image file locking

2016-10-24 Thread Fam Zheng

On Fri, 10/21 23:04, Max Reitz wrote:
> > +ImageLockMode bdrv_lock_mode_from_flags(int flags)
> > +{
> > +if (flags & BDRV_O_NO_LOCK) {
> > +return IMAGE_LOCK_MODE_NOLOCK;
> > +} else if (flags & BDRV_O_SHARED_LOCK) {
> > +return IMAGE_LOCK_MODE_SHARED;
> > +} else if (flags & BDRV_O_EXCLUSIVE_LOCK) {
> > +return IMAGE_LOCK_MODE_EXCLUSIVE;
> > +} else {
> > +return IMAGE_LOCK_MODE_AUTO;
> > +}
> > +}
> 
> I don't know if there's been any discussion about the order of the flags
> here, but I personally would order them exactly the other way around:
> Asking for exclusive locking should override nolock, in my opinion.

The idea was to assert no two bits are set at the same time. But I seem to have
forgotten to actually add the assertion.

> 
> > +
> > +ImageLockMode bdrv_get_lock_mode(BlockDriverState *bs)
> > +{
> > +return bs->cur_lock;
> > +}
> > +
> > +int bdrv_set_lock_mode(BlockDriverState *bs, ImageLockMode mode)
> > +{
> > +int ret;
> > +
> > +if (bs->cur_lock == mode) {
> > +return 0;
> > +} else if (!bs->drv) {
> > +return -ENOMEDIUM;
> > +} else if (!bs->drv->bdrv_lockf) {
> > +if (bs->file) {
> > +return bdrv_set_lock_mode(bs->file->bs, mode);
> > +}
> > +return 0;
> > +}
> > +ret = bs->drv->bdrv_lockf(bs, mode);
> > +if (ret == -ENOTSUP) {
> > +/* Handle it the same way as !bs->drv->bdrv_lockf */
> > +ret = 0;
> 
> Yes, well, why do you handle both as success? Wouldn't returning
> -ENOTSUP make more sense?
> 
> I guess the caller can find out itself by checking whether bs->cur_lock
> has changed, but...

I can't think of a reason for any caller to do something different for -ENOTSUP
from success, hence the check here.

> 
> > +} else if (ret == 0) {
> > +bs->cur_lock = mode;
> > +}
> > +return ret;
> > +}
> > +
> >  static QemuOptsList bdrv_runtime_opts = {
> >  .name = "bdrv_common",
> >  .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
> > @@ -1076,6 +1119,10 @@ static int bdrv_open_common(BlockDriverState *bs, 
> > BdrvChild *file,
> >  goto free_and_fail;
> >  }
> >  
> > +if (open_flags & BDRV_O_INACTIVE) {
> > +open_flags = (open_flags & ~BDRV_O_LOCK_MASK) & BDRV_O_NO_LOCK;
> 
> I suppose the second & is supposed to be a |?

Yes. Thanks for catching it.

> 
> > +}
> > +
> >  ret = refresh_total_sectors(bs, bs->total_sectors);
> >  if (ret < 0) {
> >  error_setg_errno(errp, -ret, "Could not refresh total sector 
> > count");
> > @@ -2273,6 +2320,7 @@ static void bdrv_close(BlockDriverState *bs)
> >  if (bs->drv) {
> >  BdrvChild *child, *next;
> >  
> > +bdrv_set_lock_mode(bs, IMAGE_LOCK_MODE_NOLOCK);
> >  bs->drv->bdrv_close(bs);
> >  bs->drv = NULL;
> >  
> > @@ -3188,6 +3236,9 @@ void bdrv_invalidate_cache(BlockDriverState *bs, 
> > Error **errp)
> 
> This function's name is pretty weird... Maybe it would be better to
> rename it to "bdrv_complete_incoming" or something. (Unrelated to this
> series, of course.)
> 
> >  error_setg_errno(errp, -ret, "Could not refresh total sector 
> > count");
> >  return;
> >  }
> > +if (bs->cur_lock != IMAGE_LOCK_MODE__MAX) {
> > +bdrv_set_lock_mode(bs, bs->cur_lock);
> > +}
> >  }
> >  
> >  void bdrv_invalidate_cache_all(Error **errp)
> > @@ -3230,6 +3281,7 @@ static int bdrv_inactivate_recurse(BlockDriverState 
> > *bs,
> >  }
> >  
> >  if (setting_flag) {
> > +ret = bdrv_set_lock_mode(bs, IMAGE_LOCK_MODE_NOLOCK);
> 
> Maybe it would make sense to do something with the return value...? :-)

Yes, sounds good.

Fam

Re: [Qemu-devel] [PATCH v5 12/17] ppc/pnv: add a XICS native to each PowerNV chip

2016-10-24 Thread David Gibson

On Mon, Oct 24, 2016 at 05:42:52PM +0200, Cédric Le Goater wrote:
> On 10/22/2016 11:46 AM, Cédric Le Goater wrote:
> > It also links the XICS object to each core as it is needed to do the
> > CPU setup and the ICP MMIO windows are memory mapped for each thread.
> > 
> > Signed-off-by: Cédric Le Goater 
> > ---
> > 
> >  Changes since v4:
> > 
> >  - changed the calculation of the number of ICPs to use smp_threads
> >  - added the mapping of the ICP subregions per thread
> > 
> >  hw/ppc/pnv.c | 27 +++
> >  hw/ppc/pnv_core.c| 24 
> >  include/hw/ppc/pnv.h |  2 ++
> >  3 files changed, 49 insertions(+), 4 deletions(-)
> > 
> > diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
> > index c6dc7ca895b6..16d7baf0da71 100644
> > --- a/hw/ppc/pnv.c
> > +++ b/hw/ppc/pnv.c
> > @@ -33,6 +33,7 @@
> >  #include "qemu/cutils.h"
> >  #include "qapi/visitor.h"
> >  
> > +#include "hw/ppc/xics.h"
> >  #include "hw/ppc/pnv_xscom.h"
> >  
> >  #include "hw/isa/isa.h"
> > @@ -231,6 +232,9 @@ static void powernv_populate_chip(PnvChip *chip, void 
> > *fdt)
> >  PnvCore *pnv_core = PNV_CORE(chip->cores + i * typesize);
> >  
> >  powernv_create_core_node(chip, pnv_core, fdt);
> > +
> > +/* Interrupt presentation controllers (ICP). One per thread. */
> > +xics_native_populate_icp(chip, fdt, 0, pnv_core->pir, smp_threads);
> >  }
> >  
> >  if (chip->ram_size) {
> > @@ -637,6 +641,9 @@ static void pnv_chip_init(Object *obj)
> >  
> >  object_initialize(>lpc, sizeof(chip->lpc), TYPE_PNV_LPC);
> >  object_property_add_child(obj, "lpc", OBJECT(>lpc), NULL);
> > +
> > +object_initialize(>xics, sizeof(chip->xics), TYPE_XICS_NATIVE);
> > +object_property_add_child(obj, "xics", OBJECT(>xics), NULL);
> >  }
> >  
> >  static void pnv_chip_realize(DeviceState *dev, Error **errp)
> > @@ -668,12 +675,23 @@ static void pnv_chip_realize(DeviceState *dev, Error 
> > **errp)
> >  return;
> >  }
> >  
> > +/*
> > + * Interrupt Controller. To be created before the cores because
> > + * each thread will fetch its ICP in the XICS
> > + */
> > +object_property_set_int(OBJECT(>xics), chip->nr_cores * 
> > smp_threads,
> > +"nr_servers",  _fatal);
> > +object_property_set_bool(OBJECT(>xics), true, "realized",
> > + _fatal);
> > +sysbus_mmio_map(SYS_BUS_DEVICE(>xics), 0, PNV_XICS_BASE);
> > +
> >  chip->cores = g_malloc0(typesize * chip->nr_cores);
> >  
> >  for (i = 0, core_hwid = 0; (core_hwid < sizeof(chip->cores_mask) * 8)
> >   && (i < chip->nr_cores); core_hwid++) {
> >  char core_name[32];
> >  void *pnv_core = chip->cores + i * typesize;
> > +int j;
> >  
> >  if (!(chip->cores_mask & (1ull << core_hwid))) {
> >  continue;
> > @@ -690,6 +708,8 @@ static void pnv_chip_realize(DeviceState *dev, Error 
> > **errp)
> >  object_property_set_int(OBJECT(pnv_core),
> >  pcc->core_pir(chip, core_hwid),
> >  "pir", _fatal);
> > +object_property_add_const_link(OBJECT(pnv_core), "xics",
> > +   OBJECT(>xics), _fatal);
> >  object_property_set_bool(OBJECT(pnv_core), true, "realized",
> >   _fatal);
> >  object_unref(OBJECT(pnv_core));
> > @@ -697,6 +717,13 @@ static void pnv_chip_realize(DeviceState *dev, Error 
> > **errp)
> >  /* Each core has an XSCOM MMIO region */
> >  pnv_xscom_add_subregion(chip, PNV_XSCOM_EX_CORE_BASE(core_hwid),
> >  _CORE(pnv_core)->xscom_regs);
> > +
> > +/* Each thread as region for its ICP */
> > +for (j = 0; j < smp_threads; j++) {
> > +memory_region_add_subregion(>xics.icp_mmio,
> > +pcc->core_pir(chip, core_hwid) << 
> > 12,
> 
> Pffut ... This should be :
> 
>   (pcc->core_pir(chip, core_hwid) + 
> j) << 12,
> 
> but as smp_threads=1, it has no consequences for the moment. Tell me 
> how you would prefer me to fix this.

I think I have enough comments on the previous patch that a respin of
patches 11+ will make sense, so just fix it then.

> 
> Thanks,
> 
> C.
> 
> 
> > +>xics.icp_mmios[i]);
> > +}
> >  i++;
> >  }
> >  g_free(typename);
> > diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
> > index 2acda9637db5..e15c76163759 100644
> > --- a/hw/ppc/pnv_core.c
> > +++ b/hw/ppc/pnv_core.c
> > @@ -24,6 +24,7 @@
> >  #include "hw/ppc/ppc.h"
> >  #include "hw/ppc/pnv.h"
> >  #include "hw/ppc/pnv_core.h"
> > +#include "hw/ppc/xics.h"
> >  
> >  static void powernv_cpu_reset(void *opaque)
> >  {
> > @@ -42,7 +43,7 @@ static void powernv_cpu_reset(void *opaque)
> >

Re: [Qemu-devel] [PATCH v5 17/17] ppc/pnv: Add Naples chip support for LPC interrupts

2016-10-24 Thread David Gibson

On Sat, Oct 22, 2016 at 11:46:50AM +0200, Cédric Le Goater wrote:
> From: Benjamin Herrenschmidt 
> 
> It adds the Naples chip which supports proper LPC interrupts via the
> LPC controller rather than via an external CPLD.
> 
> Signed-off-by: Benjamin Herrenschmidt 
> [clg: - updated for qemu-2.7
>   - ported on latest PowerNV patchset (v3) ]
> Signed-off-by: Cédric Le Goater 

Reviewed-by: David Gibson 

> ---
> 
>  Changes since v4:
> 
>  - remove test on ISA_NUM_IRQS 
> 
>  hw/ppc/pnv.c | 15 ++-
>  hw/ppc/pnv_lpc.c | 47 +--
>  include/hw/ppc/pnv_lpc.h |  9 +
>  3 files changed, 68 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
> index ddbf7510424c..4ef80b5b4110 100644
> --- a/hw/ppc/pnv.c
> +++ b/hw/ppc/pnv.c
> @@ -336,7 +336,14 @@ static void pnv_lpc_isa_irq_handler_cpld(void *opaque, 
> int n, int level)
>  
>  static void pnv_lpc_isa_irq_handler(void *opaque, int n, int level)
>  {
> - /* XXX TODO */
> +PnvChip *chip = opaque;
> +PnvLpcController *lpc = >lpc;
> +
> +/* The Naples HW latches the 1 levels, clearing is done by SW */
> +if (level) {
> +lpc->lpc_hc_irqstat |= LPC_HC_IRQ_SERIRQ0 >> n;
> +pnv_lpc_eval_irqs(lpc);
> +}
>  }
>  
>  static ISABus *pnv_isa_create(PnvChip *chip)
> @@ -659,6 +666,12 @@ static void pnv_chip_init(Object *obj)
>  object_property_add_child(obj, "occ", OBJECT(>occ), NULL);
>  object_property_add_const_link(OBJECT(>occ), "psi",
> OBJECT(>psi), _abort);
> +
> +/*
> + * The LPC controller needs PSI to generate interrupts
> + */
> +object_property_add_const_link(OBJECT(>lpc), "psi",
> +   OBJECT(>psi), _abort);
>  }
>  
>  static void pnv_chip_realize(DeviceState *dev, Error **errp)
> diff --git a/hw/ppc/pnv_lpc.c b/hw/ppc/pnv_lpc.c
> index 00dbd8b07b38..91e966565694 100644
> --- a/hw/ppc/pnv_lpc.c
> +++ b/hw/ppc/pnv_lpc.c
> @@ -249,6 +249,34 @@ static const MemoryRegionOps pnv_lpc_xscom_ops = {
>  .endianness = DEVICE_BIG_ENDIAN,
>  };
>  
> +void pnv_lpc_eval_irqs(PnvLpcController *lpc)
> +{
> +bool lpc_to_opb_irq = false;
> +
> +/* Update LPC controller to OPB line */
> +if (lpc->lpc_hc_irqser_ctrl & LPC_HC_IRQSER_EN) {
> +uint32_t irqs;
> +
> +irqs = lpc->lpc_hc_irqstat & lpc->lpc_hc_irqmask;
> +lpc_to_opb_irq = (irqs != 0);
> +}
> +
> +/* We don't honor the polarity register, it's pointless and unused
> + * anyway
> + */
> +if (lpc_to_opb_irq) {
> +lpc->opb_irq_input |= OPB_MASTER_IRQ_LPC;
> +} else {
> +lpc->opb_irq_input &= ~OPB_MASTER_IRQ_LPC;
> +}
> +
> +/* Update OPB internal latch */
> +lpc->opb_irq_stat |= lpc->opb_irq_input & lpc->opb_irq_mask;
> +
> +/* Reflect the interrupt */
> +pnv_psi_irq_set(lpc->psi, PSIHB_IRQ_LPC_I2C, lpc->opb_irq_stat != 0);
> +}
> +
>  static uint64_t lpc_hc_read(void *opaque, hwaddr addr, unsigned size)
>  {
>  PnvLpcController *lpc = opaque;
> @@ -299,12 +327,15 @@ static void lpc_hc_write(void *opaque, hwaddr addr, 
> uint64_t val,
>  break;
>  case LPC_HC_IRQSER_CTRL:
>  lpc->lpc_hc_irqser_ctrl = val;
> +pnv_lpc_eval_irqs(lpc);
>  break;
>  case LPC_HC_IRQMASK:
>  lpc->lpc_hc_irqmask = val;
> +pnv_lpc_eval_irqs(lpc);
>  break;
>  case LPC_HC_IRQSTAT:
>  lpc->lpc_hc_irqstat &= ~val;
> +pnv_lpc_eval_irqs(lpc);
>  break;
>  case LPC_HC_ERROR_ADDRESS:
>  break;
> @@ -362,14 +393,15 @@ static void opb_master_write(void *opaque, hwaddr addr,
>  switch (addr) {
>  case OPB_MASTER_LS_IRQ_STAT:
>  lpc->opb_irq_stat &= ~val;
> +pnv_lpc_eval_irqs(lpc);
>  break;
>  case OPB_MASTER_LS_IRQ_MASK:
> -/* XXX Filter out reserved bits */
>  lpc->opb_irq_mask = val;
> +pnv_lpc_eval_irqs(lpc);
>  break;
>  case OPB_MASTER_LS_IRQ_POL:
> -/* XXX Filter out reserved bits */
>  lpc->opb_irq_pol = val;
> +pnv_lpc_eval_irqs(lpc);
>  break;
>  case OPB_MASTER_LS_IRQ_INPUT:
>  /* Read only */
> @@ -397,6 +429,8 @@ static const MemoryRegionOps opb_master_ops = {
>  static void pnv_lpc_realize(DeviceState *dev, Error **errp)
>  {
>  PnvLpcController *lpc = PNV_LPC(dev);
> +Object *obj;
> +Error *error = NULL;
>  
>  /* Reg inits */
>  lpc->lpc_hc_fw_rd_acc_size = LPC_HC_FW_RD_4B;
> @@ -440,6 +474,15 @@ static void pnv_lpc_realize(DeviceState *dev, Error 
> **errp)
>  pnv_xscom_region_init(>xscom_regs, OBJECT(dev),
>_lpc_xscom_ops, lpc, "xscom-lpc",
>PNV_XSCOM_LPC_SIZE);
> +
> +/* get PSI object from chip */
> +obj =

Re: [Qemu-devel] [PATCH v5 13/17] ppc/xics: add a xics_get_cpu_index_by_pir helper

2016-10-24 Thread David Gibson

On Sat, Oct 22, 2016 at 11:46:46AM +0200, Cédric Le Goater wrote:
> We will need this helper to translate the server number of the XIVE
> (which is a PIR) into an ICPState index number (which is a cpu index).
> 
> Signed-off-by: Cédric Le Goater 

Looks correct as far as it goes, but I wonder if this would be more
generally useful as a machine level function that searches the cpu
objects by PIR, returning a pointer.  From that to the cpu_index is
then trivial.

> ---
>  hw/intc/xics_native.c | 19 +++
>  include/hw/ppc/xics.h |  1 +
>  2 files changed, 20 insertions(+)
> 
> diff --git a/hw/intc/xics_native.c b/hw/intc/xics_native.c
> index bbdd786aeb50..6318862f53fc 100644
> --- a/hw/intc/xics_native.c
> +++ b/hw/intc/xics_native.c
> @@ -33,6 +33,25 @@
>  
>  #include 
>  
> +int xics_get_cpu_index_by_pir(XICSState *xics, int pir)
> +{
> +int i;
> +
> +for (i = 0; i < xics->nr_servers; i++) {
> +ICPState *icp = >ss[i];
> +if (icp->cs) {
> +PowerPCCPU *cpu = POWERPC_CPU(icp->cs);
> +CPUPPCState *env = >env;
> +
> +if (env->spr_cb[SPR_PIR].default_value == pir) {
> +return i;
> +}
> +}
> +}
> +
> +return -1;
> +}
> +
>  static void xics_native_reset(void *opaque)
>  {
>  device_reset(DEVICE(opaque));
> diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
> index 911cdd5e549f..beb232e616c5 100644
> --- a/include/hw/ppc/xics.h
> +++ b/include/hw/ppc/xics.h
> @@ -214,6 +214,7 @@ void xics_set_nr_servers(XICSState *xics, uint32_t 
> nr_servers,
>  
>  /* Internal XICS interfaces */
>  int xics_get_cpu_index_by_dt_id(int cpu_dt_id);
> +int xics_get_cpu_index_by_pir(XICSState *xics, int pir);
>  
>  void icp_set_cppr(ICPState *icp, uint8_t cppr);
>  void icp_set_mfrr(ICPState *icp, uint8_t mfrr);

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

Re: [Qemu-devel] [PATCH] vfio: Handle zero-length sparse mmap ranges

2016-10-24 Thread no-reply

Hi,

Your series seems to have some coding style problems. See output below for
more information:

Type: series
Message-id: 20161025033140.15273.87118.st...@gimli.home
Subject: [Qemu-devel] [PATCH] vfio: Handle zero-length sparse mmap ranges

=== TEST SCRIPT BEGIN ===
#!/bin/bash

BASE=base
n=1
total=$(git log --oneline $BASE.. | wc -l)
failed=0

# Useful git options
git config --local diff.renamelimit 0
git config --local diff.renames True

commits="$(git log --format=%H --reverse $BASE..)"
for c in $commits; do
echo "Checking PATCH $n/$total: $(git show --no-patch --format=%s $c)..."
if ! git show $c --format=email | ./scripts/checkpatch.pl --mailback -; then
failed=1
echo
fi
n=$((n+1))
done

exit $failed
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
Switched to a new branch 'test'
8536f76 vfio: Handle zero-length sparse mmap ranges

=== OUTPUT BEGIN ===
Checking PATCH 1/1: vfio: Handle zero-length sparse mmap ranges...
ERROR: code indent should never use tabs
#68: FILE: hw/vfio/common.c:637:
+^Iif (sparse->areas[i].size) {$

total: 1 errors, 0 warnings, 63 lines checked

Your patch has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

=== OUTPUT END ===

Test command exited with code: 1


---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-de...@freelists.org

Re: [Qemu-devel] [PATCH v8 02/36] qapi: Add ImageLockMode

2016-10-24 Thread Fam Zheng

On Fri, 10/21 22:45, Max Reitz wrote:
> On 30.09.2016 14:09, Fam Zheng wrote:
> > Signed-off-by: Fam Zheng 
> > ---
> >  qapi/block-core.json | 18 ++
> >  1 file changed, 18 insertions(+)
> > 
> > diff --git a/qapi/block-core.json b/qapi/block-core.json
> > index 92193ab..22e8d04 100644
> > --- a/qapi/block-core.json
> > +++ b/qapi/block-core.json
> > @@ -2754,3 +2754,21 @@
> >'data' : { 'parent': 'str',
> >   '*child': 'str',
> >   '*node': 'str' } }
> > +
> > +##
> > +# @ImageLockMode:
> > +#
> > +# @auto: defer to the block driver to use the least strict mode, based on
> > +#the nature of format and read-only flag, and the supported locking
> > +#operations of the protocol.
> 
> I have some difficulty understanding this description. I'd intuitively
> assume no locking to be the "least strict mode"; however, since it
> should be always possible not to lock an image, this would mean that
> auto=nolock. Which it hopefully isn't.
> 
> If it's not easy to come up with a thorough explanation, perhaps it
> would be best to give some examples which help to understand the concept
> behind "auto" intuitively.

It could have been more specific; it's my bad for being too terse here. Maybe
something like this:

@auto: defer to the block layer to use an appropriate lock mode, based on
   the driver used and read-only option: for read-only images, shared
   lock mode, or otherwise exclusive lock mode, will be attempted; if
   the driver doesn't support this mode (or sharing is particularly
   desired by its design), nolock will be used.

?

Fam

Re: [Qemu-devel] [PATCH v5 11/17] ppc/xics: Add "native" XICS subclass

2016-10-24 Thread David Gibson

On Sat, Oct 22, 2016 at 11:46:44AM +0200, Cédric Le Goater wrote:
> This provides access to the MMIO based Interrupt Presentation
> Controllers (ICP) as found on a POWER8 system.
> 
> A new XICSNative class is introduced to hold the MMIO region of the
> ICPs. Each thread of the system has a subregion, indexed by its PIR
> number, holding a XIVE (External Interrupt Vector Entry). This
> provides a means to make the link with the ICPState of the CPU.
> 
> Signed-off-by: Cédric Le Goater 
> ---
> 
>  Changes since v4:
> 
>  - replaced the pir_table by memory subregions using an ICP. 
>  - removed the find_icp() and cpu_setup() handlers which became
>useless with the memory regions.
>  - removed the superfluous inits done in xics_native_initfn. This is
>covered in the parent class init.
>  - took ownership of the patch.
> 
>  default-configs/ppc64-softmmu.mak |   3 +-
>  hw/intc/Makefile.objs |   1 +
>  hw/intc/xics_native.c | 304 
> ++
>  include/hw/ppc/pnv.h  |  19 +++
>  include/hw/ppc/xics.h |  24 +++
>  5 files changed, 350 insertions(+), 1 deletion(-)
>  create mode 100644 hw/intc/xics_native.c
> 
> diff --git a/default-configs/ppc64-softmmu.mak 
> b/default-configs/ppc64-softmmu.mak
> index 67a9bcaa67fa..a22c93a48686 100644
> --- a/default-configs/ppc64-softmmu.mak
> +++ b/default-configs/ppc64-softmmu.mak
> @@ -48,8 +48,9 @@ CONFIG_PLATFORM_BUS=y
>  CONFIG_ETSEC=y
>  CONFIG_LIBDECNUMBER=y
>  # For pSeries
> -CONFIG_XICS=$(CONFIG_PSERIES)
> +CONFIG_XICS=$(or $(CONFIG_PSERIES),$(CONFIG_POWERNV))
>  CONFIG_XICS_SPAPR=$(CONFIG_PSERIES)
> +CONFIG_XICS_NATIVE=$(CONFIG_POWERNV)
>  CONFIG_XICS_KVM=$(and $(CONFIG_PSERIES),$(CONFIG_KVM))
>  # For PReP
>  CONFIG_MC146818RTC=y
> diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
> index 2f44a2da26e9..e44a29d75b32 100644
> --- a/hw/intc/Makefile.objs
> +++ b/hw/intc/Makefile.objs
> @@ -34,6 +34,7 @@ obj-$(CONFIG_RASPI) += bcm2835_ic.o bcm2836_control.o
>  obj-$(CONFIG_SH4) += sh_intc.o
>  obj-$(CONFIG_XICS) += xics.o
>  obj-$(CONFIG_XICS_SPAPR) += xics_spapr.o
> +obj-$(CONFIG_XICS_NATIVE) += xics_native.o
>  obj-$(CONFIG_XICS_KVM) += xics_kvm.o
>  obj-$(CONFIG_ALLWINNER_A10_PIC) += allwinner-a10-pic.o
>  obj-$(CONFIG_S390_FLIC) += s390_flic.o
> diff --git a/hw/intc/xics_native.c b/hw/intc/xics_native.c
> new file mode 100644
> index ..bbdd786aeb50
> --- /dev/null
> +++ b/hw/intc/xics_native.c
> @@ -0,0 +1,304 @@
> +/*
> + * QEMU PowerPC PowerNV machine model
> + *
> + * Native version of ICS/ICP
> + *
> + * Copyright (c) 2016, IBM Corporation.
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see 
> .
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qapi/error.h"
> +#include "qemu-common.h"
> +#include "cpu.h"
> +#include "hw/hw.h"
> +#include "qemu/log.h"
> +#include "qapi/error.h"
> +
> +#include "hw/ppc/fdt.h"
> +#include "hw/ppc/xics.h"
> +#include "hw/ppc/pnv.h"
> +
> +#include 
> +
> +static void xics_native_reset(void *opaque)
> +{
> +device_reset(DEVICE(opaque));
> +}
> +
> +static void xics_native_initfn(Object *obj)
> +{
> +qemu_register_reset(xics_native_reset, obj);
> +}

I think we need to investigate why the xics native is not showing up
on the SysBus.  As a "raw" MMIO device, it really should.  If it was,
device_reset should be called without these shenanigans.

> +
> +static uint64_t xics_native_read(void *opaque, hwaddr addr, unsigned width)
> +{
> +ICPState *icp = opaque;
> +bool byte0 = (width == 1 && (addr & 0x3) == 0);
> +uint64_t val = 0x;
> +
> +switch (addr & 0xffc) {
> +case 0: /* poll */
> +val = icp_ipoll(icp, NULL);
> +if (byte0) {
> +val >>= 24;
> +} else if (width != 4) {
> +goto bad_access;
> +}
> +break;
> +case 4: /* xirr */
> +if (byte0) {
> +val = icp_ipoll(icp, NULL) >> 24;
> +} else if (width == 4) {
> +val = icp_accept(icp);
> +} else {
> +goto bad_access;
> +}
> +break;
> +case 12:
> +if (byte0) {
> +val = icp->mfrr;
> +} else {
> +goto bad_access;
> +}
> +break;
> +case 16:
> +if (width == 4) {
> +

Re: [Qemu-devel] [PATCH] hw/arm/pxa2xx: Correctly handle external GPIO reset requests

2016-10-24 Thread no-reply

Hi,

Your series seems to have some coding style problems. See output below for
more information:

Subject: [Qemu-devel] [PATCH] hw/arm/pxa2xx: Correctly handle external GPIO 
reset requests
Type: series
Message-id: 1477361212-18833-1-git-send-email-li...@roeck-us.net

=== TEST SCRIPT BEGIN ===
#!/bin/bash

BASE=base
n=1
total=$(git log --oneline $BASE.. | wc -l)
failed=0

# Useful git options
git config --local diff.renamelimit 0
git config --local diff.renames True

commits="$(git log --format=%H --reverse $BASE..)"
for c in $commits; do
echo "Checking PATCH $n/$total: $(git show --no-patch --format=%s $c)..."
if ! git show $c --format=email | ./scripts/checkpatch.pl --mailback -; then
failed=1
echo
fi
n=$((n+1))
done

exit $failed
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
Switched to a new branch 'test'
5e1b6c1 hw/arm/pxa2xx: Correctly handle external GPIO reset requests

=== OUTPUT BEGIN ===
Checking PATCH 1/1: hw/arm/pxa2xx: Correctly handle external GPIO reset 
requests...
ERROR: code indent should never use tabs
#29: FILE: hw/arm/pxa2xx.c:2056:
+if (line == 1 && level && (s->pm_regs[PCFR >> 2] & 0x10)) {^I/* GPR_EN */$

total: 1 errors, 0 warnings, 19 lines checked

Your patch has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

=== OUTPUT END ===

Test command exited with code: 1


---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-de...@freelists.org

Re: [Qemu-devel] [PATCH v5 14/17] ppc/xics: introduce a helper to insert a new ics

2016-10-24 Thread David Gibson

On Sat, Oct 22, 2016 at 11:46:47AM +0200, Cédric Le Goater wrote:
> Interrupt Control Sources (ICS) are now maintained under a list.
> 
> Signed-off-by: Cédric Le Goater 

Reviewed-by: David Gibson 

> ---
>  hw/intc/xics.c| 6 ++
>  include/hw/ppc/xics.h | 1 +
>  2 files changed, 7 insertions(+)
> 
> diff --git a/hw/intc/xics.c b/hw/intc/xics.c
> index 095c16a30082..f24787e95013 100644
> --- a/hw/intc/xics.c
> +++ b/hw/intc/xics.c
> @@ -151,6 +151,12 @@ static void xics_common_reset(DeviceState *d)
>  }
>  }
>  
> +void xics_insert_ics(XICSState *xics, ICSState *ics)
> +{
> +ics->xics = xics;
> +QLIST_INSERT_HEAD(>ics, ics, list);
> +}
> +
>  static void xics_prop_get_nr_irqs(Object *obj, Visitor *v, const char *name,
>void *opaque, Error **errp)
>  {
> diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
> index beb232e616c5..f31eef8c9f6c 100644
> --- a/include/hw/ppc/xics.h
> +++ b/include/hw/ppc/xics.h
> @@ -228,6 +228,7 @@ void ics_simple_write_xive(ICSState *ics, int nr, int 
> server,
>  void ics_set_irq_type(ICSState *ics, int srcno, bool lsi);
>  
>  ICSState *xics_find_source(XICSState *icp, int irq);
> +void xics_insert_ics(XICSState *xics, ICSState *ics);
>  
>  typedef struct PnvChip PnvChip;
>  

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

Re: [Qemu-devel] [PATCH v7 RFC] block/vxhs: Initial commit to add Veritas HyperScale VxHS block device support

2016-10-24 Thread Ketan Nilangekar

We are able to derive significant performance from the qemu block driver as 
compared to nbd/iscsi/nfs. We have prototyped nfs and nbd based io tap in the 
past and the performance of qemu block driver is significantly better. Hence we 
would like to go with the vxhs driver for now.

Ketan

> On Oct 24, 2016, at 4:24 PM, Paolo Bonzini  wrote:
> 
> 
> 
>> On 20/10/2016 03:31, Ketan Nilangekar wrote:
>> This way the failover logic will be completely out of qemu address
>> space. We are considering use of some of our proprietary
>> clustering/monitoring services to implement service failover.
> 
> Are you implementing a different protocol just for the sake of QEMU, in
> other words, and forwarding from that protocol to your proprietary code?
> 
> If that is what you are doing, you don't need at all a vxhs driver in
> QEMU.  Just implement NBD or iSCSI on your side, QEMU already has
> drivers for that.
> 
> Paolo

[Qemu-devel] [PATCH 05/10] spapr: update spapr hotplug documentation

2016-10-24 Thread Michael Roth

This updates the existing documentation to reflect recent updates to
the hotplug event structure, which are in draft form but slated
for inclusion in PAPR/LoPAPR.

Signed-off-by: Michael Roth 
Reviewed-by: David Gibson 
---
 docs/specs/ppc-spapr-hotplug.txt | 55 +---
 1 file changed, 46 insertions(+), 9 deletions(-)

diff --git a/docs/specs/ppc-spapr-hotplug.txt b/docs/specs/ppc-spapr-hotplug.txt
index 631b0ca..f57e2a0 100644
--- a/docs/specs/ppc-spapr-hotplug.txt
+++ b/docs/specs/ppc-spapr-hotplug.txt
@@ -233,12 +233,27 @@ tools by host-level management such as an HMC. This level 
of management is not
 applicable to PowerKVM, hence the reason for extending the notification
 framework to support hotplug events.
 
-Note that these events are not yet formally part of the PAPR+ specification,
-but support for this format has already been implemented in DR-related
-guest tools such as powerpc-utils/librtas, as well as kernel patches that have
-been submitted to handle in-kernel processing of memory/cpu-related hotplug
-events[1], and is planned for formal inclusion is PAPR+ specification. The
-hotplug-specific payload is QEMU implemented as follows (with all values
+The format for these EPOW-signalled events is described below under
+"hotplug/unplug event structure". Note that these events are not
+formally part of the PAPR+ specification, and have been superseded by a
+newer format, also described below under "hotplug/unplug event structure",
+and so are now deemed a "legacy" format. The formats are similar, but the
+"modern" format contains additional fields/flags, which are denoted for the
+purposes of this documentation with "#ifdef GUEST_SUPPORTS_MODERN" guards.
+
+QEMU should assume support only for "legacy" fields/flags unless the guest
+advertises support for the "modern" format via ibm,client-architecture-support
+hcall by setting byte 5, bit 6 of its ibm,architecture-vec-5 option vector
+structure (as described by LoPAPR v11, B.6.2.3). As with "legacy" format 
events,
+"modern" format events are surfaced to the guest via check-exception RTAS 
calls,
+but use a dedicated event source to signal the guest. This event source is
+advertised to the guest by the addition of a "hot-plug-events" node under
+"/event-sources" node of the guest's device tree using the standard format
+described in LoPAPR v11, B.6.12.1.
+
+== hotplug/unplug event structure ==
+
+The hotplug-specific payload in QEMU is implemented as follows (with all values
 encoded in big-endian format):
 
 struct rtas_event_log_v6_hp {
@@ -263,14 +278,23 @@ struct rtas_event_log_v6_hp {
 #define RTAS_LOG_V6_HP_ACTION_ADD   1
 #define RTAS_LOG_V6_HP_ACTION_REMOVE2
 uint8_t hotplug_action; /* action (add/remove) */
-#define RTAS_LOG_V6_HP_ID_DRC_NAME  1
-#define RTAS_LOG_V6_HP_ID_DRC_INDEX 2
-#define RTAS_LOG_V6_HP_ID_DRC_COUNT 3
+#define RTAS_LOG_V6_HP_ID_DRC_NAME  1
+#define RTAS_LOG_V6_HP_ID_DRC_INDEX 2
+#define RTAS_LOG_V6_HP_ID_DRC_COUNT 3
+#ifdef GUEST_SUPPORTS_MODERN
+#define RTAS_LOG_V6_HP_ID_DRC_COUNT_INDEXED 4
+#endif
 uint8_t hotplug_identifier; /* type of the resource identifier,
  * which serves as the discriminator
  * for the 'drc' union field below
  */
+#ifdef GUEST_SUPPORTS_MODERN
+uint8_t capabilities;   /* capability flags, currently unused
+ * by QEMU
+ */
+#else
 uint8_t reserved;
+#endif
 union {
 uint32_t index; /* DRC index of resource to take action
  * on
@@ -278,6 +302,19 @@ struct rtas_event_log_v6_hp {
 uint32_t count; /* number of DR resources to take
  * action on (guest chooses which)
  */
+#ifdef GUEST_SUPPORTS_MODERN
+struct {
+uint32_t count; /* number of DR resources to take
+ * action on
+ */
+uint32_t index; /* DRC index of first resource to take
+ * action on. guest will take action
+ * on DRC index <index> through
+ * DRC index <index + count - 1> in
+ * sequential order
+ */
+} count_indexed;
+#endif
 char name[1];   /* string representing the name of the
  * DRC to take action on
  */
-- 
1.9.1

[Qemu-devel] [PATCH 07/10] spapr_events: add support for dedicated hotplug event source

2016-10-24 Thread Michael Roth

Hotplug events were previously delivered using an EPOW interrupt
and were queued by linux guests into a circular buffer. For traditional
EPOW events like shutdown/resets, this isn't an issue, but for hotplug
events there are cases where this buffer can be exhausted, resulting
in the loss of hotplug events, resets, etc.

Newer-style hotplug events are delivered using a dedicated event source.
We enable this in supported guests by adding an additional standard
event source in the guest device-tree via /event-sources, and, if
the guest advertises support for the newer-style hotplug events,
using the corresponding interrupt to signal the availability of
hotplug/unplug events.

Signed-off-by: Michael Roth 
---
 hw/ppc/spapr.c |   4 +-
 hw/ppc/spapr_events.c  | 202 -
 include/hw/ppc/spapr.h |   5 +-
 3 files changed, 170 insertions(+), 41 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index a3ea140..dc4224b 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -973,7 +973,7 @@ static void *spapr_build_fdt(sPAPRMachineState *spapr,
 }
 
 /* /event-sources */
-spapr_dt_events(fdt, spapr->check_exception_irq);
+spapr_dt_events(spapr, fdt);
 
 /* /rtas */
 spapr_dt_rtas(spapr, fdt);
@@ -1917,7 +1917,7 @@ static void ppc_spapr_init(MachineState *machine)
 }
 g_free(filename);
 
-/* Set up EPOW events infrastructure */
+/* Set up RTAS event infrastructure */
 spapr_events_init(spapr);
 
 /* Set up the RTC RTAS interfaces */
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index 89aa5a7..b6b3511 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -40,6 +40,7 @@
 #include "hw/ppc/spapr_drc.h"
 #include "qemu/help_option.h"
 #include "qemu/bcd.h"
+#include "hw/ppc/spapr_ovec.h"
 #include 
 
 struct rtas_error_log {
@@ -206,27 +207,140 @@ struct hp_log_full {
 struct rtas_event_log_v6_hp hp;
 } QEMU_PACKED;
 
-#define EVENT_MASK_INTERNAL_ERRORS   0x8000
-#define EVENT_MASK_EPOW  0x4000
-#define EVENT_MASK_HOTPLUG   0x1000
-#define EVENT_MASK_IO0x0800
+typedef enum EventClass {
+EVENT_CLASS_INTERNAL_ERRORS = 0,
+EVENT_CLASS_EPOW= 1,
+EVENT_CLASS_RESERVED= 2,
+EVENT_CLASS_HOT_PLUG= 3,
+EVENT_CLASS_IO  = 4,
+EVENT_CLASS_MAX
+} EventClassIndex;
+#define EVENT_CLASS_MASK(index) (1 << (31 - index))
+
+static const char *event_names[EVENT_CLASS_MAX] = {
+[EVENT_CLASS_INTERNAL_ERRORS]   = "internal-errors",
+[EVENT_CLASS_EPOW]  = "epow-events",
+[EVENT_CLASS_HOT_PLUG]  = "hot-plug-events",
+[EVENT_CLASS_IO]= "ibm,io-events",
+};
+
+struct sPAPREventSource {
+const char *name;
+int irq;
+uint32_t mask;
+bool enabled;
+};
+
+static sPAPREventSource *spapr_event_sources_new(void)
+{
+sPAPREventSource *event_sources = g_new0(sPAPREventSource,
+ EVENT_CLASS_MAX);
+int i;
+
+for (i = 0; i < EVENT_CLASS_MAX; i++) {
+event_sources[i].name = event_names[i];
+}
 
-void spapr_dt_events(void *fdt, uint32_t check_exception_irq)
+return event_sources;
+}
+
+static void spapr_event_sources_register(sPAPREventSource *event_sources,
+EventClassIndex index, int irq)
 {
-int event_sources, epow_events;
-uint32_t irq_ranges[] = {cpu_to_be32(check_exception_irq), cpu_to_be32(1)};
-uint32_t interrupts[] = {cpu_to_be32(check_exception_irq), 0};
+/* we only support 1 irq per event class at the moment */
+g_assert(event_sources);
+g_assert(!event_sources[index].enabled);
+event_sources[index].irq = irq;
+event_sources[index].mask = EVENT_CLASS_MASK(index);
+event_sources[index].enabled = true;
+}
+
+static const sPAPREventSource
+*spapr_event_sources_get_source(sPAPREventSource *event_sources,
+EventClassIndex index)
+{
+g_assert(index < EVENT_CLASS_MAX);
+g_assert(event_sources);
+
+return _sources[index];
+}
+
+void spapr_dt_events(sPAPRMachineState *spapr, void *fdt)
+{
+uint32_t irq_ranges[EVENT_CLASS_MAX * 2];
+int i, count = 0, event_sources;
+sPAPREventSource *events = spapr->event_sources;
+
+g_assert(events);
 
 _FDT(event_sources = fdt_add_subnode(fdt, 0, "event-sources"));
 
-_FDT(fdt_setprop(fdt, event_sources, "interrupt-controller", NULL, 0));
-_FDT(fdt_setprop_cell(fdt, event_sources, "#interrupt-cells", 2));
-_FDT(fdt_setprop(fdt, event_sources, "interrupt-ranges",
- irq_ranges, sizeof(irq_ranges)));
+for (i = 0, count = 0; i < EVENT_CLASS_MAX; i++) {
+int node_offset;
+uint32_t interrupts[2];
+const sPAPREventSource *source =
+

[Qemu-devel] [PATCH 08/10] spapr: Add DRC count indexed hotplug identifier type

2016-10-24 Thread Michael Roth

From: Bharata B Rao 

Add support for DRC count indexed hotplug ID type which is primarily
needed for memory hot unplug. This type allows for specifying the
number of DRs that should be plugged/unplugged starting from a given
DRC index.

Signed-off-by: Bharata B Rao 
* updated rtas_event_log_v6_hp to reflect count/index field ordering
  used in PAPR hotplug ACR
Signed-off-by: Michael Roth 
---
 hw/ppc/spapr_events.c  | 76 --
 include/hw/ppc/spapr.h |  4 +++
 2 files changed, 65 insertions(+), 15 deletions(-)

diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index b6b3511..596e991 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -175,6 +175,16 @@ struct epow_log_full {
 struct rtas_event_log_v6_epow epow;
 } QEMU_PACKED;
 
+union drc_identifier {
+uint32_t index;
+uint32_t count;
+struct {
+uint32_t count;
+uint32_t index;
+} count_indexed;
+char name[1];
+} QEMU_PACKED;
+
 struct rtas_event_log_v6_hp {
 #define RTAS_LOG_V6_SECTION_ID_HOTPLUG  0x4850 /* HP */
 struct rtas_event_log_v6_section_header hdr;
@@ -191,12 +201,9 @@ struct rtas_event_log_v6_hp {
 #define RTAS_LOG_V6_HP_ID_DRC_NAME   1
 #define RTAS_LOG_V6_HP_ID_DRC_INDEX  2
 #define RTAS_LOG_V6_HP_ID_DRC_COUNT  3
+#define RTAS_LOG_V6_HP_ID_DRC_COUNT_INDEXED  4
 uint8_t reserved;
-union {
-uint32_t index;
-uint32_t count;
-char name[1];
-} drc;
+union drc_identifier drc_id;
 } QEMU_PACKED;
 
 struct hp_log_full {
@@ -496,7 +503,7 @@ static void spapr_hotplug_set_signalled(uint32_t drc_index)
 
 static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
 sPAPRDRConnectorType drc_type,
-uint32_t drc)
+union drc_identifier *drc_id)
 {
 sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
 struct hp_log_full *new_hp;
@@ -541,7 +548,7 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t 
hp_action,
 case SPAPR_DR_CONNECTOR_TYPE_PCI:
 hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_PCI;
 if (hp->hotplug_action == RTAS_LOG_V6_HP_ACTION_ADD) {
-spapr_hotplug_set_signalled(drc);
+spapr_hotplug_set_signalled(drc_id->index);
 }
 break;
 case SPAPR_DR_CONNECTOR_TYPE_LMB:
@@ -559,9 +566,18 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t 
hp_action,
 }
 
 if (hp_id == RTAS_LOG_V6_HP_ID_DRC_COUNT) {
-hp->drc.count = cpu_to_be32(drc);
+hp->drc_id.count = cpu_to_be32(drc_id->count);
 } else if (hp_id == RTAS_LOG_V6_HP_ID_DRC_INDEX) {
-hp->drc.index = cpu_to_be32(drc);
+hp->drc_id.index = cpu_to_be32(drc_id->index);
+} else if (hp_id == RTAS_LOG_V6_HP_ID_DRC_COUNT_INDEXED) {
+/* we should not be using count_indexed value unless the guest
+ * supports dedicated hotplug event source
+ */
+g_assert(spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT));
+hp->drc_id.count_indexed.count =
+cpu_to_be32(drc_id->count_indexed.count);
+hp->drc_id.count_indexed.index =
+cpu_to_be32(drc_id->count_indexed.index);
 }
 
 rtas_event_log_queue(RTAS_LOG_TYPE_HOTPLUG, new_hp, true);
@@ -575,34 +591,64 @@ void spapr_hotplug_req_add_by_index(sPAPRDRConnector *drc)
 {
 sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
 sPAPRDRConnectorType drc_type = drck->get_type(drc);
-uint32_t index = drck->get_index(drc);
+union drc_identifier drc_id;
 
+drc_id.index = drck->get_index(drc);
 spapr_hotplug_req_event(RTAS_LOG_V6_HP_ID_DRC_INDEX,
-RTAS_LOG_V6_HP_ACTION_ADD, drc_type, index);
+RTAS_LOG_V6_HP_ACTION_ADD, drc_type, _id);
 }
 
 void spapr_hotplug_req_remove_by_index(sPAPRDRConnector *drc)
 {
 sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
 sPAPRDRConnectorType drc_type = drck->get_type(drc);
-uint32_t index = drck->get_index(drc);
+union drc_identifier drc_id;
 
+drc_id.index = drck->get_index(drc);
 spapr_hotplug_req_event(RTAS_LOG_V6_HP_ID_DRC_INDEX,
-RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, index);
+RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, _id);
 }
 
 void spapr_hotplug_req_add_by_count(sPAPRDRConnectorType drc_type,
uint32_t count)
 {
+union drc_identifier drc_id;
+
+drc_id.count = count;
 spapr_hotplug_req_event(RTAS_LOG_V6_HP_ID_DRC_COUNT,
-RTAS_LOG_V6_HP_ACTION_ADD, drc_type, count);
+RTAS_LOG_V6_HP_ACTION_ADD, drc_type, _id);
 }
 
 void

[Qemu-devel] [PATCH 06/10] spapr: add hotplug interrupt machine options

2016-10-24 Thread Michael Roth

This adds machine options of the form:

  -machine pseries,modern-hotplug-events=true
  -machine pseries,modern-hotplug-events=false

If false, QEMU will force the use of "legacy" style hotplug events,
which are surfaced through EPOW events instead of a dedicated
hot plug event source, and lack certain features necessary, mainly,
for memory unplug support.

If true, QEMU will enable support for "modern" dedicated hot plug
event source. Note that we will still default to "legacy" style unless
the guest advertises support for the "modern" hotplug events via
ibm,client-architecture-support hcall during early boot.

For pseries-2.7 and earlier we default to false, for newer machine
types we default to true.

Signed-off-by: Michael Roth 
---
 hw/ppc/spapr.c  | 33 +
 include/hw/ppc/spapr.h  |  1 +
 include/hw/ppc/spapr_ovec.h |  1 +
 3 files changed, 35 insertions(+)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 828072a..a3ea140 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1789,6 +1789,11 @@ static void ppc_spapr_init(MachineState *machine)
 
 spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);
 
+/* advertise support for dedicated HP event source to guests */
+if (spapr->use_hotplug_event_source) {
+spapr_ovec_set(spapr->ov5, OV5_HP_EVT);
+}
+
 /* init CPUs */
 if (machine->cpu_model == NULL) {
 machine->cpu_model = kvm_enabled() ? "host" : smc->tcg_default_cpu;
@@ -2138,16 +2143,41 @@ static void spapr_set_kvm_type(Object *obj, const char 
*value, Error **errp)
 spapr->kvm_type = g_strdup(value);
 }
 
+static bool spapr_get_modern_hotplug_events(Object *obj, Error **errp)
+{
+sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
+
+return spapr->use_hotplug_event_source;
+}
+
+static void spapr_set_modern_hotplug_events(Object *obj, bool value,
+Error **errp)
+{
+sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
+
+spapr->use_hotplug_event_source = value;
+}
+
 static void spapr_machine_initfn(Object *obj)
 {
 sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
 
 spapr->htab_fd = -1;
+spapr->use_hotplug_event_source = true;
 object_property_add_str(obj, "kvm-type",
 spapr_get_kvm_type, spapr_set_kvm_type, NULL);
 object_property_set_description(obj, "kvm-type",
 "Specifies the KVM virtualization mode 
(HV, PR)",
 NULL);
+object_property_add_bool(obj, "modern-hotplug-events",
+spapr_get_modern_hotplug_events,
+spapr_set_modern_hotplug_events,
+NULL);
+object_property_set_description(obj, "modern-hotplug-events",
+"Use dedicated hotplug event mechanism in"
+" place of standard EPOW events when 
possible"
+" (required for memory hot-unplug 
support)",
+NULL);
 }
 
 static void spapr_machine_finalizefn(Object *obj)
@@ -2594,7 +2624,10 @@ static void phb_placement_2_7(sPAPRMachineState *spapr, 
uint32_t index,
 
 static void spapr_machine_2_7_instance_options(MachineState *machine)
 {
+sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
+
 spapr_machine_2_8_instance_options(machine);
+spapr->use_hotplug_event_source = false;
 }
 
 static void spapr_machine_2_7_class_options(MachineClass *mc)
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index b6f9f1b..851f536 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -80,6 +80,7 @@ struct sPAPRMachineState {
 uint32_t check_exception_irq;
 Notifier epow_notifier;
 QTAILQ_HEAD(, sPAPREventLogEntry) pending_events;
+bool use_hotplug_event_source;
 
 /* Migration state */
 int htab_save_index;
diff --git a/include/hw/ppc/spapr_ovec.h b/include/hw/ppc/spapr_ovec.h
index 47fa04c..92167c6 100644
--- a/include/hw/ppc/spapr_ovec.h
+++ b/include/hw/ppc/spapr_ovec.h
@@ -45,6 +45,7 @@ typedef struct sPAPROptionVector sPAPROptionVector;
 /* option vector 5 */
 #define OV5_DRCONF_MEMORY   OV_BIT(2, 2)
 #define OV5_FORM1_AFFINITY  OV_BIT(5, 0)
+#define OV5_HP_EVT  OV_BIT(6, 5)
 
 /* interfaces */
 sPAPROptionVector *spapr_ovec_new(void);
-- 
1.9.1

[Qemu-devel] [PATCH 01/10] spapr_ovec: initial implementation of option vector helpers

2016-10-24 Thread Michael Roth

PAPR guests advertise their capabilities to the platform by passing
an ibm,architecture-vec structure via an
ibm,client-architecture-support hcall as described by LoPAPR v11,
B.6.2.3. during early boot.

Using this information, the platform enables the capabilities it
supports, then encodes a subset of those enabled capabilities (the
5th option vector of the ibm,architecture-vec structure passed to
ibm,client-architecture-support) into the guest device tree via
"/chosen/ibm,architecture-vec-5".

The logical format of these option vectors is a bit-vector,
where individual bits are addressed/documented based on the byte-wise
offset from the beginning of the bit-vector, followed by the bit-wise
index starting from the byte-wise offset. Thus the bits of each of
these bytes are stored in reverse order. Additionally, the first
byte of each option vector encodes the length of the option vector,
so byte offsets begin at 1, and bit offset at 0.

This is not very intuitive for the purposes of mapping these bits to
a particular documented capability, so this patch introduces a set
of abstractions that encapsulate the work of parsing/encoding these
options vectors and testing for individual capabilities.

Cc: Bharata B Rao 
Signed-off-by: Michael Roth 
---
 hw/ppc/Makefile.objs|   2 +-
 hw/ppc/spapr_ovec.c | 242 
 include/hw/ppc/spapr_ovec.h |  62 
 3 files changed, 305 insertions(+), 1 deletion(-)
 create mode 100644 hw/ppc/spapr_ovec.c
 create mode 100644 include/hw/ppc/spapr_ovec.h

diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index ebc72af..8025129 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -4,7 +4,7 @@ obj-y += ppc.o ppc_booke.o fdt.o
 obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o
 obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
 obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
-obj-$(CONFIG_PSERIES) += spapr_cpu_core.o
+obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o
 # IBM PowerNV
 obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o
 ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
diff --git a/hw/ppc/spapr_ovec.c b/hw/ppc/spapr_ovec.c
new file mode 100644
index 000..c2a0d18
--- /dev/null
+++ b/hw/ppc/spapr_ovec.c
@@ -0,0 +1,242 @@
+/*
+ * QEMU SPAPR Architecture Option Vector Helper Functions
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Authors:
+ *  Bharata B Rao 
+ *  Michael Roth  
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/ppc/spapr_ovec.h"
+#include "qemu/bitmap.h"
+#include "exec/address-spaces.h"
+#include "qemu/error-report.h"
+#include 
+
+/* #define DEBUG_SPAPR_OVEC */
+
+#ifdef DEBUG_SPAPR_OVEC
+#define DPRINTFN(fmt, ...) \
+do { fprintf(stderr, fmt "\n", ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTFN(fmt, ...) \
+do { } while (0)
+#endif
+
+#define OV_MAXBYTES 256 /* not including length byte */
+#define OV_MAXBITS (OV_MAXBYTES * BITS_PER_BYTE)
+
+/* we *could* work with bitmaps directly, but handling the bitmap privately
+ * allows us to more safely make assumptions about the bitmap size and
+ * simplify the calling code somewhat
+ */
+struct sPAPROptionVector {
+unsigned long *bitmap;
+};
+
+sPAPROptionVector *spapr_ovec_new(void)
+{
+sPAPROptionVector *ov;
+
+ov = g_new0(sPAPROptionVector, 1);
+ov->bitmap = bitmap_new(OV_MAXBITS);
+
+return ov;
+}
+
+sPAPROptionVector *spapr_ovec_clone(sPAPROptionVector *ov_orig)
+{
+sPAPROptionVector *ov;
+
+g_assert(ov_orig);
+
+ov = spapr_ovec_new();
+bitmap_copy(ov->bitmap, ov_orig->bitmap, OV_MAXBITS);
+
+return ov;
+}
+
+void spapr_ovec_intersect(sPAPROptionVector *ov,
+  sPAPROptionVector *ov1,
+  sPAPROptionVector *ov2)
+{
+g_assert(ov);
+g_assert(ov1);
+g_assert(ov2);
+
+bitmap_and(ov->bitmap, ov1->bitmap, ov2->bitmap, OV_MAXBITS);
+}
+
+/* returns true if options bits were removed, false otherwise */
+bool spapr_ovec_diff(sPAPROptionVector *ov,
+ sPAPROptionVector *ov_old,
+ sPAPROptionVector *ov_new)
+{
+unsigned long *change_mask = bitmap_new(OV_MAXBITS);
+unsigned long *removed_bits = bitmap_new(OV_MAXBITS);
+bool bits_were_removed = false;
+
+g_assert(ov);
+g_assert(ov_old);
+g_assert(ov_new);
+
+bitmap_xor(change_mask, ov_old->bitmap, ov_new->bitmap, OV_MAXBITS);
+bitmap_and(ov->bitmap, ov_new->bitmap, change_mask, OV_MAXBITS);
+bitmap_and(removed_bits, ov_old->bitmap, change_mask, OV_MAXBITS);
+
+if (!bitmap_empty(removed_bits, OV_MAXBITS)) {
+bits_were_removed = true;
+}
+
+

[Qemu-devel] [PATCH 00/10] spapr: option vector re-work and memory unplug support

2016-10-24 Thread Michael Roth

This series is based on David's ppc-for-2.8 branch, and is also available from:

  https://github.com/mdroth/qemu/commits/spapr-hotplug-event-update

Changes since RFC:
  * Submit as v1 now that PAPR Hotplug ACR is accepted
  * Rebase on latest ppc-for-2.8 (with device-tree refactoring)
  * address Patchew warnings
  * add comments to clarify spapr->ov5/ov5_cas usage. (David)
  * revise comment to clarify intent when setting spapr->ov5
OV5_HP_EVT bit. (Bharata)
  * drop internal usage of spapr_ovec_from_bitmap() in favor of
directly assigning bitmap to sPAPROptionVector instances. (David)
  * standardize meaning of 'vector_len' variable through spapr_ovec_*
functions to be the byte-wise length of option vectors entries,
and not including the preceding length byte itself. (David)
  * fix spapr_ovec_populate_dt() to parse up to OV_MAXBITS bits
rather than OV_MAXBITS - 1. (David)
  * fix spapr_ovec_populate_dt() to encode the minimum of 1 option
vector byte instead of the max of OV_MAXBYTES in cases where
no option bits are set. (David)
  * add some comments to spapr_ovec_populate_dt() to clarify what
is being encoded into length byte of ibm,architecture-vec-5
  * switch 'legacy-hotplug-events' option to
'modern-hotplug-events' (David)
  * modify rtas_event_log_to_source() to check for OV5_HP_EVT
option rather than relying on whether the hotplug source is
specifically enabled. Assert the latter in cases where
OV5_HP_EVT is set. (Bharata)
  * drop global EventSource list in favor of an sPAPREventSource
list field within sPAPRMachineState (David)
  * add CPU unplug hook in mc->unplug_request (Bharata)


Patches 1-4 address various deficiencies in how we currently handle option
vectors via ibm,client-architecture-support. This is done here in preparation
for a new option vector bit introduced later in this series, as well as a
number of future option vector bits related to other features, but I can
break this out into a separate series if preferred.

Patches 5-7 add support for an updated event format for hotplug events,
which includes a new way to specify a range of DRCs/LMBs to hotplug/unplug
using a starting position and count, which is necessary for memory unplug.
The format for this new event format is still in draft form, but slated
for inclusion in the PAPR/LoPAPR.

Patches 8-10 add support for memory unplug using the new event format.

In addition to kernel 4.8 or later, there are a number of patches required
to enable support on the guest kernel side. I've included the minimum set
of patches in my branch here:

   https://github.com/mdroth/linux/commits/spapr-hotplug-event-update

   *powerpc/pseries: advertise Hot Plug Event support to firmware
   powerpc/pseries: Implement indexed-count hotplug memory remove
   powerpc/pseries: Implement indexed-count hotplug memory add

Note that there is currently an issue that arises when attempting to
offline an LMB that was onlined using a guest kernel's auto-onlining
mechanism, which can prevent full completion of memory unplug requests.
This is being investigated, but for the purposes of testing this can
be worked around currently by disabling auto-onlining in guests via:

  "echo offline >/sys/devices/system/memory/auto_online_blocks"

and instead onlining the blocks manually or via udev.

 docs/specs/ppc-spapr-hotplug.txt |  55 +-
 hw/ppc/Makefile.objs |   2 +-
 hw/ppc/spapr.c   | 243 
+
 hw/ppc/spapr_drc.c   |  17 
 hw/ppc/spapr_events.c| 278 
+-
 hw/ppc/spapr_hcall.c |  70 ++---
 hw/ppc/spapr_ovec.c  | 242 
+
 include/hw/ppc/spapr.h   |  17 ++--
 include/hw/ppc/spapr_ovec.h  |  67 
 9 files changed, 868 insertions(+), 123 deletions(-)

[Qemu-devel] [PATCH 04/10] spapr: improve ibm, architecture-vec-5 property handling

2016-10-24 Thread Michael Roth

ibm,architecture-vec-5 is supposed to encode all option vector 5 bits
negotiated between platform/guest. Currently we hardcode this property
in the boot-time device tree to advertise a single negotiated
capability, "Form 1" NUMA Affinity, regardless of whether or not CAS
has been invoked or that capability has actually been negotiated.

Improve this by generating ibm,architecture-vec-5 based on the full
set of option vector 5 capabilities negotiated via CAS.

Signed-off-by: Michael Roth 
---
 hw/ppc/spapr.c  | 23 +--
 include/hw/ppc/spapr_ovec.h |  1 +
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 3b64580..828072a 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -659,14 +659,28 @@ static int spapr_dt_cas_updates(sPAPRMachineState *spapr, 
void *fdt,
 sPAPROptionVector *ov5_updates)
 {
 sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
-int ret = 0;
+int ret = 0, offset;
 
 /* Generate ibm,dynamic-reconfiguration-memory node if required */
 if (spapr_ovec_test(ov5_updates, OV5_DRCONF_MEMORY)) {
 g_assert(smc->dr_lmb_enabled);
 ret = spapr_populate_drconf_memory(spapr, fdt);
+if (ret) {
+goto out;
+}
 }
 
+offset = fdt_path_offset(fdt, "/chosen");
+if (offset < 0) {
+offset = fdt_add_subnode(fdt, 0, "chosen");
+if (offset < 0) {
+return offset;
+}
+}
+ret = spapr_ovec_populate_dt(fdt, offset, spapr->ov5_cas,
+ "ibm,architecture-vec-5");
+
+out:
 return ret;
 }
 
@@ -792,14 +806,9 @@ static void spapr_dt_chosen(sPAPRMachineState *spapr, void 
*fdt)
 char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
 size_t cb = 0;
 char *bootlist = get_boot_devices_list(, true);
-unsigned char vec5[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x80};
 
 _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen"));
 
-/* Set Form1_affinity */
-_FDT(fdt_setprop(fdt, chosen, "ibm,architecture-vec-5",
- vec5, sizeof(vec5)));
-
 _FDT(fdt_setprop_string(fdt, chosen, "bootargs", machine->kernel_cmdline));
 _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-start",
   spapr->initrd_base));
@@ -1778,6 +1787,8 @@ static void ppc_spapr_init(MachineState *machine)
 spapr_validate_node_memory(machine, _fatal);
 }
 
+spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);
+
 /* init CPUs */
 if (machine->cpu_model == NULL) {
 machine->cpu_model = kvm_enabled() ? "host" : smc->tcg_default_cpu;
diff --git a/include/hw/ppc/spapr_ovec.h b/include/hw/ppc/spapr_ovec.h
index 09afd59..47fa04c 100644
--- a/include/hw/ppc/spapr_ovec.h
+++ b/include/hw/ppc/spapr_ovec.h
@@ -44,6 +44,7 @@ typedef struct sPAPROptionVector sPAPROptionVector;
 
 /* option vector 5 */
 #define OV5_DRCONF_MEMORY   OV_BIT(2, 2)
+#define OV5_FORM1_AFFINITY  OV_BIT(5, 0)
 
 /* interfaces */
 sPAPROptionVector *spapr_ovec_new(void);
-- 
1.9.1

[Qemu-devel] [PATCH 02/10] spapr_hcall: use spapr_ovec_* interfaces for CAS options

2016-10-24 Thread Michael Roth

Currently we access individual bytes of an option vector via
ldub_phys() to test for the presence of a particular capability
within that byte. Currently this is only done for the "dynamic
reconfiguration memory" capability bit. If that bit is present,
we pass a boolean value to spapr_h_cas_compose_response()
to generate a modified device tree segment with the additional
properties required to enable this functionality.

As more capability bits are added, we would need to modify the
code to add additional option vector accesses and extend the
param list for spapr_h_cas_compose_response() to include similar
boolean values for these parameters.

Avoid this by switching to spapr_ovec_* helpers so we can do all
the parsing in one shot and then test for these additional bits
within spapr_h_cas_compose_response() directly.

Cc: Bharata B Rao 
Signed-off-by: Michael Roth 
Reviewed-by: David Gibson 
Reviewed-by: Bharata B Rao 
---
 hw/ppc/spapr.c  | 10 ++--
 hw/ppc/spapr_hcall.c| 56 -
 include/hw/ppc/spapr.h  |  5 +++-
 include/hw/ppc/spapr_ovec.h |  3 +++
 4 files changed, 30 insertions(+), 44 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 593d437..af5a239 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -657,7 +657,7 @@ out:
 
 int spapr_h_cas_compose_response(sPAPRMachineState *spapr,
  target_ulong addr, target_ulong size,
- bool cpu_update, bool memory_update)
+ bool cpu_update)
 {
 void *fdt, *fdt_skel;
 sPAPRDeviceTreeUpdateHeader hdr = { .version_id = 1 };
@@ -681,7 +681,8 @@ int spapr_h_cas_compose_response(sPAPRMachineState *spapr,
 }
 
 /* Generate ibm,dynamic-reconfiguration-memory node if required */
-if (memory_update && smc->dr_lmb_enabled) {
+if (spapr_ovec_test(spapr->ov5_cas, OV5_DRCONF_MEMORY)) {
+g_assert(smc->dr_lmb_enabled);
 _FDT((spapr_populate_drconf_memory(spapr, fdt)));
 }
 
@@ -1740,7 +1741,12 @@ static void ppc_spapr_init(MachineState *machine)
DIV_ROUND_UP(max_cpus * smt, smp_threads),
XICS_IRQS_SPAPR, _fatal);
 
+/* Set up containers for ibm,client-set-architecture negotiated options */
+spapr->ov5 = spapr_ovec_new();
+spapr->ov5_cas = spapr_ovec_new();
+
 if (smc->dr_lmb_enabled) {
+spapr_ovec_set(spapr->ov5, OV5_DRCONF_MEMORY);
 spapr_validate_node_memory(machine, _fatal);
 }
 
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index c5e7e8c..f1d081b 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -11,6 +11,7 @@
 #include "trace.h"
 #include "sysemu/kvm.h"
 #include "kvm_ppc.h"
+#include "hw/ppc/spapr_ovec.h"
 
 struct SPRSyncState {
 int spr;
@@ -880,32 +881,6 @@ static target_ulong h_set_mode(PowerPCCPU *cpu, 
sPAPRMachineState *spapr,
 return ret;
 }
 
-/*
- * Return the offset to the requested option vector @vector in the
- * option vector table @table.
- */
-static target_ulong cas_get_option_vector(int vector, target_ulong table)
-{
-int i;
-char nr_vectors, nr_entries;
-
-if (!table) {
-return 0;
-}
-
-nr_vectors = (ldl_phys(_space_memory, table) >> 24) + 1;
-if (!vector || vector > nr_vectors) {
-return 0;
-}
-table++; /* skip nr option vectors */
-
-for (i = 0; i < vector - 1; i++) {
-nr_entries = ldl_phys(_space_memory, table) >> 24;
-table += nr_entries + 2;
-}
-return table;
-}
-
 typedef struct {
 uint32_t cpu_version;
 Error *err;
@@ -961,23 +936,21 @@ static void cas_handle_compat_cpu(PowerPCCPUClass *pcc, 
uint32_t pvr,
 }
 }
 
-#define OV5_DRCONF_MEMORY 0x20
-
 static target_ulong h_client_architecture_support(PowerPCCPU *cpu_,
   sPAPRMachineState *spapr,
   target_ulong opcode,
   target_ulong *args)
 {
 target_ulong list = ppc64_phys_to_real(args[0]);
-target_ulong ov_table, ov5;
+target_ulong ov_table;
 PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu_);
 CPUState *cs;
-bool cpu_match = false, cpu_update = true, memory_update = false;
+bool cpu_match = false, cpu_update = true;
 unsigned old_cpu_version = cpu_->cpu_version;
 unsigned compat_lvl = 0, cpu_version = 0;
 unsigned max_lvl = get_compat_level(cpu_->max_compat);
 int counter;
-char ov5_byte2;
+sPAPROptionVector *ov5_guest;
 
 /* Parse PVR list */
 for (counter = 0; counter < 512; ++counter) {
@@ -1033,19 +1006,20 @@ static target_ulong 
h_client_architecture_support(PowerPCCPU *cpu_,
 /* For the future use: here @ov_table points to the

[Qemu-devel] [PATCH 09/10] spapr: use count+index for memory hotplug

2016-10-24 Thread Michael Roth

Commit 0a417869:

spapr: Move memory hotplug to RTAS_LOG_V6_HP_ID_DRC_COUNT type

dropped per-DRC/per-LMB hotplug events in favor of a bulk add via a
single LMB count value. This was to avoid overrunning the guest EPOW
event queue with hotplug events. This works fine, but relies on the
guest exhaustively scanning for pluggable LMBs to satisfy the
requested count by issuing rtas-get-sensor(DR_ENTITY_SENSE, ...) calls
until all the LMBs associated with the DIMM are identified.

With newer support for dedicated hotplug event source, this queue
exhaustion is no longer as much of an issue due to implementation
details on the guest side, but we still try to avoid excessive hotplug
events by now supporting both a count and a starting index to avoid
unnecessary work. This patch makes use of that approach when the
capability is available.

Cc: bhar...@linux.vnet.ibm.com
Signed-off-by: Michael Roth 
---
 hw/ppc/spapr.c | 22 ++
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index dc4224b..0b3aa2f 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2202,14 +2202,16 @@ static void spapr_nmi(NMIState *n, int cpu_index, Error 
**errp)
 }
 }
 
-static void spapr_add_lmbs(DeviceState *dev, uint64_t addr, uint64_t size,
-   uint32_t node, Error **errp)
+static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t 
size,
+   uint32_t node, bool dedicated_hp_event_source,
+   Error **errp)
 {
 sPAPRDRConnector *drc;
 sPAPRDRConnectorClass *drck;
 uint32_t nr_lmbs = size/SPAPR_MEMORY_BLOCK_SIZE;
 int i, fdt_offset, fdt_size;
 void *fdt;
+uint64_t addr = addr_start;
 
 for (i = 0; i < nr_lmbs; i++) {
 drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_LMB,
@@ -2228,7 +2230,17 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t 
addr, uint64_t size,
  * guest only in case of hotplugged memory
  */
 if (dev->hotplugged) {
-   spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB, nr_lmbs);
+if (dedicated_hp_event_source) {
+drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_LMB,
+addr_start / SPAPR_MEMORY_BLOCK_SIZE);
+drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+spapr_hotplug_req_add_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
+   nr_lmbs,
+   drck->get_index(drc));
+} else {
+spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB,
+   nr_lmbs);
+}
 }
 }
 
@@ -2261,7 +2273,9 @@ static void spapr_memory_plug(HotplugHandler 
*hotplug_dev, DeviceState *dev,
 goto out;
 }
 
-spapr_add_lmbs(dev, addr, size, node, _abort);
+spapr_add_lmbs(dev, addr, size, node,
+   spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
+   _abort);
 
 out:
 error_propagate(errp, local_err);
-- 
1.9.1

[Qemu-devel] [PATCH 03/10] spapr: add option vector handling in CAS-generated resets

2016-10-24 Thread Michael Roth

In some cases, ibm,client-architecture-support calls can fail. This
could happen in the current code for situations where the modified
device tree segment exceeds the buffer size provided by the guest
via the call parameters. In these cases, QEMU will reset, allowing
an opportunity to regenerate the device tree from scratch via
boot-time handling. There are potentially other scenarios as well,
not currently reachable in the current code, but possible in theory,
such as cases where device-tree properties or nodes need to be removed.

We currently don't handle either of these properly for option vector
capabilities however. Instead of carrying the negotiated capability
beyond the reset and creating the boot-time device tree accordingly,
we start from scratch, generating the same boot-time device tree as we
did prior to the CAS-generated reset and the same device tree updates as we
did before. This could (in theory) cause us to get stuck in a reset
loop. This hasn't been observed, but depending on the extensiveness
of CAS-induced device tree updates in the future, could eventually
become an issue.

Address this by pulling capability-related device tree
updates resulting from CAS calls into a common routine,
spapr_dt_cas_updates(), and adding an sPAPROptionVector*
parameter that allows us to test for newly-negotiated capabilities.
We invoke it as follows:

1) When ibm,client-architecture-support gets called, we
   call spapr_dt_cas_updates() with the set of capabilities
   added since the previous call to ibm,client-architecture-support.
   For the initial boot, or a system reset generated by something
   other than the CAS call itself, this set will consist of *all*
   options supported by both the platform and the guest. For calls
   to ibm,client-architecture-support immediately after a CAS-induced
   reset, we call spapr_dt_cas_updates() with only the set
   of capabilities added since the previous call, since the other
   capabilities will have already been addressed by the boot-time
   device-tree this time around. In the unlikely event that
   capabilities are *removed* since the previous CAS, we will
   generate a CAS-induced reset. In the unlikely event that we
   cannot fit the device-tree updates into the buffer provided
   by the guest, we'll generate a CAS-induced reset.

2) When a CAS update results in the need to reset the machine and
   include the updates in the boot-time device tree, we call the
   spapr_dt_cas_updates() using the full set of negotiated
   capabilities as part of the reset path. At initial boot, or after
   a reset generated by something other than the CAS call itself,
   this set will be empty, resulting in what should be the same
   boot-time device-tree as we generated prior to this patch. For
   CAS-induced reset, this routine will be called with the full set of
   capabilities negotiated by the platform/guest in the previous
   CAS call, which should result in CAS updates from previous call
   being accounted for in the initial boot-time device tree.

Signed-off-by: Michael Roth 
Reviewed-by: David Gibson 
---
 hw/ppc/spapr.c | 40 ++--
 hw/ppc/spapr_hcall.c   | 22 ++
 include/hw/ppc/spapr.h |  4 +++-
 3 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index af5a239..3b64580 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -655,13 +655,28 @@ out:
 return ret;
 }
 
+static int spapr_dt_cas_updates(sPAPRMachineState *spapr, void *fdt,
+sPAPROptionVector *ov5_updates)
+{
+sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
+int ret = 0;
+
+/* Generate ibm,dynamic-reconfiguration-memory node if required */
+if (spapr_ovec_test(ov5_updates, OV5_DRCONF_MEMORY)) {
+g_assert(smc->dr_lmb_enabled);
+ret = spapr_populate_drconf_memory(spapr, fdt);
+}
+
+return ret;
+}
+
 int spapr_h_cas_compose_response(sPAPRMachineState *spapr,
  target_ulong addr, target_ulong size,
- bool cpu_update)
+ bool cpu_update,
+ sPAPROptionVector *ov5_updates)
 {
 void *fdt, *fdt_skel;
 sPAPRDeviceTreeUpdateHeader hdr = { .version_id = 1 };
-sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(qdev_get_machine());
 
 size -= sizeof(hdr);
 
@@ -680,10 +695,8 @@ int spapr_h_cas_compose_response(sPAPRMachineState *spapr,
 _FDT((spapr_fixup_cpu_dt(fdt, spapr)));
 }
 
-/* Generate ibm,dynamic-reconfiguration-memory node if required */
-if (spapr_ovec_test(spapr->ov5_cas, OV5_DRCONF_MEMORY)) {
-g_assert(smc->dr_lmb_enabled);
-_FDT((spapr_populate_drconf_memory(spapr, fdt)));
+if (spapr_dt_cas_updates(spapr, fdt, ov5_updates)) {
+return -1;
 }
 
 /* Pack resulting tree */
@@ -972,6

[Qemu-devel] [PATCH 10/10] spapr: Memory hot-unplug support

2016-10-24 Thread Michael Roth

From: Bharata B Rao 

Add support to hot remove pc-dimm memory devices.

Since we're introducing a machine-level unplug_request hook, we also
add handling for CPU unplug there as well to ensure CPU unplug
continues to work as it did before.

Signed-off-by: Bharata B Rao 
* add hooks to CAS/cmdline enablement of hotplug ACR support
* add hook for CPU unplug
Signed-off-by: Michael Roth 
---
 hw/ppc/spapr.c | 119 -
 hw/ppc/spapr_drc.c |  17 
 2 files changed, 135 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 0b3aa2f..a4a6058 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2281,6 +2281,90 @@ out:
 error_propagate(errp, local_err);
 }
 
+typedef struct sPAPRDIMMState {
+uint32_t nr_lmbs;
+} sPAPRDIMMState;
+
+static void spapr_lmb_release(DeviceState *dev, void *opaque)
+{
+sPAPRDIMMState *ds = (sPAPRDIMMState *)opaque;
+HotplugHandler *hotplug_ctrl = NULL;
+
+if (--ds->nr_lmbs) {
+return;
+}
+
+g_free(ds);
+
+/*
+ * Now that all the LMBs have been removed by the guest, call the
+ * pc-dimm unplug handler to cleanup up the pc-dimm device.
+ */
+hotplug_ctrl = qdev_get_hotplug_handler(dev);
+hotplug_handler_unplug(hotplug_ctrl, dev, _abort);
+}
+
+static void spapr_del_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t 
size,
+   Error **errp)
+{
+sPAPRDRConnector *drc;
+sPAPRDRConnectorClass *drck;
+uint32_t nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
+int i;
+sPAPRDIMMState *ds = g_malloc0(sizeof(sPAPRDIMMState));
+uint64_t addr = addr_start;
+
+ds->nr_lmbs = nr_lmbs;
+for (i = 0; i < nr_lmbs; i++) {
+drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_LMB,
+addr / SPAPR_MEMORY_BLOCK_SIZE);
+g_assert(drc);
+
+drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+drck->detach(drc, dev, spapr_lmb_release, ds, errp);
+addr += SPAPR_MEMORY_BLOCK_SIZE;
+}
+
+drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_LMB,
+   addr_start / SPAPR_MEMORY_BLOCK_SIZE);
+drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+spapr_hotplug_req_remove_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
+  nr_lmbs,
+  drck->get_index(drc));
+}
+
+static void spapr_memory_unplug(HotplugHandler *hotplug_dev, DeviceState *dev,
+Error **errp)
+{
+sPAPRMachineState *ms = SPAPR_MACHINE(hotplug_dev);
+PCDIMMDevice *dimm = PC_DIMM(dev);
+PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
+MemoryRegion *mr = ddc->get_memory_region(dimm);
+
+pc_dimm_memory_unplug(dev, >hotplug_memory, mr);
+object_unparent(OBJECT(dev));
+}
+
+static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev,
+DeviceState *dev, Error **errp)
+{
+Error *local_err = NULL;
+PCDIMMDevice *dimm = PC_DIMM(dev);
+PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
+MemoryRegion *mr = ddc->get_memory_region(dimm);
+uint64_t size = memory_region_size(mr);
+uint64_t addr;
+
+addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, 
_err);
+if (local_err) {
+goto out;
+}
+
+spapr_del_lmbs(dev, addr, size, _abort);
+out:
+error_propagate(errp, local_err);
+}
+
 void *spapr_populate_hotplug_cpu_dt(CPUState *cs, int *fdt_offset,
 sPAPRMachineState *spapr)
 {
@@ -2354,10 +2438,42 @@ static void spapr_machine_device_plug(HotplugHandler 
*hotplug_dev,
 static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev,
   DeviceState *dev, Error **errp)
 {
+sPAPRMachineState *sms = SPAPR_MACHINE(qdev_get_machine());
 MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine());
 
 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
-error_setg(errp, "Memory hot unplug not supported by sPAPR");
+if (spapr_ovec_test(sms->ov5_cas, OV5_HP_EVT)) {
+spapr_memory_unplug(hotplug_dev, dev, errp);
+} else {
+error_setg(errp, "Memory hot unplug not supported for this guest");
+}
+} else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
+if (!mc->query_hotpluggable_cpus) {
+error_setg(errp, "CPU hot unplug not supported on this machine");
+return;
+}
+spapr_core_unplug(hotplug_dev, dev, errp);
+}
+}
+
+static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev,
+DeviceState *dev, Error **errp)
+{
+sPAPRMachineState *sms = SPAPR_MACHINE(qdev_get_machine());
+MachineClass *mc =

Re: [Qemu-devel] [PATCH 3/4] target-ppc: add vrldnmi and vrlwmi instructions

2016-10-24 Thread Nikunj A Dadhania

Richard Henderson  writes:

> On 10/24/2016 09:08 PM, Nikunj A Dadhania wrote:
>> Richard Henderson  writes:
>>
>>> On 10/24/2016 02:14 AM, Nikunj A Dadhania wrote:
 +#define EXTRACT_BITS(size)  \
 +static inline uint##size##_t extract_bits_u##size(uint##size##_t reg,   \
 +  uint##size##_t start, \
 +  uint##size##_t end)   \
 +{   \
 +uint##size##_t nr_mask_bits = end - start + 1;  \
 +uint##size##_t val = 1; \
 +uint##size##_t mask = (val << nr_mask_bits) - 1;\
 +uint##size##_t shifted_reg = reg  >> ((size - 1)  - end);   \
 +return shifted_reg & mask;  \
 +}
 +
 +EXTRACT_BITS(64);
 +EXTRACT_BITS(32);
>>>
>>> We already have extract32 and extract64, which you're (nearly) duplicating.
>>
>> The bit position number notation is different, because of this using the
>> above routine, MSB=0 and LSB=63.
>>
>> While the below assumes: MSB=63 and LSB=0
>>
>> static inline uint64_t extract64(uint64_t value, int start, int length)
>> {
>> assert(start >= 0 && length > 0 && length <= 64 - start);
>> return (value >> start) & (~0ULL >> (64 - length));
>> }
>>
>> Let me know if I am missing something here.
>
> Since the arguments to extract_bits_uN are completely under your control, via 
> the arguments to VRLMI, this is a non-argument.  Just change them to 
> little-endian position + length.

Sure, was already trying that, I have the changed version now:

#define VRLMI(name, size, element,  \
  begin_last,   \
  end_last, \
  shift_last, num_bits, insert) \
void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)\
{   \
int i;  \
for (i = 0; i < ARRAY_SIZE(r->element); i++) {  \
uint##size##_t src1 = a->element[i];\
uint##size##_t src2 = b->element[i];\
uint##size##_t src3 = r->element[i];\
uint##size##_t begin, end, shift, mask, rot_val;\
\
begin = extract##size(src2, size - begin_last - 1, num_bits);   \
end = extract##size(src2, size - end_last - 1, num_bits);   \
shift = extract##size(src2, size - shift_last - 1, num_bits);   \
rot_val = rol##size(src1, shift);   \
mask = mask_u##size(begin, end);\
if (insert) {   \
r->element[i] = (rot_val & mask) | (src3 & ~mask);  \
} else {\
r->element[i] = (rot_val & mask);   \
}   \
}   \
}

VRLMI(vrldmi, 64, u64,
  47,  /* begin_last */
  55,  /* end_last */
  63,  /* shift_last */
  6,   /* num_bits */
  1);  /* mask and insert */

>
> (And, after you do that conversion for vrldmi and vrlwmi, you'll see why 
> big-endian bit numbering is the spawn of the devil.  ;-)

That bit numbering gives nightmares ;-)

Regards
Nikunj

Re: [Qemu-devel] [PATCH V7 0/2] Add option to configure guest vPMU

2016-10-24 Thread Wei Huang



On 10/24/2016 03:49 AM, Andrew Jones wrote:
> On Fri, Oct 21, 2016 at 05:53:00PM -0400, Wei Huang wrote:
>> This patchset adds a pmu=[on/off] option to enable/disable vPMU support 
>> for guest VM. There are several reasons to justify this option. First,
>> vPMU can be problematic for cross-migration between different SoC as perf
>> counters are architecture-dependent. It is more flexible to have an option
>> to turn it on/off. Secondly this option matches the "pmu" option as
>> supported in libvirt. To ensure backward compatibility, a PMU-related
>> property is added to mach-virt machine types.
>>
>> The following are testing results with this patchset:
>>  CONFIG (qemu-system-aarch64)   vPMU   WARNING
>>   -M virt-2.8/virt,accel=kvm -cpu host   YESNO
>>   -M virt-2.8/virt,accel=kvm -cpu host,pmu=off   NO NO
>>   -M virt-2.8/virt,accel=kvm -cpu host,pmu=onYESNO
>>   -M virt-2.7,accel=kvm -cpu hostYESNO
>>   -M virt-2.7,accel=kvm -cpu host,pmu=offNO NO
>>   -M virt-2.7,accel=kvm -cpu host,pmu=on YESNO
>>   -M virt-2.6,accel=kvm -cpu hostNO NO
>>   -M virt-2.6,accel=kvm -cpu host,pmu=offNO NO
>>   -M virt-2.6,accel=kvm -cpu host,pmu=on NO NO
>>
>>   -M virt-2.8/virt,accel=tcg -cpu cortex-a57 NO NO
>>   -M virt-2.8/virt,accel=tcg -cpu cortex-a57,pmu=off NO "No PMU property"
>>   -M virt-2.8/virt,accel=tcg -cpu cortex-a57,pmu=on  NO "No PMU property"
>>   -M virt-2.7,accel=tcg -cpu cortex-a57  NO NO
>>   -M virt-2.7,accel=tcg -cpu cortex-a57,pmu=off  NO "No PMU property"
>>   -M virt-2.7,accel=tcg -cpu cortex-a57,pmu=on   NO "No PMU property"
>>   -M virt-2.6,accel=tcg -cpu cortex-a57  NO NO
>>   -M virt-2.6,accel=tcg -cpu cortex-a57,pmu=off  NO "No PMU property"
>>   -M virt-2.6,accel=tcg -cpu cortex-a57,pmu=on   NO "No PMU property"
>>
>>   -M virt-2.8/virt,accel=tcg -cpu cortex-a15 NO NO
>>   -M virt-2.8/virt,accel=tcg -cpu cortex-a15,pmu=off NO "No PMU property"
>>   -M virt-2.8/virt,accel=tcg -cpu cortex-a15,pmu=on  NO "No PMU property"
>>   -M virt-2.7,accel=tcg -cpu cortex-a15  NO NO
>>   -M virt-2.7,accel=tcg -cpu cortex-a15,pmu=off  NO "No PMU property"
>>   -M virt-2.7,accel=tcg -cpu cortex-a15,pmu=on   NO "No PMU property"
>>   -M virt-2.6,accel=tcg -cpu cortex-a15  NO NO
>>   -M virt-2.6,accel=tcg -cpu cortex-a15,pmu=off  NO "No PMU property"
>>   -M virt-2.6,accel=tcg -cpu cortex-a15,pmu=on   NO "No PMU property"
>>
>>   * "No PMU property" msg, e.g.
>> can't apply global cortex-a15-arm-cpu.pmu=off: Property '.pmu' not found
>>
>> V6->V7:
>>   * change has_pmu variable type from OnOffAuto to Boolean
>>   * only add "pmu" property to CPU under kvm mode, default ON
> 
> Hmm, if we don't allow the property with TCG then switching a guest from
> KVM to TCG will require more than just an accelerator switch. That's a
> bit annoying and I think we'd have to teach it to libvirt too. I'd prefer
> 
>  -M virt-2.8,accel=tcg -cpu cortex-a57 NO NO
>  -M virt-2.8,accel=tcg -cpu cortex-a57,pmu=off NO NO
>  -M virt-2.8,accel=tcg -cpu cortex-a57,pmu=on  NO "Warning: PMU not
> yet supported with TCG" (or something)

I am fine with this request. But note that, if we enforce
pmu-default=ON, we can't tell "-cpu cortex-a57" apart from
"cortex-a57,pmu=on", implying that we have to print the warning msg for
the case of "cortex-a57" as well (this was why we switched to tri-state
before). To solve this problem, we have to switch from pmu-default=ON to
pmu-default=OFF under TCG mode, something like:

if (arm_feature(>env, ARM_FEATURE_PMU)) {
qdev_property_add_static(DEVICE(obj),
_cpu_has_pmu_property, _abort);
if (!kvm_enabled())
object_property_set_bool(obj, false, "pmu", NULL);
}

Then we do:
if (cpu->has_pmu && !kvm_enabled()) {
cpu->has_pmu = false;
if (!pmu_warned && !qtest_enabled()) {
error_report("warning: pmu not supported under TCG");
pmu_warned = true;
}
}

This will work. Are you and Peter OK with this solution?

-Wei


> 
> 
>>   * set no_pmu=true for machvirt-2.6
>>
>> V5->V6:
>>   * adapt patches for new machine type 2.8
>>
>> V4->V5:
>>   * remove comment change for has_pmu
>>   * remove warning msg when pmu_default_on=TRUE && has_pmu=AUTO && tcg=TRUE
>>
>> V3->V4:
>>   * change has_pmu from Boolean to OnOffAuto to handle different cases
>>   * "pmu" property is re-defined as DEFINE_PROP_ON_OFF_AUTO
>>
>> V2->V3:
>>   * revise patch 1 commit msg and if-else statement (Drew) 
>>   * move property field into VirtMachineClass (Drew)
>>
>> V1->V2:
>>   * keep the original field name as "has_pmu"
>>   * add a warning message when PMU is turned on without

Re: [Qemu-devel] [PATCH 3/6] target-ppc: add vextu[bhw]rx instructions

2016-10-24 Thread Rajalakshmi Srinivasaraghavan




On 09/28/2016 11:15 AM, Rajalakshmi Srinivasaraghavan wrote:

From: Hariharan T.S 


Attached updated patch based on comments on vextu[bhw]lx.

--
Thanks
Rajalakshmi S

>From f027eb4903b89720634423c335e3688cf1e8632d Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan 
Date: Mon, 24 Oct 2016 12:23:31 +0530
Subject: [PATCH 2/2] target-ppc: add vextu[bhw]rx instructions

vextubrx: Vector Extract Unsigned Byte Right-Indexed VX-form
vextuhrx: Vector Extract Unsigned  Halfword Right-Indexed VX-form
vextuwrx: Vector Extract Unsigned Word Right-Indexed VX-form

Signed-off-by: Hariharan T.S. 
Signed-off-by: Avinesh Kumar 
Signed-off-by: Rajalakshmi Srinivasaraghavan 
---
 target-ppc/helper.h |3 ++
 target-ppc/int_helper.c |   60 +++
 target-ppc/translate/vmx-impl.inc.c |5 +++
 target-ppc/translate/vmx-ops.inc.c  |4 ++-
 4 files changed, 71 insertions(+), 1 deletions(-)

diff --git a/target-ppc/helper.h b/target-ppc/helper.h
index 8551568..f532977 100644
--- a/target-ppc/helper.h
+++ b/target-ppc/helper.h
@@ -360,6 +360,9 @@ DEF_HELPER_3(vpmsumd, void, avr, avr, avr)
 DEF_HELPER_2(vextublx, tl, tl, avr)
 DEF_HELPER_2(vextuhlx, tl, tl, avr)
 DEF_HELPER_2(vextuwlx, tl, tl, avr)
+DEF_HELPER_2(vextubrx, tl, tl, avr)
+DEF_HELPER_2(vextuhrx, tl, tl, avr)
+DEF_HELPER_2(vextuwrx, tl, tl, avr)
 
 DEF_HELPER_2(vsbox, void, avr, avr)
 DEF_HELPER_3(vcipher, void, avr, avr, avr)
diff --git a/target-ppc/int_helper.c b/target-ppc/int_helper.c
index 2b28848..17f0613 100644
--- a/target-ppc/int_helper.c
+++ b/target-ppc/int_helper.c
@@ -1805,6 +1805,66 @@ VEXTULX_DO(vextuhlx, 2)
 VEXTULX_DO(vextuwlx, 4)
 #undef VEXTULX_DO
 
+#if defined(HOST_WORDS_BIGENDIAN)
+# if defined (CONFIG_INT128)\
+#  define VEXTURX_DO(name, elem)\
+target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)  \
+{   \
+target_ulong r = 0; \
+int size =  elem * 8;   \
+int index = (15 - (a & 0xf) + 1) * 8;   \
+r = EXTRACT128(b->u128, (index - size), size);  \
+return r;   \
+}
+# else
+#  define VEXTURX_DO(name, elem)\
+target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)  \
+{   \
+target_ulong r = 0; \
+int i;  \
+int index = a & 0xf;\
+for (i = elem - 1; i >= 0; i--) {   \
+r = r << 8; \
+if ((15 - i - index) >= 0) {\
+r = r | b->u8[15 - i - index];  \
+}   \
+}   \
+return r;   \
+}
+# endif
+#else
+# if defined (CONFIG_INT128)
+#  define VEXTURX_DO(name, elem)\
+target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)  \
+{   \
+target_ulong r = 0; \
+int index = (a & 0xf) * 8;  \
+r = EXTRACT128(b->u128, index, elem * 8);   \
+return r;   \
+}
+# else
+#  define VEXTURX_DO(name, elem)\
+target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)  \
+{   \
+target_ulong r = 0; \
+int i;  \
+int index = 15 - (a & 0xf); \
+for (i = elem - 1; i >= 0; i--) {   \
+r = r << 8; \
+if ((15 + i - index) <= 15) {   \
+r = r | b->u8[15 + i - index];  \
+}   \
+}   \
+return r;   \
+}
+# endif
+#endif
+

Re: [Qemu-devel] [PATCH 2/6] target-ppc: add vextu[bhw]lx instructions

2016-10-24 Thread Rajalakshmi Srinivasaraghavan




On 10/05/2016 10:51 AM, Rajalakshmi Srinivasaraghavan wrote:



On 09/28/2016 10:24 PM, Richard Henderson wrote:

On 09/27/2016 10:45 PM, Rajalakshmi Srinivasaraghavan wrote:

+#if defined(HOST_WORDS_BIGENDIAN)
+#define VEXTULX_DO(name, elem)  \
+target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)  \
+{   \
+target_ulong r = 0; \
+int i;  \
+int index = a & 0xf;\
+for (i = 0; i < elem; i++) {\
+r = r << 8; \
+if (index + i <= 15) {  \
+r = r | b->u8[index + i];   \
+ }   \
+ }   \
+return r;   \
+}
+#else
+#define VEXTULX_DO(name, elem)  \
+target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)  \
+{   \
+target_ulong r = 0; \
+int i;  \
+int index = 15 - (a & 0xf); \
+for (i = 0; i < elem; i++) {\
+r = r << 8; \
+if (index - i >= 0) {   \
+r = r | b->u8[index - i];   \
+ }   \
+ }   \
+return r;   \
+}
+#endif
+
+VEXTULX_DO(vextublx, 1)
+VEXTULX_DO(vextuhlx, 2)
+VEXTULX_DO(vextuwlx, 4)
+#undef VEXTULX_DO

Ew.

This should be one 128-bit shift and one and.

Since the shift amount is a multiple of 8, the 128-bit shift for 
vextub[lr]x
does not need to cross a double-word boundary, and so can be 
decomposed into

one 64-bit shift of (count & 64 ? hi : lo).

For vextu[hw][lr]x, you'd need to do the whole left-shift, 
right-shift, or thing.


But still, fantastically better than a loop.

Ack. Will send an updated patch.

Attached updated patch.



r~






--
Thanks
Rajalakshmi S

>From 59b96e11dd4c649ba9dbf0435439f717b931530f Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan 
Date: Mon, 24 Oct 2016 11:36:33 +0530
Subject: [PATCH 1/2] target-ppc: add vextu[bhw]lx instructions

vextublx:  Vector Extract Unsigned Byte Left
vextuhlx:  Vector Extract Unsigned Halfword Left
vextuwlx:  Vector Extract Unsigned Word Left

Signed-off-by: Avinesh Kumar 
Signed-off-by: Rajalakshmi Srinivasaraghavan 
---
 target-ppc/helper.h |3 ++
 target-ppc/int_helper.c |   63 +++
 target-ppc/translate/vmx-impl.inc.c |   18 ++
 target-ppc/translate/vmx-ops.inc.c  |4 ++-
 4 files changed, 87 insertions(+), 1 deletions(-)

diff --git a/target-ppc/helper.h b/target-ppc/helper.h
index 04c6421..8551568 100644
--- a/target-ppc/helper.h
+++ b/target-ppc/helper.h
@@ -357,6 +357,9 @@ DEF_HELPER_3(vpmsumb, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumh, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumw, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumd, void, avr, avr, avr)
+DEF_HELPER_2(vextublx, tl, tl, avr)
+DEF_HELPER_2(vextuhlx, tl, tl, avr)
+DEF_HELPER_2(vextuwlx, tl, tl, avr)
 
 DEF_HELPER_2(vsbox, void, avr, avr)
 DEF_HELPER_3(vcipher, void, avr, avr, avr)
diff --git a/target-ppc/int_helper.c b/target-ppc/int_helper.c
index 5aee0a8..2b28848 100644
--- a/target-ppc/int_helper.c
+++ b/target-ppc/int_helper.c
@@ -1742,6 +1742,69 @@ void helper_vlogefp(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *b)
 }
 }
 
+#define EXTRACT128(value, start, length)\
+((value >> start) & (~(__uint128_t)0 >> (128 - length)))
+
+#if defined(HOST_WORDS_BIGENDIAN)
+# if defined (CONFIG_INT128)\
+#  define VEXTULX_DO(name, elem)\
+target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)  \
+{   \
+target_ulong r = 0; \
+int index = (a & 0xf) * 8;  \
+r = EXTRACT128(b->u128, index, elem * 8);   \
+return r;   \
+}
+# else 
+#  define VEXTULX_DO(name, elem)\
+target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)  \
+{

Re: [Qemu-devel] [PATCH 3/4] target-ppc: add vrldnmi and vrlwmi instructions

2016-10-24 Thread Richard Henderson


On 10/24/2016 09:08 PM, Nikunj A Dadhania wrote:

Richard Henderson  writes:


On 10/24/2016 02:14 AM, Nikunj A Dadhania wrote:

+#define EXTRACT_BITS(size)  \
+static inline uint##size##_t extract_bits_u##size(uint##size##_t reg,   \
+  uint##size##_t start, \
+  uint##size##_t end)   \
+{   \
+uint##size##_t nr_mask_bits = end - start + 1;  \
+uint##size##_t val = 1; \
+uint##size##_t mask = (val << nr_mask_bits) - 1;\
+uint##size##_t shifted_reg = reg  >> ((size - 1)  - end);   \
+return shifted_reg & mask;  \
+}
+
+EXTRACT_BITS(64);
+EXTRACT_BITS(32);


We already have extract32 and extract64, which you're (nearly) duplicating.


The bit position number notation is different, because of this using the
above routine, MSB=0 and LSB=63.

While the below assumes: MSB=63 and LSB=0

static inline uint64_t extract64(uint64_t value, int start, int length)
{
assert(start >= 0 && length > 0 && length <= 64 - start);
return (value >> start) & (~0ULL >> (64 - length));
}

Let me know if I am missing something here.


Since the arguments to extract_bits_uN are completely under your control, via 
the arguments to VRLMI, this is a non-argument.  Just change them to 
little-endian position + length.


(And, after you do that conversion for vrldmi and vrlwmi, you'll see why 
big-endian bit numbering is the spawn of the devil.  ;-)



r~

Re: [Qemu-devel] [PATCH 3/4] target-ppc: add vrldnmi and vrlwmi instructions

2016-10-24 Thread Nikunj A Dadhania

Richard Henderson  writes:

> On 10/24/2016 02:14 AM, Nikunj A Dadhania wrote:
>> +#define EXTRACT_BITS(size)  \
>> +static inline uint##size##_t extract_bits_u##size(uint##size##_t reg,   \
>> +  uint##size##_t start, \
>> +  uint##size##_t end)   \
>> +{   \
>> +uint##size##_t nr_mask_bits = end - start + 1;  \
>> +uint##size##_t val = 1; \
>> +uint##size##_t mask = (val << nr_mask_bits) - 1;\
>> +uint##size##_t shifted_reg = reg  >> ((size - 1)  - end);   \
>> +return shifted_reg & mask;  \
>> +}
>> +
>> +EXTRACT_BITS(64);
>> +EXTRACT_BITS(32);
>
> We already have extract32 and extract64, which you're (nearly) duplicating.

The bit position number notation is different, because of this using the
above routine, MSB=0 and LSB=63.

While the below assumes: MSB=63 and LSB=0

static inline uint64_t extract64(uint64_t value, int start, int length)
{
assert(start >= 0 && length > 0 && length <= 64 - start);
return (value >> start) & (~0ULL >> (64 - length));
}

Let me know if I am missing something here.

>> +#define MASK(size, max_val) \
>> +static inline uint##size##_t mask_u##size(uint##size##_t start, \
>> +uint##size##_t end) \
>> +{   \
>> +uint##size##_t ret, max_bit = size - 1; \
>> +\
>> +if (likely(start == 0)) {   \
>> +ret = max_val << (max_bit - end);   \
>> +} else if (likely(end == max_bit)) {\
>> +ret = max_val >> start; \
>> +} else {\
>> +ret = (((uint##size##_t)(-1ULL)) >> (start)) ^  \
>> +(((uint##size##_t)(-1ULL) >> (end)) >> 1);  \
>> +if (unlikely(start > end)) {\
>> +return ~ret;\
>> +}   \
>> +}   \
>
> Why the two likely cases?  Doesn't the third case cover them?
>
> Also, (uint##size##_t)(-1ULL) should be just (uint##size##_t)-1.
> Please remove all the other unnecessary parentheses too.
>
> Hmph.  I see you've copied all this silliness from translate.c, so...
> nevermind, I guess.  Let's leave this a near-exact copy.

Ok.

>> +#define LEFT_ROTATE(size)\
>> +static inline uint##size##_t left_rotate_u##size(uint##size##_t val, \
>> +  uint##size##_t shift)  \
>> +{\
>> +if (!shift) {\
>> +return val;  \
>> +}\
>> + \
>> +uint##size##_t left_val = extract_bits_u##size(val, 0, shift - 1); \
>> +uint##size##_t right_val = val & mask_u##size(shift, size - 1);\
>> + \
>> +return right_val << shift | left_val;\
>> +}
>> +
>> +LEFT_ROTATE(32);
>> +LEFT_ROTATE(64);
>
> We already have rol32 and rol64.
>
> Which I see are broken for shift == 0.  Let's please fix that, as a separate
> patch, like so:
>
>   return (word << shift) | (word >> ((32 - shift) & 31));

Sure.

Regards
Nikunj

Re: [Qemu-devel] [PATCHv5 00/12] Cleanups to qtest PCI handling

2016-10-24 Thread David Gibson

On Mon, Oct 24, 2016 at 03:59:49PM +1100, David Gibson wrote:
> This series contains a number of cleanups to the libqos code for
> accessing PCI devices, and to tests which use it.
> 
> The general aim is to improve the consistency of semantics across
> functions, and reduce the amount of intimate knowledge of the libqos
> PCI layer needed by tests.
> 
> This should make it easier to write PCI tests which will be portable
> to different guest machines with different PCI host bridge
> arrangements.
> 
> This series is on top of my ppc-for-2.8 branch, since it contains
> patches enabling the virtio tests on ppc, which would otherwise
> conflict with the changes here.

Greg, Alexey, Michael,

Some reviews from outside RH would be really welcome.

> 
> Changes since v4:
>   * Fixed some remaining abstraction breaks in ahci-test
>   * Removed QPCI_BAR_INVALID, turned out not to really be useful
> 
> Changes since v3:
>   * Fixed another endian bug introduced in ide-test
> 
> Changes since v2:
>   * Fixed build bugs in virtio-9p-test, which I didn't find earlier
> due to not having the right libraries installed
>   * Fixed an endian bug I accidentally introduced in ide-test
>   * Better handling of invalid BAR tokens
> 
> Changes since v1:
>   * Split out updates to tco-test into separate patch
>   * Split out updates to ide-test into separate patch
>   * Neater and more general handling of legacy PIO addresses
>   * Removed now-redundant fields from platform specific bus structures
>   * Introduced CONFIG_BASE() macro to virtio-pci to remove many
> similar assignments
>   * Fixed handling of two guest testcases in ivshmem
>   * Added 64-bit accessors
>   * Rebase on ppc-for-2.8 to avoid conflict with Laurent's series in
> the same area
> 
> David Gibson (12):
>   libqos: Give qvirtio_config_read*() consistent semantics
>   libqos: Handle PCI IO de-multiplexing in common code
>   libqos: Move BAR assignment to common code
>   libqos: Better handling of PCI legacy IO
>   tests: Adjust tco-test to use qpci_legacy_iomap()
>   libqos: Add streaming accessors for PCI MMIO
>   libqos: Implement mmio accessors in terms of mem{read,write}
>   tests: Clean up IO handling in ide-test
>   libqos: Add 64-bit PCI IO accessors
>   tests: Use qpci_mem{read,write} in ivshmem-test
>   tests: Don't assume structure of PCI IO base in ahci-test
>   libqos: Change PCI accessors to take opaque BAR handle
> 
>  tests/ahci-test.c  |  13 +--
>  tests/e1000e-test.c|   7 +-
>  tests/ide-test.c   | 177 +++--
>  tests/ivshmem-test.c   |  47 +++
>  tests/libqos/ahci.c|   4 +-
>  tests/libqos/ahci.h|   7 +-
>  tests/libqos/pci-pc.c  | 187 ++-
>  tests/libqos/pci-spapr.c   | 194 
> -
>  tests/libqos/pci.c | 194 
> +
>  tests/libqos/pci.h |  66 ++-
>  tests/libqos/usb.c |   6 +-
>  tests/libqos/usb.h |   2 +-
>  tests/libqos/virtio-mmio.c |  16 ++--
>  tests/libqos/virtio-pci.c  | 122 ++--
>  tests/libqos/virtio-pci.h  |   2 +-
>  tests/rtl8139-test.c   |  10 +--
>  tests/tco-test.c   |  80 +--
>  tests/usb-hcd-ehci-test.c  |   5 +-
>  tests/virtio-9p-test.c |   8 +-
>  tests/virtio-blk-test.c|  42 +++---
>  tests/virtio-scsi-test.c   |   4 +-
>  21 files changed, 598 insertions(+), 595 deletions(-)
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

Re: [Qemu-devel] [RESEND PATCH v3 kernel 0/7] Extend virtio-balloon for fast (de)inflating & fast live migration

2016-10-24 Thread Michael S. Tsirkin

On Sun, Oct 23, 2016 at 11:29:25AM +, Li, Liang Z wrote:
> > On Fri, Oct 21, 2016 at 10:25:21AM -0700, Dave Hansen wrote:
> > > On 10/20/2016 11:24 PM, Liang Li wrote:
> > > > Dave Hansen suggested a new scheme to encode the data structure,
> > > > because of additional complexity, it's not implemented in v3.
> > >
> > > So, what do you want done with this patch set?  Do you want it applied
> > > as-is so that we can introduce a new host/guest ABI that we must
> > > support until the end of time?  Then, we go back in a year or two and
> > > add the newer format that addresses the deficiencies that this ABI has
> > > with a third version?
> > >
> > 
> > Exactly my questions.
> 
> Hi Dave & Michael,
> 
> In the V2, both of you thought that the memory I allocated for the bitmap is 
> too large, and gave some
>  suggestions about the solution, so I changed the implementation and used  
> scattered pages for the bitmap
> instead of a large physical continued memory. I didn't get the comments about 
> the changes, so I am not 
> sure whether that is OK or not, that's why I resent the V3, I just want 
> your opinions about that part. 
> 
> I will implement the new schema as Dave suggested in V4. Before that, could 
> you take a look at this version and
> give some comments? 
> 
> Thanks!
> Liang

Sure, I'll try to review just that part.

Re: [Qemu-devel] Holding the BQL for emulate_ppc_hypercall

2016-10-24 Thread Nikunj A Dadhania

Alex Bennée  writes:

> Hi,
>
> In the MTTCG patch set one of the big patches is to remove the
> requirement to hold the BQL while running code:
>
>   tcg: drop global lock during TCG code execution
>
> And this broke the PPC code because emulate_ppc_hypercall can cause
> changes to the global state. This function just calls spapr_hypercall()
> and puts the results into the TCG register file. Normally
> spapr_hypercall() is called under the BQL in KVM as
> kvm_arch_handle_exit() does things with the BQL held.
>
> I blithely wrapped the call in a lock/unlock pair only to find the
> ppc64 check builds failed as the hypercall was made during the
> cc->do_interrupt() code which also holds the BQL.
>
> I'm a little confused by the nature of PPC hypercalls in TCG? Are they
> not all detectable at code generation time? What is the case that causes
> an exception to occur rather than the helper function doing the
> hypercall?
>
> I guess it comes down to can I avoid doing:
>
>   /* If we come via cc->do_interrupt BQL may already be held */
>   if (!qemu_mutex_iothread_locked()) {
>   g_mutex_lock_iothread();
>       env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
>   g_muetx_unlock_iothread();
>   } else {
>       env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
>   }
>
> Any thoughts?

Similar discussions happened on this patch:
https://lists.gnu.org/archive/html/qemu-ppc/2016-09/msg00015.html

This was just working for TCG case, need to fix for KVM. I would need to
handle KVM case to avoid a deadlock.

Regards
Nikunj

Re: [Qemu-devel] [PATCHv3 00/12] pseries: Consolidate guest device tree construction

2016-10-24 Thread Bharata B Rao

On Tue, Oct 25, 2016 at 02:23:41PM +1100, David Gibson wrote:
> On Tue, Oct 25, 2016 at 01:50:02PM +1100, David Gibson wrote:
> > On Mon, Oct 24, 2016 at 04:04:31PM +1100, David Gibson wrote:
> > > For historical reasons construction of the guest device tree in spapr
> > > is divided between spapr_create_fdt_skel() which is called at init
> > > time, and spapr_build_fdt() which runs at reset time.  Over time, more
> > > and more things have needed to be moved to reset time.
> > > 
> > > This series consolidates all the device tree construction to reset
> > > time, with some minor cleanups along the way.  This will help to make
> > > it more maintainable in future.
> > > 
> > > Changes since v2:
> > >   * Removed a leftover reference to a variable called 'stdout',
> > > shadowing the standard library stdout
> > > Changes since v1:
> > >   * Fixed a memory leak introduced by 1/12 (spotted by Thomas Huth)
> > >   * Removed one patch that's already merged in ppc-for-2.8
> > 
> > Thanks for the acks, everyone.  I've now merged this series into
> > ppc-for-2.8, tentatively.
> > 
> > Bharata Rao reported some boot failures caused by the series, but I
> > haven't been able to reproduce them so far.  Bharata, could you retest
> > with the latest version (in ppc-for-2.8) and, if the problems are
> > still present, send me some reproducer steps.
> 
> Actually, I just realised the bug Bharata reported looks identical to
> one someone else spotted.  That was caused by one of Thomas' NVRAM
> cleanups, rather than the DT cleanup.  It's already been removed from
> ppc-for-2.8 pending debug.

David - After I reported the boot failure with this patchset to you, I
figured out that the root cause was in fact the NVRAM patchset which
I later reported separately.

Regards,
Bharata.

[Qemu-devel] [PATCH] vfio: Handle zero-length sparse mmap ranges

2016-10-24 Thread Alex Williamson

As reported in the link below, user has a PCI device with a 4KB BAR
which contains the MSI-X table.  This seems to hit a corner case in
the kernel where the region reports being mmap capable, but the sparse
mmap information reports a zero sized range.  It's not entirely clear
that the kernel is incorrect in doing this, but regardless, we need
to handle it.  To do this, fill our mmap array only with non-zero
sized sparse mmap entries and add an error return from the function
so we can tell the difference between nr_mmaps being zero based on
sparse mmap info vs lack of sparse mmap info.

NB, this doesn't actually change the behavior of the device, it only
removes the scary "Failed to mmap ... Performance may be slow" error
message.  We cannot currently create an mmap over the MSI-X table.

Link: http://lists.nongnu.org/archive/html/qemu-discuss/2016-10/msg9.html
Signed-off-by: Alex Williamson 
---
 hw/vfio/common.c |   36 ++--
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index c764cb3..e928b9b 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -610,16 +610,16 @@ vfio_get_region_info_cap(struct vfio_region_info *info, 
uint16_t id)
 return NULL;
 }
 
-static void vfio_setup_region_sparse_mmaps(VFIORegion *region,
-   struct vfio_region_info *info)
+static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
+  struct vfio_region_info *info)
 {
 struct vfio_info_cap_header *hdr;
 struct vfio_region_info_cap_sparse_mmap *sparse;
-int i;
+int i, j;
 
 hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
 if (!hdr) {
-return;
+return -ENODEV;
 }
 
 sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, 
header);
@@ -627,16 +627,24 @@ static void vfio_setup_region_sparse_mmaps(VFIORegion 
*region,
 trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
  region->nr, sparse->nr_areas);
 
-region->nr_mmaps = sparse->nr_areas;
-region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
+region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);
 
-for (i = 0; i < region->nr_mmaps; i++) {
-region->mmaps[i].offset = sparse->areas[i].offset;
-region->mmaps[i].size = sparse->areas[i].size;
-trace_vfio_region_sparse_mmap_entry(i, region->mmaps[i].offset,
-region->mmaps[i].offset +
-region->mmaps[i].size);
+for (i = 0, j = 0; i < sparse->nr_areas; i++) {
+trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
+sparse->areas[i].offset +
+sparse->areas[i].size);
+
+   if (sparse->areas[i].size) {
+region->mmaps[j].offset = sparse->areas[i].offset;
+region->mmaps[j].size = sparse->areas[i].size;
+j++;
+}
 }
+
+region->nr_mmaps = j;
+region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));
+
+return 0;
 }
 
 int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
@@ -665,9 +673,9 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, 
VFIORegion *region,
 region->flags & VFIO_REGION_INFO_FLAG_MMAP &&
 !(region->size & ~qemu_real_host_page_mask)) {
 
-vfio_setup_region_sparse_mmaps(region, info);
+ret = vfio_setup_region_sparse_mmaps(region, info);
 
-if (!region->nr_mmaps) {
+if (ret) {
 region->nr_mmaps = 1;
 region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
 region->mmaps[0].offset = 0;

Re: [Qemu-devel] [PATCHv3 00/12] pseries: Consolidate guest device tree construction

2016-10-24 Thread David Gibson

On Tue, Oct 25, 2016 at 01:50:02PM +1100, David Gibson wrote:
> On Mon, Oct 24, 2016 at 04:04:31PM +1100, David Gibson wrote:
> > For historical reasons construction of the guest device tree in spapr
> > is divided between spapr_create_fdt_skel() which is called at init
> > time, and spapr_build_fdt() which runs at reset time.  Over time, more
> > and more things have needed to be moved to reset time.
> > 
> > This series consolidates all the device tree construction to reset
> > time, with some minor cleanups along the way.  This will help to make
> > it more maintainable in future.
> > 
> > Changes since v2:
> >   * Removed a leftover reference to a variable called 'stdout',
> > shadowing the standard library stdout
> > Changes since v1:
> >   * Fixed a memory leak introduced by 1/12 (spotted by Thomas Huth)
> >   * Removed one patch that's already merged in ppc-for-2.8
> 
> Thanks for the acks, everyone.  I've now merged this series into
> ppc-for-2.8, tentatively.
> 
> Bharata Rao reported some boot failures caused by the series, but I
> haven't been able to reproduce them so far.  Bharata, could you retest
> with the latest version (in ppc-for-2.8) and, if the problems are
> still present, send me some reproducer steps.

Actually, I just realised the bug Bharata reported looks identical to
one someone else spotted.  That was caused by one of Thomas' NVRAM
cleanups, rather than the DT cleanup.  It's already been removed from
ppc-for-2.8 pending debug.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

[Qemu-devel] [PATCH 4/4] block/curl: Do not wait for data beyond EOF

2016-10-24 Thread Max Reitz

libcurl will only give us as much data as there is, not more. The block
layer will deny requests beyond the end of file for us; but since this
block driver is still using a sector-based interface, we can still get
in trouble if the file size is not a multiple of 512.

While we have already made sure not to attempt transfers beyond the end
of the file, we are currently still trying to receive data from there if
the original request exceeds the file size. This patch fixes this issue
and invokes qemu_iovec_memset() on the iovec's tail.

Cc: qemu-sta...@nongnu.org
Signed-off-by: Max Reitz 
---
 block/curl.c | 32 +++-
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index 4fbba5c..2cb875a 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -253,8 +253,17 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t 
nmemb, void *opaque)
 continue;
 
 if ((s->buf_off >= acb->end)) {
+size_t request_length = acb->nb_sectors * BDRV_SECTOR_SIZE;
+
 qemu_iovec_from_buf(acb->qiov, 0, s->orig_buf + acb->start,
 acb->end - acb->start);
+
+if (acb->end - acb->start < request_length) {
+size_t offset = acb->end - acb->start;
+qemu_iovec_memset(acb->qiov, offset, 0,
+  request_length - offset);
+}
+
 acb->common.cb(acb->common.opaque, 0);
 qemu_aio_unref(acb);
 s->acb[i] = NULL;
@@ -271,6 +280,8 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, 
size_t len,
 {
 int i;
 size_t end = start + len;
+size_t clamped_end = MIN(end, s->len);
+size_t clamped_len = clamped_end - start;
 
 for (i=0; i<CURL_NUM_STATES; i++) {
 CURLState *state = &s->states[i];
@@ -285,12 +296,15 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, 
size_t len,
 // Does the existing buffer cover our section?
 if ((start >= state->buf_start) &&
 (start <= buf_end) &&
-(end >= state->buf_start) &&
-(end <= buf_end))
+(clamped_end >= state->buf_start) &&
+(clamped_end <= buf_end))
 {
 char *buf = state->orig_buf + (start - state->buf_start);
 
-qemu_iovec_from_buf(acb->qiov, 0, buf, len);
+qemu_iovec_from_buf(acb->qiov, 0, buf, clamped_len);
+if (clamped_len < len) {
+qemu_iovec_memset(acb->qiov, clamped_len, 0, len - 
clamped_len);
+}
 acb->common.cb(acb->common.opaque, 0);
 
 return FIND_RET_OK;
@@ -300,13 +314,13 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, 
size_t len,
 if (state->in_use &&
 (start >= state->buf_start) &&
 (start <= buf_fend) &&
-(end >= state->buf_start) &&
-(end <= buf_fend))
+(clamped_end >= state->buf_start) &&
+(clamped_end <= buf_fend))
 {
 int j;
 
 acb->start = start - state->buf_start;
-acb->end = acb->start + len;
+acb->end = acb->start + clamped_len;
 
 for (j=0; j<CURL_NUM_ACB; j++) {
 if (!state->acb[j]) {
@@ -799,13 +813,13 @@ static void curl_readv_bh_cb(void *p)
 }
 
 acb->start = 0;
-acb->end = (acb->nb_sectors * BDRV_SECTOR_SIZE);
+acb->end = MIN(acb->nb_sectors * BDRV_SECTOR_SIZE, s->len - start);
 
 state->buf_off = 0;
 g_free(state->orig_buf);
 state->buf_start = start;
-state->buf_len = acb->end + s->readahead_size;
-end = MIN(start + state->buf_len, s->len) - 1;
+state->buf_len = MIN(acb->end + s->readahead_size, s->len - start);
+end = start + state->buf_len - 1;
 state->orig_buf = g_try_malloc(state->buf_len);
 if (state->buf_len && state->orig_buf == NULL) {
 curl_clean_state(state);
-- 
2.10.1

Re: [Qemu-devel] Holding the BQL for emulate_ppc_hypercall

2016-10-24 Thread David Gibson

On Mon, Oct 24, 2016 at 03:44:01PM +0100, Alex Bennée wrote:
> 
> Alex Bennée  writes:
> 
> > Hi,
> >
> > In the MTTCG patch set one of the big patches is to remove the
> > requirement to hold the BQL while running code:
> >
> >   tcg: drop global lock during TCG code execution
> >
> > And this broke the PPC code because emulate_ppc_hypercall can cause
> > changes to the global state. This function just calls spapr_hypercall()
> > and puts the results into the TCG register file. Normally
> > spapr_hypercall() is called under the BQL in KVM as
> > kvm_arch_handle_exit() does things with the BQL held.
> >
> > I blithely wrapped the call in a lock/unlock pair only to find the
> > ppc64 check builds failed as the hypercall was made during the
> > cc->do_interrupt() code which also holds the BQL.
> >
> > I'm a little confused by the nature of PPC hypercalls in TCG? Are they
> > not all detectable at code generation time? What is the case that causes
> > an exception to occur rather than the helper function doing the
> > hypercall?
> >
> > I guess it comes down to can I avoid doing:
> >
> >   /* If we come via cc->do_interrupt BQL may already be held */
> >   if (!qemu_mutex_iothread_locked()) {
> >   g_mutex_lock_iothread();
> >       env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
> >   g_muetx_unlock_iothread();
> >   } else {
> >       env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
> >   }
> 
> Of course I mean:
> 
>   /* If we come via cc->do_interrupt BQL may already be held */
>   if (!qemu_mutex_iothread_locked()) {
>   qemu_mutex_lock_iothread();
>       env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
>   qemu_mutex_unlock_iothread();
>   } else {
>       env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
>   }
> 
> > Any thoughts?

So, I understand why the hypercall is being called from exception code
and therefore with the BQL held.  On Power, the hypercall instruction
is the same as the guest-level system call instruction, just with a
flag bit set.  System calls are, of course, treated as exceptions,
because they change the CPU's privilege mode.  Likewise if we were
implementing a full host system (like the upcoming 'powernv' machine
type) we'd need to treat hypercalls as exceptions for the same reason.

We could detect hypercalls at translation time, but at present we
don't: we go into the exception path, then detect that it's a "level
1" (i.e. hypervisor) sc instruction and branch off to the hypercall
emulation code if that's been set up.  It just seemed the simplest
approach at the time.

What I *don't* understand is how the hypercall code is ever being
invoked *without* the BQL.  I grepped through and the only entry paths
I can see are the one in the exception handling and KVM.

Could you try to get a backtrace from the case where we're entering
the hypercall without the BQL?

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson

signature.asc
Description: PGP signature

[Qemu-devel] [PATCH 0/4] block/curl: Fix FTP

2016-10-24 Thread Max Reitz

At least for me, the FTP support of our curl block driver currently
doesn't work at all. This is due to (at least) three issues, for each of
which this series provides a patch (and the first patch is just a minor
clean-up).

1. When establishing an FTP connection, libcurl hands us some data we do
   not expect because we have not really been asking for it. Not an
   issue in theory, because we can just ignore it. Unfortunately, qemu
   has decided to be more direct about the issue and tell libcurl that
   we did not process any of that data. libcurl doesn't like that. At
   all. Therefore, it returns the favor and just cancels the connection.
   In effect, it is impossible to even open a connection to an FTP
   server (at least in my test environment).
   Patch 2 fixes this by just playing along nicely.

2. libcurl has an old function called curl_multi_socket_all(). It allows
   you to kick off action on all of the sockets there are.
   Unfortunately, it is deprecated. Therefore, our code decides to be
   good and use the non-deprecated curl_multi_socket_action() function.
   However, that one only works on a single socket and wants to know
   which. So our code remembers the socket of the current connection.
   Works great for HTTP which generally only uses one socket.
   Unfortunately, FTP generally uses at least two, one for the control
   and one for the data stream. So us remembering only one of those two
   results in qemu only being able to receive the first 16 kB of any
   request (and maybe even of any connection).
   Patch 3 fixes this by putting the sockets into a list and thus being
   able to remember more than one.

3. The first two patches make curl work on files with file sizes that
   are multiples of 512, but not so well with others. curl still uses
   the sector-based interface, so it may receive requests beyond the
   EOF into the partial last sector. While it will actually not pass a
   request beyond the EOF to libcurl, it will unfortunately still wait
   to receive data from there. Which of course will not happen. So every
   request into that last sector makes the whole BDS hang indefinitely.
   Patch 4 fixes this by letting go of the futile hope for data from
   where there is none.


Max Reitz (4):
  block/curl: Use BDRV_SECTOR_SIZE
  block/curl: Fix return value from curl_read_cb
  block/curl: Remember all sockets
  block/curl: Do not wait for data beyond EOF

 block/curl.c | 99 +---
 1 file changed, 75 insertions(+), 24 deletions(-)

-- 
2.10.1

[Qemu-devel] [PATCH 3/4] block/curl: Remember all sockets

2016-10-24 Thread Max Reitz

For some connection types (like FTP, generally), more than one socket
may be used (in FTP's case: control vs. data stream). As of commit
838ef602498b8d1985a231a06f5e328e2946a81d ("curl: Eliminate unnecessary
use of curl_multi_socket_all"), we have to remember all of the sockets
used by libcurl, but in fact we only did that for a single one. Since
one libcurl connection may use multiple sockets, however, we have to
remember them all.

Cc: qemu-sta...@nongnu.org
Signed-off-by: Max Reitz 
---
 block/curl.c | 47 +--
 1 file changed, 41 insertions(+), 6 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index 095ffda..4fbba5c 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -104,12 +104,17 @@ typedef struct CURLAIOCB {
 size_t end;
 } CURLAIOCB;
 
+typedef struct CURLSocket {
+int fd;
+QLIST_ENTRY(CURLSocket) next;
+} CURLSocket;
+
 typedef struct CURLState
 {
 struct BDRVCURLState *s;
 CURLAIOCB *acb[CURL_NUM_ACB];
 CURL *curl;
-curl_socket_t sock_fd;
+QLIST_HEAD(, CURLSocket) sockets;
 char *orig_buf;
 size_t buf_start;
 size_t buf_off;
@@ -163,10 +168,27 @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int 
action,
 {
 BDRVCURLState *s;
 CURLState *state = NULL;
+CURLSocket *socket;
+
 curl_easy_getinfo(curl, CURLINFO_PRIVATE, (char **)&state);
-state->sock_fd = fd;
 s = state->s;
 
+QLIST_FOREACH(socket, &state->sockets, next) {
+if (socket->fd == fd) {
+if (action == CURL_POLL_REMOVE) {
+QLIST_REMOVE(socket, next);
+g_free(socket);
+}
+break;
+}
+}
+if (!socket) {
+socket = g_new0(CURLSocket, 1);
+socket->fd = fd;
+QLIST_INSERT_HEAD(&state->sockets, socket, next);
+}
+socket = NULL;
+
 DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, (int)fd);
 switch (action) {
 case CURL_POLL_IN:
@@ -354,6 +376,7 @@ static void curl_multi_check_completion(BDRVCURLState *s)
 static void curl_multi_do(void *arg)
 {
 CURLState *s = (CURLState *)arg;
+CURLSocket *socket, *next_socket;
 int running;
 int r;
 
@@ -361,10 +384,13 @@ static void curl_multi_do(void *arg)
 return;
 }
 
-do {
-r = curl_multi_socket_action(s->s->multi, s->sock_fd, 0, &running);
-} while(r == CURLM_CALL_MULTI_PERFORM);
-
+/* Need to use _SAFE because curl_multi_socket_action() may trigger
+ * curl_sock_cb() which might modify this list */
+QLIST_FOREACH_SAFE(socket, &s->sockets, next, next_socket) {
+do {
+r = curl_multi_socket_action(s->s->multi, socket->fd, 0, &running);
+} while (r == CURLM_CALL_MULTI_PERFORM);
+}
 }
 
 static void curl_multi_read(void *arg)
@@ -468,6 +494,7 @@ static CURLState *curl_init_state(BlockDriverState *bs, 
BDRVCURLState *s)
 #endif
 }
 
+QLIST_INIT(&state->sockets);
 state->s = s;
 
 return state;
@@ -477,6 +504,14 @@ static void curl_clean_state(CURLState *s)
 {
 if (s->s->multi)
 curl_multi_remove_handle(s->s->multi, s->curl);
+
+while (!QLIST_EMPTY(&s->sockets)) {
+CURLSocket *socket = QLIST_FIRST(&s->sockets);
+
+QLIST_REMOVE(socket, next);
+g_free(socket);
+}
+
 s->in_use = 0;
 }
 
-- 
2.10.1

Re: [Qemu-devel] [PATCH v5 07/17] ppc/pnv: add XSCOM infrastructure

2016-10-24 Thread David Gibson

On Sat, Oct 22, 2016 at 11:46:40AM +0200, Cédric Le Goater wrote:
> On a real POWER8 system, the Pervasive Interconnect Bus (PIB) serves
> as a backbone to connect different units of the system. The host
> firmware connects to the PIB through a bridge unit, the
> Alter-Display-Unit (ADU), which gives it access to all the chiplets
> on the PCB network (Pervasive Connect Bus), the PIB acting as the root
> of this network.
> 
> XSCOM (serial communication) is the interface to the sideband bus
> provided by the POWER8 pervasive unit to read and write to chiplets
> resources. This is needed by the host firmware, OPAL and to a lesser
> extent, Linux. This is among others how the PCI Host bridges get
> configured at boot or how the LPC bus is accessed.
> 
> To represent the ADU of a real system, we introduce a specific
> AddressSpace to dispatch XSCOM accesses to the targeted chiplets. The
> translation of an XSCOM address into a PCB register address is
> slightly different between the P9 and the P8. This is handled before
> the dispatch using an 8-byte alignment for all.
> 
> To customize the device tree, a QOM InterfaceClass, PnvXScomInterface,
> is provided with a populate() handler. The chip populates the device
> tree by simply looping on its children. Therefore, each model needing
> custom nodes should not forget to declare itself as a child at
> instantiation time.
> 
> Based on previous work done by :
>   Benjamin Herrenschmidt 
> 
> Signed-off-by: Cédric Le Goater 

Looks like xscom_complete() is still using current_cpu, which I've
mentioned before.  Apart from that;

Reviewed-by: David Gibson 

> ---
> 
>  Changes since v4:
> 
>  - added helpers to initialize and map the chiplet XSCOM regions
>in the XSCOM address space. This is to hide the '<< 3' shift.
> 
>  Changes since v3:
> 
>  - reworked the model to dispatch addresses to the memory regions
>using pcb_addr << 3, which is a no-op for the P9. The benefit is
>that all the address translation work can be done before dispatch
>and the conversion handlers in the chip and in the xscom interface
>are gone.
>
>  - removed the proxy PnnXscom object and extended the PnvChip object
>with an address space for XSCOM and its associated memory region.
>
>  - changed the read/write handlers in the address space to use
>address_space_stq() and address_space_ldq()
>
>  - introduced 'fake' default read/write handlers to handle 'core'
>registers. We can add a real device model when more work needs to
>be done under these.
>
>  - fixed an issue with the monitor doing read/write in the XSCOM
>address space. When under the monitor, we don't have a cpu to
>update the HMER SPR. That might need more work in the long term.
>
>  - introduced a xscom base field to hold the xscom base address as
>it is different on P9
> 
>  - renamed the devnode() handler to populate()
> 
>  Changes since v2:
> 
>  - QOMified the model.
>  
>  - all mappings in main memory space are now gathered in
>pnv_chip_realize() as done on other architectures.
>
>  - removed XScomBus. The parenthood is established through the QOM
>model
>
>  - replaced the XScomDevice with an InterfaceClass : PnvXScomInterface. 
>  - introduced an XSCOM address space to dispatch accesses to the
>chiplets
> 
>  hw/ppc/Makefile.objs   |   2 +-
>  hw/ppc/pnv.c   |  25 
>  hw/ppc/pnv_xscom.c | 277 
> +
>  include/hw/ppc/pnv.h   |  15 +++
>  include/hw/ppc/pnv_xscom.h |  56 +
>  5 files changed, 374 insertions(+), 1 deletion(-)
>  create mode 100644 hw/ppc/pnv_xscom.c
>  create mode 100644 include/hw/ppc/pnv_xscom.h
> 
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index f8c7d1db9ade..08c213c40684 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -6,7 +6,7 @@ obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o 
> spapr_rtas.o
>  obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
>  obj-$(CONFIG_PSERIES) += spapr_cpu_core.o
>  # IBM PowerNV
> -obj-$(CONFIG_POWERNV) += pnv.o pnv_core.o
> +obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o
>  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
>  obj-y += spapr_pci_vfio.o
>  endif
> diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
> index 3413107697d3..96ba36cc272d 100644
> --- a/hw/ppc/pnv.c
> +++ b/hw/ppc/pnv.c
> @@ -33,6 +33,8 @@
>  #include "qemu/cutils.h"
>  #include "qapi/visitor.h"
>  
> +#include "hw/ppc/pnv_xscom.h"
> +
>  #include <libfdt.h>
>  
>  #define FDT_MAX_SIZE0x0010
> @@ -219,6 +221,8 @@ static void powernv_populate_chip(PnvChip *chip, void 
> *fdt)
>  size_t typesize = object_type_get_instance_size(typename);
>  int i;
>  
> +pnv_xscom_populate(chip, fdt, 0);
> +
>  for (i = 0; i < chip->nr_cores; i++) {
>  PnvCore *pnv_core =

[Qemu-devel] [PATCH 1/4] block/curl: Use BDRV_SECTOR_SIZE

2016-10-24 Thread Max Reitz

Currently, curl defines its own constant SECTOR_SIZE. There is no
advantage over using the global BDRV_SECTOR_SIZE, so drop it.

Cc: qemu-sta...@nongnu.org
Signed-off-by: Max Reitz 
---
 block/curl.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index e5eaa7b..12afa15 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -73,7 +73,6 @@ static CURLMcode __curl_multi_socket_action(CURLM 
*multi_handle,
 
 #define CURL_NUM_STATES 8
 #define CURL_NUM_ACB8
-#define SECTOR_SIZE 512
 #define READ_AHEAD_DEFAULT (256 * 1024)
 #define CURL_TIMEOUT_DEFAULT 5
 #define CURL_TIMEOUT_MAX 1
@@ -738,12 +737,12 @@ static void curl_readv_bh_cb(void *p)
 CURLAIOCB *acb = p;
 BDRVCURLState *s = acb->common.bs->opaque;
 
-size_t start = acb->sector_num * SECTOR_SIZE;
+size_t start = acb->sector_num * BDRV_SECTOR_SIZE;
 size_t end;
 
 // In case we have the requested data already (e.g. read-ahead),
 // we can just call the callback and be done.
-switch (curl_find_buf(s, start, acb->nb_sectors * SECTOR_SIZE, acb)) {
+switch (curl_find_buf(s, start, acb->nb_sectors * BDRV_SECTOR_SIZE, acb)) {
 case FIND_RET_OK:
 qemu_aio_unref(acb);
 // fall through
@@ -762,7 +761,7 @@ static void curl_readv_bh_cb(void *p)
 }
 
 acb->start = 0;
-acb->end = (acb->nb_sectors * SECTOR_SIZE);
+acb->end = (acb->nb_sectors * BDRV_SECTOR_SIZE);
 
 state->buf_off = 0;
 g_free(state->orig_buf);
@@ -779,8 +778,8 @@ static void curl_readv_bh_cb(void *p)
 state->acb[0] = acb;
 
 snprintf(state->range, 127, "%zd-%zd", start, end);
-DPRINTF("CURL (AIO): Reading %d at %zd (%s)\n",
-(acb->nb_sectors * SECTOR_SIZE), start, state->range);
+DPRINTF("CURL (AIO): Reading %llu at %zd (%s)\n",
+(acb->nb_sectors * BDRV_SECTOR_SIZE), start, state->range);
 curl_easy_setopt(state->curl, CURLOPT_RANGE, state->range);
 
 curl_multi_add_handle(s->multi, state->curl);
-- 
2.10.1

Re: [Qemu-devel] [PATCHv3 00/12] pseries: Consolidate guest device tree construction

2016-10-24 Thread David Gibson

On Mon, Oct 24, 2016 at 04:04:31PM +1100, David Gibson wrote:
> For historical reasons construction of the guest device tree in spapr
> is divided between spapr_create_fdt_skel() which is called at init
> time, and spapr_build_fdt() which runs at reset time.  Over time, more
> and more things have needed to be moved to reset time.
> 
> This series consolidates all the device tree construction to reset
> time, with some minor cleanups along the way.  This will help to make
> it more maintainable in future.
> 
> Changes since v2:
>   * Removed a leftover reference to a variable called 'stdout',
> shadowing the standard library stdout
> Changes since v1:
>   * Fixed a memory leak introduced by 1/12 (spotted by Thomas Huth)
>   * Removed one patch that's already merged in ppc-for-2.8

Thanks for the acks, everyone.  I've now merged this series into
ppc-for-2.8, tentatively.

Bharata Rao reported some boot failures caused by the series, but I
haven't been able to reproduce them so far.  Bharata, could you retest
with the latest version (in ppc-for-2.8) and, if the problems are
still present, send me some reproducer steps.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

[Qemu-devel] [PATCH 2/4] block/curl: Fix return value from curl_read_cb

2016-10-24 Thread Max Reitz

While commit 38bbc0a580f9f10570b1d1b5d3e92f0e6feb2970 is correct in that
the callback is supposed to return the number of bytes handled, what it
does not mention is that libcurl will throw an error if the callback did
not "handle" all of the data passed to it.

Therefore, if the callback receives some data that it cannot handle
(either because the receive buffer has not been set up yet or because it
would not fit into the receive buffer) and we have to ignore it, we
still have to report that the data has been handled.

Obviously, this should not happen normally. But it does happen at least
for FTP connections where some data (that we do not expect) may be
generated when the connection is established.

Cc: qemu-sta...@nongnu.org
Signed-off-by: Max Reitz 
---
 block/curl.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index 12afa15..095ffda 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -212,12 +212,13 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t 
nmemb, void *opaque)
 
 DPRINTF("CURL: Just reading %zd bytes\n", realsize);
 
-if (!s || !s->orig_buf)
-return 0;
+if (!s || !s->orig_buf) {
+goto read_end;
+}
 
 if (s->buf_off >= s->buf_len) {
 /* buffer full, read nothing */
-return 0;
+goto read_end;
 }
 realsize = MIN(realsize, s->buf_len - s->buf_off);
 memcpy(s->orig_buf + s->buf_off, ptr, realsize);
@@ -238,7 +239,9 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t 
nmemb, void *opaque)
 }
 }
 
-return realsize;
+read_end:
+/* curl will error out if we do not return this value */
+return size * nmemb;
 }
 
 static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len,
-- 
2.10.1

Re: [Qemu-devel] [PATCH 2/4] target-ppc: add vmul10[u, eu, cu, ecu]q instructions

2016-10-24 Thread David Gibson

On Mon, Oct 24, 2016 at 09:04:13AM -0700, Richard Henderson wrote:
> On 10/24/2016 02:14 AM, Nikunj A Dadhania wrote:
> > From: Vasant Hegde 
> > 
> > vmul10uq  : Vector Multiply-by-10 Unsigned Quadword VX-form
> > vmul10euq : Vector Multiply-by-10 Extended Unsigned Quadword VX-form
> > vmul10cuq : Vector Multiply-by-10 & write Carry Unsigned Quadword VX-form
> > vmul10ecuq: Vector Multiply-by-10 Extended & write Carry Unsigned Quadword 
> > VX-form
> > 
> > Signed-off-by: Vasant Hegde 
> > [ Add GEN_VXFORM_DUAL_EXT with invalid bit mask ]
> > Signed-off-by: Nikunj A Dadhania 
> > ---
> >  target-ppc/translate/vmx-impl.inc.c | 72 
> > +
> >  target-ppc/translate/vmx-ops.inc.c  |  8 ++---
> >  2 files changed, 76 insertions(+), 4 deletions(-)
> 
> Reviewed-by: Richard Henderson 

Applied to ppc-for-2.8.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

Re: [Qemu-devel] [PATCHv2 01/12] pseries: Split device tree construction from device tree load

2016-10-24 Thread David Gibson

On Mon, Oct 24, 2016 at 04:17:05PM +1100, Alexey Kardashevskiy wrote:
> On 21/10/16 13:56, David Gibson wrote:
> > spapr_finalize_fdt() both finishes building the device tree for the guest
> > and loads it into guest memory.  For future cleanups, it's going to be
> > more convenient to do these two things separately.  The loading portion is
> > pretty trivial, so we move it inline into the caller, ppc_spapr_reset().
> > 
> > We also rename spapr_finalize_fdt(), because the current name is going to
> > become inaccurate.
> > 
> > Signed-off-by: David Gibson 
> 
> 
> 
> Reviewed-by: Alexey Kardashevskiy 
> 
> with a small nit, grep finds "spapr_finalize_fdt" in a comment:
> 
> hw/ppc/spapr_cpu_core.c:187: * coldplugged CPUs DT entries are setup in
> spapr_finalize_fdt().

Thanks, will fix.

> 
> 
> 
> > ---
> >  hw/ppc/spapr.c | 42 +++---
> >  1 file changed, 23 insertions(+), 19 deletions(-)
> > 
> > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > index ddb7438..0864411 100644
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -900,10 +900,9 @@ int spapr_h_cas_compose_response(sPAPRMachineState 
> > *spapr,
> >  return 0;
> >  }
> >  
> > -static void spapr_finalize_fdt(sPAPRMachineState *spapr,
> > -   hwaddr fdt_addr,
> > -   hwaddr rtas_addr,
> > -   hwaddr rtas_size)
> > +static void *spapr_build_fdt(sPAPRMachineState *spapr,
> > + hwaddr rtas_addr,
> > + hwaddr rtas_size)
> >  {
> >  MachineState *machine = MACHINE(qdev_get_machine());
> >  MachineClass *mc = MACHINE_GET_CLASS(machine);
> > @@ -999,19 +998,8 @@ static void spapr_finalize_fdt(sPAPRMachineState 
> > *spapr,
> >  }
> >  }
> >  
> > -_FDT((fdt_pack(fdt)));
> > -
> > -if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
> > -error_report("FDT too big ! 0x%x bytes (max is 0x%x)",
> > - fdt_totalsize(fdt), FDT_MAX_SIZE);
> > -exit(1);
> > -}
> > -
> > -qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
> > -cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
> > -
> >  g_free(bootlist);
> > -g_free(fdt);
> > +return fdt;
> >  }
> >  
> >  static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
> > @@ -1147,6 +1135,8 @@ static void ppc_spapr_reset(void)
> >  sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
> >  PowerPCCPU *first_ppc_cpu;
> >  uint32_t rtas_limit;
> > +void *fdt;
> > +int rc;
> >  
> >  /* Check for unknown sysbus devices */
> >  foreach_dynamic_sysbus_device(find_unknown_sysbus_device, NULL);
> > @@ -1173,14 +1163,28 @@ static void ppc_spapr_reset(void)
> >  spapr->rtas_addr = rtas_limit - RTAS_MAX_SIZE;
> >  spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;
> >  
> > -/* Load the fdt */
> > -spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
> > -   spapr->rtas_size);
> > +fdt = spapr_build_fdt(spapr, spapr->rtas_addr, spapr->rtas_size);
> >  
> >  /* Copy RTAS over */
> >  cpu_physical_memory_write(spapr->rtas_addr, spapr->rtas_blob,
> >spapr->rtas_size);
> >  
> > +rc = fdt_pack(fdt);
> > +
> > +/* Should only fail if we've built a corrupted tree */
> > +assert(rc == 0);
> > +
> > +if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
> > +error_report("FDT too big ! 0x%x bytes (max is 0x%x)",
> > + fdt_totalsize(fdt), FDT_MAX_SIZE);
> > +exit(1);
> > +}
> > +
> > +/* Load the fdt */
> > +qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
> > +cpu_physical_memory_write(spapr->fdt_addr, fdt, fdt_totalsize(fdt));
> > +g_free(fdt);
> > +
> >  /* Set up the entry state */
> >  first_ppc_cpu = POWERPC_CPU(first_cpu);
> >  first_ppc_cpu->env.gpr[3] = spapr->fdt_addr;
> > 
> 
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

Re: [Qemu-devel] [PATCH v2] ppc: allow certain HV interrupts to be delivered to guests

2016-10-24 Thread David Gibson

On Mon, Oct 24, 2016 at 05:56:22PM +1100, Nicholas Piggin wrote:
> On Mon, 24 Oct 2016 12:16:19 +1100
> David Gibson  wrote:
> 
> > On Fri, Oct 21, 2016 at 03:35:43PM +1100, Nicholas Piggin wrote:
> > > On Fri, 21 Oct 2016 12:09:54 +1100
> > > David Gibson  wrote:
> > >   
> > > > On Fri, Oct 21, 2016 at 12:40:58AM +1100, Nicholas Piggin wrote:  
> > > > > On Thu, 20 Oct 2016 15:08:07 +0200
> > > > > Cédric Le Goater  wrote:
> > > > > 
> > > > > > On 10/20/2016 08:59 AM, Nicholas Piggin wrote:
> > > > > > > Signed-off-by: Nicholas Piggin 
> > > > > > > ---
> > > > > > >  target-ppc/excp_helper.c | 8 ++--
> > > > > > >  1 file changed, 6 insertions(+), 2 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/target-ppc/excp_helper.c b/target-ppc/excp_helper.c
> > > > > > > index 53c4075..477af10 100644
> > > > > > > --- a/target-ppc/excp_helper.c
> > > > > > > +++ b/target-ppc/excp_helper.c
> > > > > > > @@ -390,9 +390,13 @@ static inline void powerpc_excp(PowerPCCPU 
> > > > > > > *cpu, int excp_model, int excp)
> > > > > > >  /* indicate that we resumed from power save mode */
> > > > > > >  msr |= 0x1;
> > > > > > >  new_msr |= ((target_ulong)1 << MSR_ME);
> > > > > > > +new_msr |= (target_ulong)MSR_HVB;
> > > > > > > +} else {
> > > > > > > + /* The ISA specifies the HV bit is set when the hardware 
> > > > > > > interrupt
> > > > > > > +  * is raised, however when hypervisors deliver the 
> > > > > > > exception to
> > > > > > > +  * guests, it should not be set.
> > > > > > > +  */
> > > > > > >  }
> > > > > > > -
> > > > > > > -new_msr |= (target_ulong)MSR_HVB;
> > > > > > >  ail = 0;
> > > > > > >  break;
> > > > > > >  case POWERPC_EXCP_DSEG:  /* Data segment exception   
> > > > > > > */
> > > > > > >   
> > > > > > 
> > > > > > should not that be cleared later on in powerpc_excp() by :
> > > > > > 
> > > > > > env->msr = new_msr & env->msr_mask;
> > > > > > 
> > > > > > ? but the routine is rather long so I might be missing a branch.
> > > > > 
> > > > > No you're right, so it can't leak into the guest, phew!
> > > > > 
> > > > > The problem I get is the interrupt code doing some things differently
> > > > > > depending on the HV bit. For example what I noticed is the guest
> > > > > losing its LE bit upon entry.
> > > > > 
> > > > > Perhaps a cleaner way is for the system reset case to set new_msr
> > > > > according to the ISA, and then apply the msr_mask (or at least mask
> > > > > out HV) before calculating the exception model? Any preference?
> > > > 
> > > > I think the proposed revision makes sense.
> > > >   
> > > 
> > > What do you think of this version? This fixes up machine check guest
> > > delivery as well. I'm sending this ahead of the new hcall patch, because
> > > it's a bugfix for existing code. I'll get around to the hcall again next
> > > week.
> > > 
> > > Thanks,
> > > Nick
> > > 
> > > 
> > > ppc hypervisors have delivered system reset and machine check exception
> > > interrupts to guests in some situations (e.g., see FWNMI feature of 
> > > LoPAPR,
> > > or NMI injection in QEMU).
> > > 
> > > These exceptions are architected to set the HV bit in hardware, however
> > > when injected into a guest, the HV bit should be cleared. Current code
> > > masks off the HV bit before setting the new MSR, however this happens 
> > > after
> > > the interrupt delivery model has calculated delivery mode for the 
> > > exception.
> > > This can result in the guest's MSR LE bit being lost.
> > > 
> > > Provide a new flag for HV exceptions to allow delivery to guests. The
> > > exception model masks out the HV bit.
> > > 
> > > Also add another sanity check to ensure other such exceptions don't try
> > > to set HV in guest without setting guest_hv_excp
> > > 
> > > Signed-off-by: Nicholas Piggin 
> > > ---
> > >  target-ppc/excp_helper.c | 25 ++---
> > >  1 file changed, 22 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/target-ppc/excp_helper.c b/target-ppc/excp_helper.c
> > > index 53c4075..1b18433 100644
> > > --- a/target-ppc/excp_helper.c
> > > +++ b/target-ppc/excp_helper.c
> > > @@ -77,7 +77,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
> > > excp_model, int excp)
> > >  CPUState *cs = CPU(cpu);
> > >  CPUPPCState *env = &cpu->env;
> > >  target_ulong msr, new_msr, vector;
> > > -int srr0, srr1, asrr0, asrr1, lev, ail;
> > > +int srr0, srr1, asrr0, asrr1, lev, ail, guest_hv_excp;  
> > 
> > So, to clarify my understanding of this.
> > 
> > The guest_hv_excp flag indicates that this is a normally-HV exception
> > which *could* be delivered to a guest with HV clear, *not* that we're
> > actually doing so in this instance.  Yes?
> 
> Correct.

Ok.  Hmm.  Could

Re: [Qemu-devel] [PATCH v5 08/17] ppc/pnv: add XSCOM handlers to PnvCore

2016-10-24 Thread David Gibson

On Sat, Oct 22, 2016 at 11:46:41AM +0200, Cédric Le Goater wrote:
> Now that we are using real HW ids for the cores in PowerNV chips, we
> can route the XSCOM accesses to them. We just need to attach a
> specific XSCOM memory region to each core in the appropriate window
> for the core number.
> 
> To start with, let's install the DTS (Digital Thermal Sensor) handlers
> which should return 38°C for each core.
> 
> Signed-off-by: Cédric Le Goater 

Reviewed-by: David Gibson 

> ---
> 
>  Changes since v4:
> 
>  - used the helpers for the XSCOM region 
> 
>  Changes since v3:
> 
>  - moved to new XSCOM model
>  - kept the write op on the XSCOM memory region for later use
> 
>  Changes since v2:
> 
>  - added a XSCOM memory region to handle access to the EX core
>registers   
>  - extended the PnvCore object with a XSCOM_INTERFACE so that we can
>use pnv_xscom_pcba() and pnv_xscom_addr() to handle XSCOM address
>translation.
> 
>  hw/ppc/pnv.c   |  4 
>  hw/ppc/pnv_core.c  | 50 
> ++
>  include/hw/ppc/pnv_core.h  |  2 ++
>  include/hw/ppc/pnv_xscom.h | 19 ++
>  4 files changed, 75 insertions(+)
> 
> diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
> index 96ba36cc272d..df55a89cb951 100644
> --- a/hw/ppc/pnv.c
> +++ b/hw/ppc/pnv.c
> @@ -625,6 +625,10 @@ static void pnv_chip_realize(DeviceState *dev, Error 
> **errp)
>  object_property_set_bool(OBJECT(pnv_core), true, "realized",
>   &error_fatal);
>  object_unref(OBJECT(pnv_core));
> +
> +/* Each core has an XSCOM MMIO region */
> +pnv_xscom_add_subregion(chip, PNV_XSCOM_EX_CORE_BASE(core_hwid),
> +&PNV_CORE(pnv_core)->xscom_regs);
>  i++;
>  }
>  g_free(typename);
> diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
> index 04713caa3b24..2acda9637db5 100644
> --- a/hw/ppc/pnv_core.c
> +++ b/hw/ppc/pnv_core.c
> @@ -19,6 +19,7 @@
>  #include "qemu/osdep.h"
>  #include "sysemu/sysemu.h"
>  #include "qapi/error.h"
> +#include "qemu/log.h"
>  #include "target-ppc/cpu.h"
>  #include "hw/ppc/ppc.h"
>  #include "hw/ppc/pnv.h"
> @@ -63,6 +64,51 @@ static void powernv_cpu_init(PowerPCCPU *cpu, Error **errp)
>  qemu_register_reset(powernv_cpu_reset, cpu);
>  }
>  
> +/*
> + * These values are read by the PowerNV HW monitors under Linux
> + */
> +#define PNV_XSCOM_EX_DTS_RESULT0 0x5
> +#define PNV_XSCOM_EX_DTS_RESULT1 0x50001
> +
> +static uint64_t pnv_core_xscom_read(void *opaque, hwaddr addr,
> +unsigned int width)
> +{
> +uint32_t offset = addr >> 3;
> +uint64_t val = 0;
> +
> +/* The result should be 38 C */
> +switch (offset) {
> +case PNV_XSCOM_EX_DTS_RESULT0:
> +val = 0x26f024f023full;
> +break;
> +case PNV_XSCOM_EX_DTS_RESULT1:
> +val = 0x24full;
> +break;
> +default:
> +qemu_log_mask(LOG_UNIMP, "Warning: reading reg=0x%" HWADDR_PRIx,
> +  addr);
> +}
> +
> +return val;
> +}
> +
> +static void pnv_core_xscom_write(void *opaque, hwaddr addr, uint64_t val,
> + unsigned int width)
> +{
> +qemu_log_mask(LOG_UNIMP, "Warning: writing to reg=0x%" HWADDR_PRIx,
> +  addr);
> +}
> +
> +static const MemoryRegionOps pnv_core_xscom_ops = {
> +.read = pnv_core_xscom_read,
> +.write = pnv_core_xscom_write,
> +.valid.min_access_size = 8,
> +.valid.max_access_size = 8,
> +.impl.min_access_size = 8,
> +.impl.max_access_size = 8,
> +.endianness = DEVICE_BIG_ENDIAN,
> +};
> +
>  static void pnv_core_realize_child(Object *child, Error **errp)
>  {
>  Error *local_err = NULL;
> @@ -118,6 +164,10 @@ static void pnv_core_realize(DeviceState *dev, Error 
> **errp)
>  goto err;
>  }
>  }
> +
> +snprintf(name, sizeof(name), "xscom-core.%d", cc->core_id);
> +pnv_xscom_region_init(&pc->xscom_regs, OBJECT(dev), &pnv_core_xscom_ops,
> +  pc, name, PNV_XSCOM_EX_CORE_SIZE);
>  return;
>  
>  err:
> diff --git a/include/hw/ppc/pnv_core.h b/include/hw/ppc/pnv_core.h
> index a151e281c017..2955a41c901f 100644
> --- a/include/hw/ppc/pnv_core.h
> +++ b/include/hw/ppc/pnv_core.h
> @@ -36,6 +36,8 @@ typedef struct PnvCore {
>  /*< public >*/
>  void *threads;
>  uint32_t pir;
> +
> +MemoryRegion xscom_regs;
>  } PnvCore;
>  
>  typedef struct PnvCoreClass {
> diff --git a/include/hw/ppc/pnv_xscom.h b/include/hw/ppc/pnv_xscom.h
> index ee25ec455e3f..5da6e92e698c 100644
> --- a/include/hw/ppc/pnv_xscom.h
> +++ b/include/hw/ppc/pnv_xscom.h
> @@ -41,6 +41,25 @@ typedef struct PnvXScomInterfaceClass {
>  int (*populate)(PnvXScomInterface *dev, void *fdt, int offset);
>  } PnvXScomInterfaceClass;
>  
> +/*
> + * Layout of the XSCOM PCB addresses of EX core 1
> + *
> +

Re: [Qemu-devel] [PATCH v5 00/17] ppc/pnv: booting the kernel and reaching user space

2016-10-24 Thread David Gibson

On Mon, Oct 24, 2016 at 04:33:33PM +1100, David Gibson wrote:
> On Sat, Oct 22, 2016 at 11:46:33AM +0200, Cédric Le Goater wrote:
> > Hello,
> > 
> > Here is the latest version of the ppc/pnv platform patchset. PowerNV
> > (as Non-Virtualized) is the "baremetal" platform using the OPAL
> > firmware. It runs Linux on IBM and Open Power systems and it can be
> > used as a hypervisor OS, to run KVM guests, or simply as a host OS.
> > The goal here is to add support for the baremetal platform and
> > possibly later also for the KVM PR guests but not for HV guests.
> > 
> > In v5, all the comments from v4 should have been addressed. Most of
> > the differences are cleanups suggested by David but there are a couple of
> > important changes :
> > 
> >  - an addition of a new firmware to qemu : skiboot 5.3.7.
> >  - a rework of the native Interrupt Presentation Controller model
> >which now uses memory subregions instead of a hash table.   
> >  - a removal of the Power9 LPC Controller. This is still in the plans
> >but the models need a little more work.
> > 
> > 
> > The initial patches provide a minimal platform with some RAM to load
> > the ROMs : firmware, kernel and initrd. The device tree is built with
> > what is available at reset time. Then, comes the PnvChip object acting
> > as a container for other devices required to run a system. The cores
> > are added to each chip with some restrictions on the number and the
> > ids. Next is the XSCOM model, the sideband bus which gives controls to
> > all the units in the POWER8 chip, the LPC controller for the console,
> > the native interrupt controller and the PSI HB model to handle the
> > external interrupt.
> > 
> > 
> > The next step should be IPMI support which adds a BT device on the ISA
> > bus and some device tree extensions to read sensors and FRUs. This is
> > relatively straight forward and most of the IPMI code has been
> > discussed already on the list. Then should come a PHB3 model to
> > include some PCI devices. This is big and it needs a few helpers in
> > the PCI core.
> 
> I've merged 1-6 into ppc-for-2.8.  The rest I'm still reviewing.

I've now merged 7-10 into ppc-for-2.8, making the change I requested
to xscom_complete() along the way.  Still looking at the rest.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

Re: [Qemu-devel] [Qemu-stable] [Qemu-ppc] [PULL 0/4] ppc patches for qemu-2.7 stable branch

2016-10-24 Thread David Gibson

On Mon, Oct 17, 2016 at 04:24:31PM -0500, Michael Roth wrote:
> Quoting Peter Maydell (2016-10-17 13:45:21)
> > On 17 October 2016 at 19:13, Michael Roth  wrote:
> > > We could do both though: use some ad-hoc way to tag for a particular
> > > sub-maintainer tree/stable branch, as well as an explicit "not for
> > > master" in the cover letter to ensure it doesn't go into master. It's a bit
> > > more redundant, but flexible in that people can use whatever tagging
> > > format they want for a particular tree.
> > 
> > Yes, that would be my preference. Gmail's filtering is not
> > very good, and it doesn't seem to be able to support
> > multiple or complex matches on the subject line, but
> > it can deal with "doesn't include foo in body".
> > People who actively want to look for stuff not to go
> > into master can filter it however they like.
> 
> Sounds good to me. For my part I think "for-2.7.1" etc. would be
> preferable. No need to resend this patchset though.
> 
> I suppose MAINTAINERS would be the best place to document something
> like this?

So.. regardless of the outcome in general for future stable merges..

Has this batch been merged for 2.7 stable?  Or do I need to resend it
in the new style?

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

[Qemu-devel] [PATCH] hw/arm/pxa2xx: Set valid default values for CCCR and CKEN on PXA255

2016-10-24 Thread Guenter Roeck

The code used default values for PXA270 to configure CCCR. For PXA255,
the resulting register value is invalid (unsupported) and resulted
in a division by zero in the Linux kernel. Use default values from
datasheet instead.

Signed-off-by: Guenter Roeck 
---
 hw/arm/pxa2xx.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/arm/pxa2xx.c b/hw/arm/pxa2xx.c
index 2a2a821..4d9aed4 100644
--- a/hw/arm/pxa2xx.c
+++ b/hw/arm/pxa2xx.c
@@ -2274,7 +2274,9 @@ PXA2xxState *pxa255_init(MemoryRegion *address_space, 
unsigned int sdram_size)
 qdev_get_gpio_in(s->pic, PXA2XX_PIC_LCD));
 
 s->cm_base = 0x4130;
-s->cm_regs[CCCR >> 2] = 0x02000210;/* 416.0 MHz */
+s->cm_regs[CCCR >> 2] = 0x0121;/* from datasheet */
+s->cm_regs[CKEN >> 2] = 0x00017def;/* from datasheet */
+
 s->clkcfg = 0x0009;/* Turbo mode active */
 memory_region_init_io(&s->cm_iomem, NULL, &pxa2xx_cm_ops, s, "pxa2xx-cm", 
0x1000);
 memory_region_add_subregion(address_space, s->cm_base, &s->cm_iomem);
-- 
2.5.0

[Qemu-devel] [PATCH] hw/arm/pxa2xx: Correctly handle external GPIO reset requests

2016-10-24 Thread Guenter Roeck

The internal GPIO reset, enabled with GPR_EN, only applies to GPIO pin 1.
If other GPIO pins are used for reset, this is unrelated to GPR_EN, the
reset is an external reset pin, and it resets the entire system.

This fixes GPIO reset failures seen with various PXA270 emulations (akita,
borzoi, spitz, tosa, terrier) when running Linux.

Signed-off-by: Guenter Roeck 
---
 hw/arm/pxa2xx.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/hw/arm/pxa2xx.c b/hw/arm/pxa2xx.c
index cb55704..2a2a821 100644
--- a/hw/arm/pxa2xx.c
+++ b/hw/arm/pxa2xx.c
@@ -2048,10 +2048,18 @@ static void pxa2xx_reset(void *opaque, int line, int 
level)
 {
 PXA2xxState *s = (PXA2xxState *) opaque;
 
-if (level && (s->pm_regs[PCFR >> 2] & 0x10)) { /* GPR_EN */
+/*
+ * GPIO pin 1 is the CPU internal GPIO reset, enabled with GPR_EN.
+ * Any other pin is board specific and resets the entire system.
+ */
+if (line == 1 && level && (s->pm_regs[PCFR >> 2] & 0x10)) {/* 
GPR_EN */
 cpu_reset(CPU(s->cpu));
 /* TODO: reset peripherals */
 }
+
+if (line != 1 && level) {
+qemu_system_reset_request();
+}
 }
 
 /* Initialise a PXA270 integrated chip (ARM based core).  */
-- 
2.5.0

[Qemu-devel] [PATCH] arm: cubieboard: Add support for initrd

2016-10-24 Thread Guenter Roeck

Signed-off-by: Guenter Roeck 
---
 hw/arm/cubieboard.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/arm/cubieboard.c b/hw/arm/cubieboard.c
index fbd78ed..dd19ba3 100644
--- a/hw/arm/cubieboard.c
+++ b/hw/arm/cubieboard.c
@@ -74,6 +74,7 @@ static void cubieboard_init(MachineState *machine)
 cubieboard_binfo.ram_size = machine->ram_size;
 cubieboard_binfo.kernel_filename = machine->kernel_filename;
 cubieboard_binfo.kernel_cmdline = machine->kernel_cmdline;
+cubieboard_binfo.initrd_filename = machine->initrd_filename;
 arm_load_kernel(>a10->cpu, _binfo);
 }
 
-- 
2.5.0

[Qemu-devel] [PATCH] i.MX: Fix GPIO ISR register write

2016-10-24 Thread Guenter Roeck

Writing the ISR register is supposed to clear interrupt status bits,
not to set them.

This patch makes '-M sabrelite' work without devicetree changes (Linux
kernel versions 3.18 to 4.7 with imx_v6_v7_defconfig and up to v4.8 with
multi_v7_defconfig; mainline has different problems).

Signed-off-by: Guenter Roeck 
---
 hw/gpio/imx_gpio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/gpio/imx_gpio.c b/hw/gpio/imx_gpio.c
index f3574aa..c36c394 100644
--- a/hw/gpio/imx_gpio.c
+++ b/hw/gpio/imx_gpio.c
@@ -237,7 +237,7 @@ static void imx_gpio_write(void *opaque, hwaddr offset, 
uint64_t value,
 break;
 
 case ISR_ADDR:
-s->isr |= ~value;
+s->isr &= ~value;
 imx_gpio_set_all_int_lines(s);
 break;
 
-- 
2.5.0

Re: [Qemu-devel] [PATCH] nbd: Use CoQueue for free_sema instead of CoMutex

2016-10-24 Thread Changlong Xie


On 10/24/2016 05:36 PM, Paolo Bonzini wrote:



On 24/10/2016 03:44, Changlong Xie wrote:

Ping. Any comments? It's really a problem for NBD.


Sorry, I haven't been sending pull requests.  I'll do it this week.



Thanks : )


Paolo


Thanks
 -Xie

On 10/12/2016 06:18 PM, Changlong Xie wrote:

NBD is using the CoMutex in a way that wasn't anticipated. For
example, if there are
N(N=26, MAX_NBD_REQUESTS=16) nbd write requests, so we will invoke
nbd_client_co_pwritev
N times.


time request Actions
11   in_flight=1, Coroutine=C1
22   in_flight=2, Coroutine=C2
...
15   15  in_flight=15, Coroutine=C15
16   16  in_flight=16, Coroutine=C16, free_sema->holder=C16,
mutex->locked=true
17   17  in_flight=16, Coroutine=C17, queue C17 into free_sema->queue
18   18  in_flight=16, Coroutine=C18, queue C18 into free_sema->queue
...
26   N   in_flight=16, Coroutine=C26, queue C26 into free_sema->queue



Once the nbd client receives request No.16's reply, we will re-enter C16.
It's ok, because
it's equal to 'free_sema->holder'.


time request Actions
27   16  in_flight=15, Coroutine=C16, free_sema->holder=C16,
mutex->locked=false



Then nbd_coroutine_end invokes qemu_co_mutex_unlock, which will pop
coroutines from
free_sema->queue's head and enter C17. Moreover, free_sema->holder is C17 now.


time request Actions
28   17  in_flight=16, Coroutine=C17, free_sema->holder=C17,
mutex->locked=true



In the above scenario, we have only received request No.16's reply. As time goes
by, the nbd client will
almost certainly receive replies to requests 1 to 15 rather than to request 17,
which owns C17. In this
case, we will hit the assertion failure "mutex->holder == self", present since
Kevin's commit 0e438cdc
"coroutine: Let CoMutex remember who holds it". For example, if the nbd
client receives request
No.15's reply, qemu will stop unexpectedly:


time request   Actions
29   15(most case) in_flight=15, Coroutine=C15, free_sema->holder=C17,
mutex->locked=false



Per Paolo's suggestion "The simplest fix is to change it to CoQueue,
which is like a condition
variable", this patch replaces CoMutex with CoQueue.

Cc: Wen Congyang 
Reported-by: zhanghailiang 
Suggested-by: Paolo Bonzini 
Signed-off-by: Changlong Xie 
---
   block/nbd-client.c | 8 
   block/nbd-client.h | 2 +-
   2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/block/nbd-client.c b/block/nbd-client.c
index 2cf3237..40b28ab 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -199,8 +199,8 @@ static void nbd_coroutine_start(NbdClientSession *s,
   {
   /* Poor man semaphore.  The free_sema is locked when no other
request
* can be accepted, and unlocked after receiving one reply.  */
-if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
-qemu_co_mutex_lock(>free_sema);
+if (s->in_flight == MAX_NBD_REQUESTS) {
+qemu_co_queue_wait(>free_sema);
   assert(s->in_flight < MAX_NBD_REQUESTS);
   }
   s->in_flight++;
@@ -214,7 +214,7 @@ static void nbd_coroutine_end(NbdClientSession *s,
   int i = HANDLE_TO_INDEX(s, request->handle);
   s->recv_coroutine[i] = NULL;
   if (s->in_flight-- == MAX_NBD_REQUESTS) {
-qemu_co_mutex_unlock(>free_sema);
+qemu_co_queue_next(>free_sema);
   }
   }

@@ -386,7 +386,7 @@ int nbd_client_init(BlockDriverState *bs,
   }

   qemu_co_mutex_init(>send_mutex);
-qemu_co_mutex_init(>free_sema);
+qemu_co_queue_init(>free_sema);
   client->sioc = sioc;
   object_ref(OBJECT(client->sioc));

diff --git a/block/nbd-client.h b/block/nbd-client.h
index 044aca4..307b8b1 100644
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -24,7 +24,7 @@ typedef struct NbdClientSession {
   off_t size;

   CoMutex send_mutex;
-CoMutex free_sema;
+CoQueue free_sema;
   Coroutine *send_coroutine;
   int in_flight;










.

Re: [Qemu-devel] [RESEND PATCH v3 kernel 3/7] mm: add a function to get the max pfn

2016-10-24 Thread Li, Liang Z

> On 10/20/2016 11:24 PM, Liang Li wrote:
> > Expose the function to get the max pfn, so it can be used in the
> > virtio-balloon device driver. Simply include the 'linux/bootmem.h'
> > is not enough: if the device driver is built as a module, directly
> > referring to max_pfn leads to a build failure.
> 
> I'm not sure the rest of the set is worth reviewing.  I think a lot of it will
> change pretty fundamentally once you have those improved data structures
> in place.

That's true. I will send out the v4 as soon as possible.

Liang

Re: [Qemu-devel] [RESEND PATCH v3 kernel 2/7] virtio-balloon: define new feature bit and page bitmap head

2016-10-24 Thread Li, Liang Z

> On 10/20/2016 11:24 PM, Liang Li wrote:
> > Add a new feature which supports sending the page information with a
> > bitmap. The current implementation uses PFNs array, which is not very
> > efficient. Using bitmap can improve the performance of
> > inflating/deflating significantly
> 
> Why is it not efficient?  How is using a bitmap more efficient?  What kinds of
> cases is the bitmap inefficient?
> 
> > The page bitmap header will be used to tell the host some information
> > about the page bitmap. e.g. the page size, page bitmap length and
> > start pfn.
> 
> Why did you choose to add these features to the structure?  What benefits
> do they add?
> 
> Could you describe your solution a bit here, and describe its strengths and
> weaknesses?
> 

Will elaborate the solution in V4.

> >  /* Size of a PFN in the balloon interface. */  #define
> > VIRTIO_BALLOON_PFN_SHIFT 12 @@ -82,4 +83,22 @@ struct
> > virtio_balloon_stat {
> > __virtio64 val;
> >  } __attribute__((packed));
> >
> > +/* Page bitmap header structure */
> > +struct balloon_bmap_hdr {
> > +   /* Used to distinguish different request */
> > +   __virtio16 cmd;
> > +   /* Shift width of page in the bitmap */
> > +   __virtio16 page_shift;
> > +   /* flag used to identify different status */
> > +   __virtio16 flag;
> > +   /* Reserved */
> > +   __virtio16 reserved;
> > +   /* ID of the request */
> > +   __virtio64 req_id;
> > +   /* The pfn of 0 bit in the bitmap */
> > +   __virtio64 start_pfn;
> > +   /* The length of the bitmap, in bytes */
> > +   __virtio64 bmap_len;
> > +};
> 
> FWIW this is totally unreadable.  Please do something like this:
> 
> > +struct balloon_bmap_hdr {
> > +   __virtio16 cmd; /* Used to distinguish different ...
> > +   __virtio16 page_shift;  /* Shift width of page in the bitmap */
> > +   __virtio16 flag;/* flag used to identify different...
> > +   __virtio16 reserved;/* Reserved */
> > +   __virtio64 req_id;  /* ID of the request */
> > +   __virtio64 start_pfn;   /* The pfn of 0 bit in the bitmap */
> > +   __virtio64 bmap_len;/* The length of the bitmap, in bytes */
> > +};
> 
> and please make an effort to add useful comments.  "/* Reserved */"
> seems like a waste of bytes to me.

OK. Maybe 'padding' is better than 'reserved' .

Thanks for your comments!

Liang

Re: [Qemu-devel] [RESEND PATCH v3 kernel 1/7] virtio-balloon: rework deflate to add page to a list

2016-10-24 Thread Li, Liang Z

> On 10/20/2016 11:24 PM, Liang Li wrote:
> > Will allow faster notifications using a bitmap down the road.
> > balloon_pfn_to_page() can be removed because it's useless.
> 
> This is a pretty terse description of what's going on here.  Could you try to
> elaborate a bit?  What *is* the current approach?  Why does it not work
> going forward?  What do you propose instead?  Why is it better?

Sure. The description will be more clear if it's described as you suggest. 
Thanks!

Liang

Re: [Qemu-devel] [PATCHv3 01/12] pseries: Split device tree construction from device tree load

2016-10-24 Thread Alexey Kardashevskiy

On 24/10/16 16:04, David Gibson wrote:
> spapr_finalize_fdt() both finishes building the device tree for the guest
> and loads it into guest memory.  For future cleanups, it's going to be
> more convenient to do these two things separately.  The loading portion is
> pretty trivial, so we move it inline into the caller, ppc_spapr_reset().
> 
> We also rename spapr_finalize_fdt(), because the current name is going to
> become inaccurate.
> 
> Signed-off-by: David Gibson 


I did reply with "rb" v2 of this with a comment, somehow it was lost.


Reviewed-by: Alexey Kardashevskiy 

with a small nit, grep finds "spapr_finalize_fdt" in a comment:

hw/ppc/spapr_cpu_core.c:187: * coldplugged CPUs DT entries are setup in
spapr_finalize_fdt().


> ---
>  hw/ppc/spapr.c | 42 +++---
>  1 file changed, 23 insertions(+), 19 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index ddb7438..0864411 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -900,10 +900,9 @@ int spapr_h_cas_compose_response(sPAPRMachineState 
> *spapr,
>  return 0;
>  }
>  
> -static void spapr_finalize_fdt(sPAPRMachineState *spapr,
> -   hwaddr fdt_addr,
> -   hwaddr rtas_addr,
> -   hwaddr rtas_size)
> +static void *spapr_build_fdt(sPAPRMachineState *spapr,
> + hwaddr rtas_addr,
> + hwaddr rtas_size)
>  {
>  MachineState *machine = MACHINE(qdev_get_machine());
>  MachineClass *mc = MACHINE_GET_CLASS(machine);
> @@ -999,19 +998,8 @@ static void spapr_finalize_fdt(sPAPRMachineState *spapr,
>  }
>  }
>  
> -_FDT((fdt_pack(fdt)));
> -
> -if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
> -error_report("FDT too big ! 0x%x bytes (max is 0x%x)",
> - fdt_totalsize(fdt), FDT_MAX_SIZE);
> -exit(1);
> -}
> -
> -qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
> -cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
> -
>  g_free(bootlist);
> -g_free(fdt);
> +return fdt;
>  }
>  
>  static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
> @@ -1147,6 +1135,8 @@ static void ppc_spapr_reset(void)
>  sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
>  PowerPCCPU *first_ppc_cpu;
>  uint32_t rtas_limit;
> +void *fdt;
> +int rc;
>  
>  /* Check for unknown sysbus devices */
>  foreach_dynamic_sysbus_device(find_unknown_sysbus_device, NULL);
> @@ -1173,14 +1163,28 @@ static void ppc_spapr_reset(void)
>  spapr->rtas_addr = rtas_limit - RTAS_MAX_SIZE;
>  spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;
>  
> -/* Load the fdt */
> -spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
> -   spapr->rtas_size);
> +fdt = spapr_build_fdt(spapr, spapr->rtas_addr, spapr->rtas_size);
>  
>  /* Copy RTAS over */
>  cpu_physical_memory_write(spapr->rtas_addr, spapr->rtas_blob,
>spapr->rtas_size);
>  
> +rc = fdt_pack(fdt);
> +
> +/* Should only fail if we've built a corrupted tree */
> +assert(rc == 0);
> +
> +if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
> +error_report("FDT too big ! 0x%x bytes (max is 0x%x)",
> + fdt_totalsize(fdt), FDT_MAX_SIZE);
> +exit(1);
> +}
> +
> +/* Load the fdt */
> +qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
> +cpu_physical_memory_write(spapr->fdt_addr, fdt, fdt_totalsize(fdt));
> +g_free(fdt);
> +
>  /* Set up the entry state */
>  first_ppc_cpu = POWERPC_CPU(first_cpu);
>  first_ppc_cpu->env.gpr[3] = spapr->fdt_addr;
> 


-- 
Alexey

Re: [Qemu-devel] [Qemu-arm] [PATCH] char: cadence: check divider against baud rate

2016-10-24 Thread Alistair Francis

On Mon, Oct 24, 2016 at 6:25 AM, P J P  wrote:
> +-- On Mon, 24 Oct 2016, Alistair Francis wrote --+
> | > | 
> http://www.xilinx.com/support/documentation/user_guides/ug585-Zynq-7000-TRM.pdf
> |
> | Did the TRM have enough detail for you to figure out how the hardware 
> behaves?
>
>   Yes, it defines range for 'Baud rate generator' and 'Baud rate divider'
> register values and their default reset value. I've sent patch v2.

Great!

Thanks,

Alistair

>
> Thank you.
> --
> Prasad J Pandit / Red Hat Product Security Team
> 47AF CE69 3A90 54AA 9045 1053 DD13 3D32 FE5B 041F
>

Re: [Qemu-devel] Assertion failure on qcow2 disk with cluster_size != 64k

2016-10-24 Thread Ed Swierk

On Mon, Oct 24, 2016 at 2:21 PM, Eric Blake  wrote:
> How are you getting max_transfer == 65536?  I can't reproduce it with
> the following setup:
>
> $ qemu-img create -f qcow2 -o cluster_size=1M file 10M
> $ qemu-io -f qcow2 -c 'w 7m 1k' file
> $ qemu-io -f qcow2 -c 'w -z 8003584 2093056' file
>
> although I did confirm that the above sequence was enough to get the
> -ENOTSUP failure and fall into the code calculating max_transfer.
>
> I'm guessing that you are using something other than a file system as
> the backing protocol for your qcow2 image.  But do you really have a
> protocol that takes AT MOST 64k per transaction, while still trying to use a
> cluster size of 1M in the qcow2 format?  That's rather awkward, as it
> means that you are required to do 16 transactions per cluster (the whole
> point of using larger clusters is usually to get fewer transactions).  I
> think we need to get to a root cause of why you are seeing such a small
> max_transfer, before I can propose the right patch, since I haven't been
> able to reproduce it locally yet (although I admit I haven't tried to
> see if blkdebug could reliably introduce artificial limits to simulate
> your setup).  And it may turn out that I just have to fix the
> bdrv_co_do_pwrite_zeroes() code to loop multiple times if the size of
> the unaligned head really does exceed the max_transfer size that the
> underlying protocol is able to support, rather than assuming that the
> unaligned head/tail always fit in a single fallback write.

In this case I'm using a qcow2 image that's stored directly in a raw
dm-crypt/LUKS container, which is itself a loop device on an ext4
filesystem.

It appears loop devices (with or without dm-crypt/LUKS) report a
255-sector maximum per request via the BLKSECTGET ioctl, which qemu
rounds down to 64k in raw_refresh_limits(). However this maximum
appears to be just a hint: bdrv_driver_pwritev() succeeds even with a
385024-byte buffer of zeroes.

As for the 1M cluster size, this is a temporary workaround for another
qemu issue (the default qcow2 L2 table cache size performs well with
random reads covering only up to 8 GB of image data with 64k clusters;
beyond that the L2 table cache thrashes). I agree this is not an
optimal configuration for writes.

> Can you also try this patch? If I'm right, you'll still fail, but the
> assertion will be slightly different.  (Again, I'm passing locally, but
> that's because I'm using the file protocol, and my file system does not
> impose a puny 64k max transfer).
>
> diff --git i/block/io.c w/block/io.c
> index b136c89..8757063 100644
> --- i/block/io.c
> +++ w/block/io.c
> @@ -1179,6 +1179,8 @@ static int coroutine_fn
> bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
>  int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
>  int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
>  bs->bl.request_alignment);
> +int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
> +MAX_WRITE_ZEROES_BOUNCE_BUFFER);
>
>  assert(alignment % bs->bl.request_alignment == 0);
>  head = offset % alignment;
> @@ -1197,6 +1199,8 @@ static int coroutine_fn
> bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
>  /* Make a small request up to the first aligned sector.  */
>  num = MIN(count, alignment - head);
>  head = 0;
> +assert(num < max_write_zeroes);
> +assert(num < max_transfer);
>  } else if (tail && num > alignment) {
>  /* Shorten the request to the last aligned sector.  */
>  num -= tail;
> @@ -1222,8 +1226,6 @@ static int coroutine_fn
> bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
>
>  if (ret == -ENOTSUP) {
>  /* Fall back to bounce buffer if write zeroes is unsupported */
> -int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
> -
> MAX_WRITE_ZEROES_BOUNCE_BUFFER);
>  BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
>
>  if ((flags & BDRV_REQ_FUA) &&

With this change, the num < max_transfer assertion fails on the first
iteration (with num=385024 and max_transfer=65536).

--Ed

[Qemu-devel] [PATCH 2/2] memory: Don't use memcpy for ram_device regions

2016-10-24 Thread Alex Williamson

With a vfio assigned device we lay down a base MemoryRegion registered
as an IO region, giving us read & write accessors.  If the region
supports mmap, we lay down a higher priority sub-region MemoryRegion
on top of the base layer initialized as a RAM device pointer to the
mmap.  Finally, if we have any quirks for the device (ie. address
ranges that need additional virtualization support), we put another IO
sub-region on top of the mmap MemoryRegion.  When this is flattened,
we now potentially have sub-page mmap MemoryRegions exposed which
cannot be directly mapped through KVM.

This is as expected, but a subtle detail of this is that we end up
with two different access mechanisms through QEMU.  If we disable the
mmap MemoryRegion, we make use of the IO MemoryRegion and service
accesses using pread and pwrite to the vfio device file descriptor.
If the mmap MemoryRegion is enabled and results in one of these
sub-page gaps, QEMU handles the access as RAM, using memcpy to the
mmap.  Using either pread/pwrite or the mmap directly should be
correct, but using memcpy causes us problems.  I expect that not only
does memcpy not necessarily honor the original width and alignment in
performing a copy, but it potentially also uses processor instructions
not intended for MMIO spaces.  It turns out that this has been a
problem for Realtek NIC assignment, which has such a quirk that
creates a sub-page mmap MemoryRegion access.

To resolve this, we disable memory_access_is_direct() for ram_device
regions since QEMU assumes that it can use memcpy for those regions.
Instead we access through MemoryRegionOps, which replaces the memcpy
with simple de-references of standard sizes to the host memory.  This
also allows us to eliminate the ram_device bool from the MemoryRegion
structure since we can simply test the ops pointer.

With this patch we attempt to provide unrestricted access to the RAM
device, allowing byte through qword access as well as unaligned
access.  The assumption here is that accesses initiated by the VM are
driven by a device specific driver, which knows the device
capabilities.  If unaligned accesses are not supported by the device,
we don't want them to work in a VM by performing multiple aligned
accesses to compose the unaligned access.  A down-side of this
philosophy is that the xp command from the monitor attempts to use
the largest available access width, unaware of the underlying
device.  Using memcpy had this same restriction, but at least now an
operator can dump individual registers, even if blocks of device
memory may result in access widths beyond the capabilities of a
given device (RTL NICs only support up to dword).

Reported-by: Thorsten Kohfeldt 
Signed-off-by: Alex Williamson 
---
 include/exec/memory.h |7 +++--
 memory.c  |   70 -
 trace-events  |2 +
 3 files changed, 74 insertions(+), 5 deletions(-)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index a75b8c3..2d4a287 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -209,7 +209,6 @@ struct MemoryRegion {
 void (*destructor)(MemoryRegion *mr);
 uint64_t align;
 bool terminates;
-bool ram_device;
 bool enabled;
 bool warning_printed; /* For reservations */
 uint8_t vga_logging_count;
@@ -1480,9 +1479,11 @@ void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t 
addr);
 static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
 {
 if (is_write) {
-return memory_region_is_ram(mr) && !mr->readonly;
+return memory_region_is_ram(mr) &&
+   !mr->readonly && !memory_region_is_ram_device(mr);
 } else {
-return memory_region_is_ram(mr) || memory_region_is_romd(mr);
+return (memory_region_is_ram(mr) && !memory_region_is_ram_device(mr)) 
||
+   memory_region_is_romd(mr);
 }
 }
 
diff --git a/memory.c b/memory.c
index 7ffcff1..d07f785 100644
--- a/memory.c
+++ b/memory.c
@@ -1128,6 +1128,71 @@ const MemoryRegionOps unassigned_mem_ops = {
 .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
+static uint64_t memory_region_ram_device_read(void *opaque,
+  hwaddr addr, unsigned size)
+{
+MemoryRegion *mr = opaque;
+uint64_t data = (uint64_t)~0;
+
+switch (size) {
+case 1:
+data = *(uint8_t *)(mr->ram_block->host + addr);
+break;
+case 2:
+data = *(uint16_t *)(mr->ram_block->host + addr);
+break;
+case 4:
+data = *(uint32_t *)(mr->ram_block->host + addr);
+break;
+case 8:
+data = *(uint64_t *)(mr->ram_block->host + addr);
+break;
+}
+
+trace_memory_region_ram_device_read(get_cpu_index(), mr, addr, data, size);
+
+return data;
+}
+
+static void memory_region_ram_device_write(void *opaque, hwaddr addr,
+

[Qemu-devel] [PATCH 1/2] memory: Replace skip_dump flag with "ram_device"

2016-10-24 Thread Alex Williamson

Setting skip_dump on a MemoryRegion allows us to modify one specific
code path, but the restriction we're trying to address encompasses
more than that.  If we have a RAM MemoryRegion backed by a physical
device, it not only restricts our ability to dump that region, but
also affects how we should manipulate it.  Here we recognize that
MemoryRegions do not change to sometimes allow dumps and other times
not, so we replace setting the skip_dump flag with a new initializer
so that we know exactly the type of region to which we're applying
this behavior.

Signed-off-by: Alex Williamson 
---
 hw/vfio/common.c  |9 -
 hw/vfio/spapr.c   |2 +-
 include/exec/memory.h |   41 -
 memory.c  |   13 +
 memory_mapping.c  |2 +-
 5 files changed, 43 insertions(+), 24 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 9505fb3..c764cb3 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -724,12 +724,11 @@ int vfio_region_mmap(VFIORegion *region)
 
 name = g_strdup_printf("%s mmaps[%d]",
memory_region_name(region->mem), i);
-memory_region_init_ram_ptr(>mmaps[i].mem,
-   memory_region_owner(region->mem),
-   name, region->mmaps[i].size,
-   region->mmaps[i].mmap);
+memory_region_init_ram_device_ptr(>mmaps[i].mem,
+  memory_region_owner(region->mem),
+  name, region->mmaps[i].size,
+  region->mmaps[i].mmap);
 g_free(name);
-memory_region_set_skip_dump(>mmaps[i].mem);
 memory_region_add_subregion(region->mem, region->mmaps[i].offset,
 >mmaps[i].mem);
 
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 7443d34..4409bcc 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -25,7 +25,7 @@ static bool 
vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
 }
 
 return !memory_region_is_ram(section->mr) ||
-memory_region_is_skip_dump(section->mr);
+memory_region_is_ram_device(section->mr);
 }
 
 static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 79ccaab..a75b8c3 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -209,7 +209,7 @@ struct MemoryRegion {
 void (*destructor)(MemoryRegion *mr);
 uint64_t align;
 bool terminates;
-bool skip_dump;
+bool ram_device;
 bool enabled;
 bool warning_printed; /* For reservations */
 uint8_t vga_logging_count;
@@ -449,6 +449,30 @@ void memory_region_init_ram_ptr(MemoryRegion *mr,
 void *ptr);
 
 /**
+ * memory_region_init_ram_device_ptr:  Initialize RAM device memory region from
+ * a user-provided pointer.
+ *
+ * A RAM device represents a mapping to a physical device, such as to a PCI
+ * MMIO BAR of an vfio-pci assigned device.  The memory region may be mapped
+ * into the VM address space and access to the region will modify memory
+ * directly.  However, the memory region should not be included in a memory
+ * dump (device may not be enabled/mapped at the time of the dump), and
+ * operations incompatible with manipulating MMIO should be avoided.  Replaces
+ * skip_dump flag.
+ *
+ * @mr: the #MemoryRegion to be initialized.
+ * @owner: the object that tracks the region's reference count
+ * @name: the name of the region.
+ * @size: size of the region.
+ * @ptr: memory to be mapped; must contain at least @size bytes.
+ */
+void memory_region_init_ram_device_ptr(MemoryRegion *mr,
+   struct Object *owner,
+   const char *name,
+   uint64_t size,
+   void *ptr);
+
+/**
  * memory_region_init_alias: Initialize a memory region that aliases all or a
  *   part of another memory region.
  *
@@ -574,22 +598,13 @@ static inline bool memory_region_is_ram(MemoryRegion *mr)
 }
 
 /**
- * memory_region_is_skip_dump: check whether a memory region should not be
- * dumped
- *
- * Returns %true is a memory region should not be dumped(e.g. VFIO BAR MMAP).
+ * memory_region_is_ram_device: check whether a memory region is a ram device
  *
- * @mr: the memory region being queried
- */
-bool memory_region_is_skip_dump(MemoryRegion *mr);
-
-/**
- * memory_region_set_skip_dump: Set skip_dump flag, dump will ignore this 
memory
- *  region
+ * Returns %true is a memory region is a device backed ram region
  *
  * @mr: the memory region being queried
  */
-void

[Qemu-devel] [PATCH 0/2] memory: Convert skip_dump to ram_device and avoid memcpy

2016-10-24 Thread Alex Williamson

As based on previous RFC:

https://lists.gnu.org/archive/html/qemu-devel/2016-10/msg05183.html

TL;DR, this adds tracing, converts skip_dump to ram_device (named
after rom_device), adds full access widths, identifies ram_device
regions based on ops pointer.

Paolo had suggested converting "skip_dump" to "device_memory", but
"ram_device" seemed like a better fit, feel free to disagree.  It
also didn't feel right to set ram_device via a separate function,
because then we need to worry about all the ways in which a
MemoryRegion might be manipulated to make sure we only apply this
attribute to the kind we want.  Much easier to have a separate
constructor that does this, we're not flipping this on and off like
romd.

Also, I had trouble documenting why I only implemented dword access
as in the RFC, it just seemed like procrastination, so let's just
support full qword.  In fact, why even care about alignment, we
really want to perform the access as prescribed by the guest driver
and performed by the guest processor.  If the driver does an invalid
width or alignment, then so should we.  At least that's my theory.
Of course this means that xp in the monitor once again can't dump
RTL MMIO in more than 4-byte chunks, but that seems like an
operator issue.  Dumping memory via xp is just a tool and if we
apply the tool to do accesses that the device is not capable of,
that's not a problem with the tool.

This fixes what I believe to be the problem Thorsten has
identified with RTL assignment and hopefully he can add a
Tested-by once confirmed.  Thanks,

Alex

---

Alex Williamson (2):
  memory: Replace skip_dump flag with "ram_device"
  memory: Don't use memcpy for ram_device regions


 hw/vfio/common.c  |9 ++
 hw/vfio/spapr.c   |2 +
 include/exec/memory.h |   46 +++--
 memory.c  |   79 +++--
 memory_mapping.c  |2 +
 trace-events  |2 +
 6 files changed, 114 insertions(+), 26 deletions(-)

[Qemu-devel] [Bug 1004408] Re: BUG: Soft Lockup - CPU#0 stuck for 22s! [qemu-system-x86: 31867]

2016-10-24 Thread Thomas Huth

Thanks for the bug report, but please report kernel bugs in the kernel
bug tracker, not in the QEMU bug tracker (see http://www.linux-
kvm.org/page/Bugs for details). So if the problem still persists with
recent kernels, you should open a ticket there instead.

** Changed in: qemu
   Status: New => Invalid

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1004408

Title:
  BUG: Soft Lockup - CPU#0 stuck for 22s! [qemu-system-x86: 31867]

Status in QEMU:
  Invalid

Bug description:
  Environment:
  ---
  * Upstream git version: qemu-kvm-1.1-rc2-4-g3fd9fed
  * Host Kernel: Mainline Kernel - 3.4.0 x86_64 GNU/Linux (Arch: x86_64)
  * CPU model: Intel(R) Xeon(R) CPU X5570  @ 2.93GHz
  * Guest OS: Red Hat Enterprise Linux Server release 6.2 
  * Guest Kernel: 2.6.32-220.el6.x86_64
  * Qemu-command line: 
  /usr/local/bin/qemu-system-x86_64 -name 'vm1' -nodefaults -monitor 
unix:'/tmp/monitor-humanmonitor1-20120525-214210-Zua6',server,nowait -serial 
unix:'/tmp/serial-20120525-214210-Zua6',server,nowait -device 
ich9-usb-uhci1,id=usb1 -drive 
file='/tmp/kvm_autotest_root/images/rhel62-64.qcow2',index=0,if=ide,cache=none 
-device rtl8139,netdev=idvVySvg,mac='9a:6d:16:b9:b5:06',id='idiX1NmG' -netdev 
tap,id=idvVySvg,fd=21 -m 7198 -smp 2 -device 
usb-tablet,id=usb-tablet1,bus=usb1.0 -vnc :0 -vga std

  The qemu is started through autotest.

  Description:
  -

  While running the cgroup test through autotest, the host was hung and
  was not responding. When viewed through serial console, found the
  error "BUG: Soft lockup" error as attached in the screenshot 1.

  There are no errors displayed in /var/log/messages (no call trace) and in 
dmesg.*
  There is a call trace seen in the serial console, which is shown in screenshot 2.

  Steps to reproduce:
  
  Currently I am not able to consistently reproduce this error. However, when I 
tried to reproduce it again by running the cgroup test, found another error 
from syslogd as shown below

  "Message from syslogd@phx3 at May 25 21:56:04 ...
kernel:Kernel panic - not syncing: Watchdog detected hard LOCKUP on cpu 3"

  So this time I got a hard Lockup error. Attached is the screenshot of
  the same. (screenshot-3, see the message at the bottom of the screen).
  This time the cgroup test had completed.

  Please let me know if you require more info on this.

  -prem

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1004408/+subscriptions

[Qemu-devel] [Bug 977391] Re: BUG: soft lockup - CPU#8 stuck for 61s! [kvm:*] in lucid

2016-10-24 Thread Thomas Huth

This sounds like a kernel bug, so it should not be tracked via the QEMU
bug tracker.

** No longer affects: qemu

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/977391

Title:
  BUG: soft lockup - CPU#8 stuck for 61s! [kvm:*]   in lucid

Status in linux package in Ubuntu:
  Incomplete
Status in qemu-kvm package in Ubuntu:
  Confirmed

Bug description:
  Two days back  my KVM base machine got hung up all of a sudden.
  Not sure what exactly happened.

  cat /proc/version_signature 
  Ubuntu 2.6.32-28.55-server 2.6.32.27+drm33.12

  
  -Rahul N.

To manage notifications about this bug go to:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/977391/+subscriptions

Re: [Qemu-devel] [RFC PATCH] memory: Don't use memcpy for ram marked as skip_dump

2016-10-24 Thread Thorsten Kohfeldt



Am 22.10.2016 um 17:09 schrieb Alex Williamson:

On Sat, 22 Oct 2016 11:10:59 +0200
Thorsten Kohfeldt  wrote:


Hi *,

this came to my mind when browsing the sources in the patch's vicinity.

It is just a collection of thoughts, so please don't feel offended
about how I phrased certain statements.


Questions

Is mr->opaque always unused ?
i.e. should we assert NULL before assignment ?


Really I think it's probably unnecessary even to check the mr->ops
pointer.  I don't see a path where we can have both mr->ram true and
either mr->ops or mr->opaque set to anything other than defaults.
However, mr->opaque is consumed by mr->ops, so if mr->ops is set to
&unassigned_mem_ops, then we know opaque is unused.


mr->ops vs. mr->iommu_ops
i.e. can we set mr->opaque if mr->iommu_ops is not NULL ?
or should we even assert mr->iommu_ops NULL, because a
skip_dump mr is not supposed to be addr-translated again ?


Show me where a MemoryRegion with iommu_ops can be mr->ram.


I probably can't.
I was merely expressing a gut feeling.


There is a _shared_ 'io_mem_unassigned' mr.
Are we in danger to modify it ?
Would that hurt ?


No.


Are we generally switching mrops "back and forth",
or is this a first ?


mr->ram is expected to have mr->ops set to the default unassigned
handler via memory_region_initfn().  All MemoryRegions start this way
and have their ops reassigned for certain initialization types.


Can we afford not to implement size 8 or should we rather
force 8 -> 2*4 by setting specific mrop flags if possible ?
Or just hard code case 8: handle longword[1]; fallthru 4:


The memory API code will automatically split a qword into multiple
dword accesses.


I thought that would only happen if any of the mrop constraint flags were set ?
Like min size = 1, max_size = 4 ?
Are those default or propagated to the new op context from somewhere else
(i.e. dealt with before the new mrop callbacks are invoked) ?


When/where is memory_region_set_skip_dump() (supposed to be) called ?


Use the source, it's called by vfio code after initializing the
MemoryRegion to indicate that memory dumps should skip this region as
it's backed by a physical device.


I should have phrased that completely differently.
I was wondering whether any future user of the skip_dump mechanism
would be aware enough of the new implications.
But I see Paolo has suggested a renaming, skip_dump to device_memory.
That looks good enough to me for raising that awareness.


Recommendations

Add comment in skip_dump_mem_read/write NOT to support 64b,
because an error will not be recognised unless specific HW is present
(maybe even give examples of specific HW combinations).


It's not clear to me that this is the correct long term behavior.  RTL
does not support qword access, but other devices might.  The
expectation would be that a guest driver does not use accesses beyond
the capabilities of the device.  It's convenient that limiting to dword
accesses fixes xp memory inspection in the monitor, but that's not a
sufficient reason not to implement qword should we want it for other
devices.


Add comments at more code locations that are break-subpage/mmap-sensible.
For example default vfio slow path mrops should also not support 64b ?


Nope, same.


Add a trace message for each mrop.


Yes, this is on my todo list post-RFC.


Additional patch suggestion(s)

During former investigations I found it not easy to
identify runtime active/current mrops per mr, so:
Add .name to mr->ops/iommu_ops
 to be able to mon-list them together with mr names
OR
(this questions flag reuse/overlay)
skip_dump_flag should rather get a brother
 so (unnamed) ops can be easily concluded for listing ?
 But is this the only mr<->mrop ambiguity ?


This is beyond the scope of this patch, you're welcome to pursue.


And would I find people willing to review ?
See, I did not have that much resonance for my recent patch initiative
'hmp: Improve 'info mtree' with optional parm for mapinfo'.


Most importantly, you haven't indicated whether this patch resolves the
issues you've been having.  Thanks,


I planned on testing the reviewed/revised patch,
but anyway,
I will only find time for testing later this week.


Alex


Regards,

Thorsten

Re: [Qemu-devel] [PATCH v9 05/12] vfio: Introduce common function to add capabilities

2016-10-24 Thread Alex Williamson

On Tue, 25 Oct 2016 02:57:58 +0530
Kirti Wankhede  wrote:

> On 10/21/2016 12:54 AM, Alex Williamson wrote:
> > On Tue, 18 Oct 2016 02:52:05 +0530
> > Kirti Wankhede  wrote:
> >   
> >> Vendor driver using mediated device framework should use
> >> vfio_info_add_capability() to add capabilities.
> >> Introduced this function to reduce code duplication in vendor drivers.
> >>
> >> Signed-off-by: Kirti Wankhede 
> >> Signed-off-by: Neo Jia 
> >> Change-Id: I6fca329fa2291f37a2c859d0bc97574d9e2ce1a6
> >> ---
> >>  drivers/vfio/vfio.c  | 78 
> >> 
> >>  include/linux/vfio.h |  4 +++
> >>  2 files changed, 82 insertions(+)
> >>
> >> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> >> index a5a210005b65..e96cb3f7a23c 100644
> >> --- a/drivers/vfio/vfio.c
> >> +++ b/drivers/vfio/vfio.c
> >> @@ -1799,6 +1799,84 @@ void vfio_info_cap_shift(struct vfio_info_cap 
> >> *caps, size_t offset)
> >>  }
> >>  EXPORT_SYMBOL_GPL(vfio_info_cap_shift);
> >>  
> >> +static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
> >> +{
> >> +  struct vfio_info_cap_header *header;
> >> +  struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
> >> +  size_t size;
> >> +
> >> +  size = sizeof(*sparse) + sparse->nr_areas *  sizeof(*sparse->areas);
> >> +  header = vfio_info_cap_add(caps, size,
> >> + VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
> >> +  if (IS_ERR(header))
> >> +  return PTR_ERR(header);
> >> +
> >> +  sparse_cap = container_of(header,
> >> +  struct vfio_region_info_cap_sparse_mmap, header);
> >> +  sparse_cap->nr_areas = sparse->nr_areas;
> >> +  memcpy(sparse_cap->areas, sparse->areas,
> >> + sparse->nr_areas * sizeof(*sparse->areas));
> >> +  return 0;
> >> +}
> >> +
> >> +static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
> >> +{
> >> +  struct vfio_info_cap_header *header;
> >> +  struct vfio_region_info_cap_type *type_cap, *cap = cap_type;
> >> +
> >> +  header = vfio_info_cap_add(caps, sizeof(*cap),
> >> + VFIO_REGION_INFO_CAP_TYPE, 1);
> >> +  if (IS_ERR(header))
> >> +  return PTR_ERR(header);
> >> +
> >> +  type_cap = container_of(header, struct vfio_region_info_cap_type,
> >> +  header);
> >> +  type_cap->type = cap->type;
> >> +  type_cap->subtype = cap->subtype;
> >> +  return 0;
> >> +}
> >> +
> >> +int vfio_info_add_capability(struct vfio_region_info *info,
> >> +   struct vfio_info_cap *caps,
> >> +   int cap_type_id,
> >> +   void *cap_type)
> >> +{
> >> +  int ret;
> >> +
> >> +  if (!cap_type)
> >> +  return 0;
> >> +
> >> +  switch (cap_type_id) {
> >> +  case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
> >> +  ret = sparse_mmap_cap(caps, cap_type);
> >> +  if (ret)
> >> +  return ret;
> >> +  break;
> >> +
> >> +  case VFIO_REGION_INFO_CAP_TYPE:
> >> +  ret = region_type_cap(caps, cap_type);
> >> +  if (ret)
> >> +  return ret;
> >> +  break;
> >> +  default:
> >> +  return -EINVAL;
> >> +  }
> >> +
> >> +  info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
> >> +
> >> +  if (caps->size) {
> >> +  if (info->argsz < sizeof(*info) + caps->size) {
> >> +  info->argsz = sizeof(*info) + caps->size;
> >> +  info->cap_offset = 0;
> >> +  } else {
> >> +  vfio_info_cap_shift(caps, sizeof(*info));
> >> +  info->cap_offset = sizeof(*info);  
> > 
> > This doesn't work.  We build the capability chain in a buffer and
> > vfio_info_cap_add() expects the chain to be zero-based as each
> > capability is added.  vfio_info_cap_shift() is meant to be called once
> > on that buffer immediately before copying it back to the user buffer to
> > adjust the chain offsets to account for the offset within the buffer.
> > vfio_info_cap_shift() cannot be called repeatedly on the buffer as we
> > do support multiple capabilities in a chain.
> >   
> 
> From the code I see, we add one type of capability at a time, either
> VFIO_REGION_INFO_CAP_SPARSE_MMAP or VFIO_REGION_INFO_CAP_TYPE. Both are
> not the part of same case in the switch, right?
> I do tested VFIO_REGION_INFO_CAP_SPARSE_MMAP by mapping some part of
> BAR0 and that works.

That simply means that we don't _currently_ have a user that implements
multiple chain entries.  The interface is however designed to support
multiple entries and this breaks that goal.  Thanks,

Alex

Re: [Qemu-devel] [PATCH v9 06/12] vfio_pci: Update vfio_pci to use vfio_info_add_capability()

2016-10-24 Thread Alex Williamson

On Tue, 25 Oct 2016 02:52:39 +0530
Kirti Wankhede  wrote:

> On 10/21/2016 12:54 AM, Alex Williamson wrote:
> > On Tue, 18 Oct 2016 02:52:06 +0530
> > Kirti Wankhede  wrote:
> >   
> >> Update msix_sparse_mmap_cap() to use vfio_info_add_capability()
> >> Update region type capability to use vfio_info_add_capability()
> >> Can't split this commit for MSIx and region_type cap since there is a
> >> common code which need to be updated for both the cases.
> >>
> >> Signed-off-by: Kirti Wankhede 
> >> Signed-off-by: Neo Jia 
> >> Change-Id: I52bb28c7875a6da5a79ddad1843e6088aff58a45
> >> ---
> >>  drivers/vfio/pci/vfio_pci.c | 72 
> >> +
> >>  1 file changed, 27 insertions(+), 45 deletions(-)
> >>
> >> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> >> index d624a52f..1ec0565b48ea 100644
> >> --- a/drivers/vfio/pci/vfio_pci.c
> >> +++ b/drivers/vfio/pci/vfio_pci.c
> >> @@ -556,12 +556,12 @@ static int vfio_pci_for_each_slot_or_bus(struct 
> >> pci_dev *pdev,
> >>  }
> >>  
> >>  static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev,
> >> +  struct vfio_region_info *info,
> >>struct vfio_info_cap *caps)
> >>  {
> >> -  struct vfio_info_cap_header *header;
> >>struct vfio_region_info_cap_sparse_mmap *sparse;
> >>size_t end, size;
> >> -  int nr_areas = 2, i = 0;
> >> +  int nr_areas = 2, i = 0, ret;
> >>  
> >>end = pci_resource_len(vdev->pdev, vdev->msix_bar);
> >>  
> >> @@ -572,13 +572,10 @@ static int msix_sparse_mmap_cap(struct 
> >> vfio_pci_device *vdev,
> >>  
> >>size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas));
> >>  
> >> -  header = vfio_info_cap_add(caps, size,
> >> - VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
> >> -  if (IS_ERR(header))
> >> -  return PTR_ERR(header);
> >> +  sparse = kzalloc(size, GFP_KERNEL);
> >> +  if (!sparse)
> >> +  return -ENOMEM;
> >>  
> >> -  sparse = container_of(header,
> >> -struct vfio_region_info_cap_sparse_mmap, header);
> >>sparse->nr_areas = nr_areas;
> >>  
> >>if (vdev->msix_offset & PAGE_MASK) {
> >> @@ -594,26 +591,11 @@ static int msix_sparse_mmap_cap(struct 
> >> vfio_pci_device *vdev,
> >>i++;
> >>}
> >>  
> >> -  return 0;
> >> -}
> >> -
> >> -static int region_type_cap(struct vfio_pci_device *vdev,
> >> - struct vfio_info_cap *caps,
> >> - unsigned int type, unsigned int subtype)
> >> -{
> >> -  struct vfio_info_cap_header *header;
> >> -  struct vfio_region_info_cap_type *cap;
> >> -
> >> -  header = vfio_info_cap_add(caps, sizeof(*cap),
> >> - VFIO_REGION_INFO_CAP_TYPE, 1);
> >> -  if (IS_ERR(header))
> >> -  return PTR_ERR(header);
> >> +  ret = vfio_info_add_capability(info, caps,
> >> +VFIO_REGION_INFO_CAP_SPARSE_MMAP, sparse);
> >> +  kfree(sparse);
> >>  
> >> -  cap = container_of(header, struct vfio_region_info_cap_type, header);
> >> -  cap->type = type;
> >> -  cap->subtype = subtype;
> >> -
> >> -  return 0;
> >> +  return ret;
> >>  }
> >>  
> >>  int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
> >> @@ -704,7 +686,8 @@ static long vfio_pci_ioctl(void *device_data,
> >>if (vdev->bar_mmap_supported[info.index]) {
> >>info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
> >>if (info.index == vdev->msix_bar) {
> >> -  ret = msix_sparse_mmap_cap(vdev, );
> >> +  ret = msix_sparse_mmap_cap(vdev, ,
> >> + );
> >>if (ret)
> >>return ret;
> >>}
> >> @@ -752,6 +735,9 @@ static long vfio_pci_ioctl(void *device_data,
> >>  
> >>break;
> >>default:
> >> +  {
> >> +  struct vfio_region_info_cap_type cap_type;
> >> +
> >>if (info.index >=
> >>VFIO_PCI_NUM_REGIONS + vdev->num_regions)
> >>return -EINVAL;
> >> @@ -762,27 +748,23 @@ static long vfio_pci_ioctl(void *device_data,
> >>info.size = vdev->region[i].size;
> >>info.flags = vdev->region[i].flags;
> >>  
> >> -  ret = region_type_cap(vdev, ,
> >> -vdev->region[i].type,
> >> -vdev->region[i].subtype);
> >> +  cap_type.type = vdev->region[i].type;
> >> +  cap_type.subtype = vdev->region[i].subtype;
> >> +
> >> +  ret = vfio_info_add_capability(, ,
> >> +

[Qemu-devel] [Bug 995758] Re: Possibly inaccurate statement in PC Platform Docs

2016-10-24 Thread Thomas Huth

OK, I just read the text again, and the sentences before the one with
the 0xF indeed sounded like the start address was at the last
byte. I've reworded the text now a little bit so that it should be more
accurate.

** Changed in: qemu
   Status: Invalid => Fix Released

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/995758

Title:
  Possibly inaccurate statement in PC Platform Docs

Status in QEMU:
  Fix Released

Bug description:
  The documentation at:

  http://wiki.qemu.org/Documentation/Platforms/PC

  Contains the statement that the processor, after reset, executes code
  starting from address 0xFFFFF, corresponding to the last byte of the
  single megabyte of memory in the old 8086 address range.

  From my recollection of working in the microcomputer industry in the
  late 1980's, execution actually starts in real mode at the start of
  the last 16 bytes of addressable memory, at 0xFFFF0.  Think about it -
  if it's the last byte there's no room for an address operand to
  accompany a 1-byte opcode.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/995758/+subscriptions

Re: [Qemu-devel] [V7 1/1] fsdev: add IO throttle support to fsdev devices

2016-10-24 Thread Greg Kurz

Re-post (I had hit the send button by error :)

On Sat, 22 Oct 2016 11:07:22 -0400
Pradeep Jagadeesh  wrote:

> Signed-off-by: Pradeep Jagadeesh 
> ---

Hi Pradeep,

I see that Berto already did a thorough review for this patch and I agree with
all the suggestions he made.

I have some more to add. First: this patch doesn't apply, please rebase.

More remarks below.

>  fsdev/Makefile.objs |   1 +
>  fsdev/file-op-9p.h  |   3 +
>  fsdev/qemu-fsdev-opts.c |  76 +++
>  fsdev/qemu-fsdev-throttle.c | 147 
> 
>  fsdev/qemu-fsdev-throttle.h |  37 +++
>  hw/9pfs/9p-local.c  |   9 ++-
>  hw/9pfs/9p.c|   6 ++
>  hw/9pfs/cofile.c|   5 ++
>  8 files changed, 282 insertions(+), 2 deletions(-)
>  create mode 100644 fsdev/qemu-fsdev-throttle.c
>  create mode 100644 fsdev/qemu-fsdev-throttle.h
> 
> This adds the support for the 9p-local driver.
> For now this functionality can be enabled only through qemu cli options.
> QMP interface and support to other drivers need further extensions.
> To make it simple for other drivers, the throttle code has been put in
> separate files.
> 

The above lines are the changelog for the patch. We want this to be displayed
when running 'git log'. For this to happen, please move these lines above your
SoB tag.

Only the vN -> vN+1 changes are not relevant (we don't need to record all the
intermediate reviews in git) and should stay here.

> v1 -> v2:
> 
> -Fixed FsContext redeclaration issue
> -Removed couple of function declarations from 9p-throttle.h
> -Fixed some of the .help messages
> 
> v2 -> v3:
> 
> -Addressed follwing comments by Claudio Fontana
>  -Removed redundant memset calls in fsdev_throttle_configure_iolimits function
>  -Checking throttle structure validity before initializing other structures
>   in fsdev_throttle_configure_iolimits
> 
> -Addressed following comments by Greg Kurz
>  -Moved the code from 9pfs directory to fsdev directory, because the 
> throttling
>   is for the fsdev devices.Renamed the files and functions to fsdev_ from 
> 9pfs_
>  -Renamed throttling cli options to throttling.*, as in QMP cli options
>  -Removed some of the unwanted .h files from qemu-fsdev-throttle.[ch]
>  -Using throttle_enabled() function to set the thottle enabled flag for fsdev.
> 
> v3 -> v4:
> 
> -Addressed following comments by Alberto Garcia
>  -Removed the unwanted locking and other data structures in 
> qemu-fsdev-throttle.[ch]
> 
> -Addressed following comments by Greg Kurz
>  -Removed fsdev_iolimitsenable/disable functions, instead using 
> throttle_enabled function
> 
> v4 -> V5:
>  -Fixed the issue with the larger block size accounting.
>  (i.e, when the 9pfs mounted using msize=xxx option)
> 
> V5 -> V6:
> -Addressed the comments by Alberto Garcia
>  -Removed the fsdev_throttle_timer_cb()
>  -Simplified the  fsdev_throttle_schedule_next_request() as suggested
> 
> V6 -> V7:
> -Addressed the comments by Alberto Garcia
>  -changed the  fsdev_throttle_schedule_next_request() as suggested
> 
> 
> diff --git a/fsdev/Makefile.objs b/fsdev/Makefile.objs
> index 1b120a4..2c6da2d 100644
> --- a/fsdev/Makefile.objs
> +++ b/fsdev/Makefile.objs
> @@ -7,6 +7,7 @@ common-obj-y = qemu-fsdev-dummy.o
>  endif
>  common-obj-y += qemu-fsdev-opts.o
>  
> +common-obj-y += qemu-fsdev-throttle.o
>  # Toplevel always builds this; targets without virtio will put it in
>  # common-obj-y
>  common-obj-$(CONFIG_ALL) += qemu-fsdev-dummy.o
> diff --git a/fsdev/file-op-9p.h b/fsdev/file-op-9p.h
> index 6db9fea..33fe822 100644
> --- a/fsdev/file-op-9p.h
> +++ b/fsdev/file-op-9p.h
> @@ -17,6 +17,7 @@
>  #include 
>  #include 
>  #include 
> +#include "qemu-fsdev-throttle.h"
>  
>  #define SM_LOCAL_MODE_BITS0600
>  #define SM_LOCAL_DIR_MODE_BITS0700
> @@ -74,6 +75,7 @@ typedef struct FsDriverEntry {
>  char *path;
>  int export_flags;
>  FileOperations *ops;
> +FsThrottle fst;
>  } FsDriverEntry;
>  
>  typedef struct FsContext
> @@ -83,6 +85,7 @@ typedef struct FsContext
>  int export_flags;
>  struct xattr_operations **xops;
>  struct extended_ops exops;
> +FsThrottle *fst;
>  /* fs driver specific data */
>  void *private;
>  } FsContext;
> diff --git a/fsdev/qemu-fsdev-opts.c b/fsdev/qemu-fsdev-opts.c
> index 1dd8c7a..395d497 100644
> --- a/fsdev/qemu-fsdev-opts.c
> +++ b/fsdev/qemu-fsdev-opts.c
> @@ -37,6 +37,82 @@ static QemuOptsList qemu_fsdev_opts = {
>  }, {
>  .name = "sock_fd",
>  .type = QEMU_OPT_NUMBER,
> +}, {
> +.name = "throttling.iops-total",
> +.type = QEMU_OPT_NUMBER,
> +.help = "limit total I/O operations per second",
> +},{
> +.name = "throttling.iops-read",
> +.type = QEMU_OPT_NUMBER,
> +.help = "limit read operations per second",
> +

Re: [Qemu-devel] [PATCH v9 05/12] vfio: Introduce common function to add capabilities

2016-10-24 Thread Kirti Wankhede



On 10/21/2016 12:54 AM, Alex Williamson wrote:
> On Tue, 18 Oct 2016 02:52:05 +0530
> Kirti Wankhede  wrote:
> 
>> Vendor driver using mediated device framework should use
>> vfio_info_add_capability() to add capabilities.
>> Introduced this function to reduce code duplication in vendor drivers.
>>
>> Signed-off-by: Kirti Wankhede 
>> Signed-off-by: Neo Jia 
>> Change-Id: I6fca329fa2291f37a2c859d0bc97574d9e2ce1a6
>> ---
>>  drivers/vfio/vfio.c  | 78 
>> 
>>  include/linux/vfio.h |  4 +++
>>  2 files changed, 82 insertions(+)
>>
>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
>> index a5a210005b65..e96cb3f7a23c 100644
>> --- a/drivers/vfio/vfio.c
>> +++ b/drivers/vfio/vfio.c
>> @@ -1799,6 +1799,84 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, 
>> size_t offset)
>>  }
>>  EXPORT_SYMBOL_GPL(vfio_info_cap_shift);
>>  
>> +static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
>> +{
>> +struct vfio_info_cap_header *header;
>> +struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
>> +size_t size;
>> +
>> +size = sizeof(*sparse) + sparse->nr_areas *  sizeof(*sparse->areas);
>> +header = vfio_info_cap_add(caps, size,
>> +   VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
>> +if (IS_ERR(header))
>> +return PTR_ERR(header);
>> +
>> +sparse_cap = container_of(header,
>> +struct vfio_region_info_cap_sparse_mmap, header);
>> +sparse_cap->nr_areas = sparse->nr_areas;
>> +memcpy(sparse_cap->areas, sparse->areas,
>> +   sparse->nr_areas * sizeof(*sparse->areas));
>> +return 0;
>> +}
>> +
>> +static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
>> +{
>> +struct vfio_info_cap_header *header;
>> +struct vfio_region_info_cap_type *type_cap, *cap = cap_type;
>> +
>> +header = vfio_info_cap_add(caps, sizeof(*cap),
>> +   VFIO_REGION_INFO_CAP_TYPE, 1);
>> +if (IS_ERR(header))
>> +return PTR_ERR(header);
>> +
>> +type_cap = container_of(header, struct vfio_region_info_cap_type,
>> +header);
>> +type_cap->type = cap->type;
>> +type_cap->subtype = cap->subtype;
>> +return 0;
>> +}
>> +
>> +int vfio_info_add_capability(struct vfio_region_info *info,
>> + struct vfio_info_cap *caps,
>> + int cap_type_id,
>> + void *cap_type)
>> +{
>> +int ret;
>> +
>> +if (!cap_type)
>> +return 0;
>> +
>> +switch (cap_type_id) {
>> +case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
>> +ret = sparse_mmap_cap(caps, cap_type);
>> +if (ret)
>> +return ret;
>> +break;
>> +
>> +case VFIO_REGION_INFO_CAP_TYPE:
>> +ret = region_type_cap(caps, cap_type);
>> +if (ret)
>> +return ret;
>> +break;
>> +default:
>> +return -EINVAL;
>> +}
>> +
>> +info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
>> +
>> +if (caps->size) {
>> +if (info->argsz < sizeof(*info) + caps->size) {
>> +info->argsz = sizeof(*info) + caps->size;
>> +info->cap_offset = 0;
>> +} else {
>> +vfio_info_cap_shift(caps, sizeof(*info));
>> +info->cap_offset = sizeof(*info);
> 
> This doesn't work.  We build the capability chain in a buffer and
> vfio_info_cap_add() expects the chain to be zero-based as each
> capability is added.  vfio_info_cap_shift() is meant to be called once
> on that buffer immediately before copying it back to the user buffer to
> adjust the chain offsets to account for the offset within the buffer.
> vfio_info_cap_shift() cannot be called repeatedly on the buffer as we
> do support multiple capabilities in a chain.
> 

From the code I see, we add one type of capability at a time, either
VFIO_REGION_INFO_CAP_SPARSE_MMAP or VFIO_REGION_INFO_CAP_TYPE. Both are
not the part of same case in the switch, right?
I did test VFIO_REGION_INFO_CAP_SPARSE_MMAP by mapping some part of
BAR0 and that works.

Kirti.

Re: [Qemu-devel] [V7 1/1] fsdev: add IO throttle support to fsdev devices

2016-10-24 Thread Greg Kurz

On Sat, 22 Oct 2016 11:07:22 -0400
Pradeep Jagadeesh  wrote:

> Signed-off-by: Pradeep Jagadeesh 
> ---

Hi Pradeep,

I see that Berto already did a thorough review for this patch and I agree with
all the suggestions he made.

I have some more to add. First: this patch doesn't apply cleanly, please
rebase. More remarks below.

>  fsdev/Makefile.objs |   1 +
>  fsdev/file-op-9p.h  |   3 +
>  fsdev/qemu-fsdev-opts.c |  76 +++
>  fsdev/qemu-fsdev-throttle.c | 147 
> 
>  fsdev/qemu-fsdev-throttle.h |  37 +++
>  hw/9pfs/9p-local.c  |   9 ++-
>  hw/9pfs/9p.c|   6 ++
>  hw/9pfs/cofile.c|   5 ++
>  8 files changed, 282 insertions(+), 2 deletions(-)
>  create mode 100644 fsdev/qemu-fsdev-throttle.c
>  create mode 100644 fsdev/qemu-fsdev-throttle.h
> 
> This adds the support for the 9p-local driver.
> For now this functionality can be enabled only through qemu cli options.
> QMP interface and support to other drivers need further extensions.
> To make it simple for other drivers, the throttle code has been put in
> separate files.
> 

The above lines are the changelog for the patch. We want this to be displayed
when running 'git log'. For this to happen, please move these lines above your
SoB tag.

Only the vN -> vN+1 changes are not relevant (we don't need to record all the
intermediate reviews in git) and should stay here.

> v1 -> v2:
> 
> -Fixed FsContext redeclaration issue
> -Removed couple of function declarations from 9p-throttle.h
> -Fixed some of the .help messages
> 
> v2 -> v3:
> 
> -Addressed follwing comments by Claudio Fontana
>  -Removed redundant memset calls in fsdev_throttle_configure_iolimits function
>  -Checking throttle structure validity before initializing other structures
>   in fsdev_throttle_configure_iolimits
> 
> -Addressed following comments by Greg Kurz
>  -Moved the code from 9pfs directory to fsdev directory, because the 
> throttling
>   is for the fsdev devices.Renamed the files and functions to fsdev_ from 
> 9pfs_
>  -Renamed throttling cli options to throttling.*, as in QMP cli options
>  -Removed some of the unwanted .h files from qemu-fsdev-throttle.[ch]
>  -Using throttle_enabled() function to set the thottle enabled flag for fsdev.
> 
> v3 -> v4:
> 
> -Addressed following comments by Alberto Garcia
>  -Removed the unwanted locking and other data structures in 
> qemu-fsdev-throttle.[ch]
> 
> -Addressed following comments by Greg Kurz
>  -Removed fsdev_iolimitsenable/disable functions, instead using 
> throttle_enabled function
> 
> v4 -> V5:
>  -Fixed the issue with the larger block size accounting.
>  (i.e, when the 9pfs mounted using msize=xxx option)
> 
> V5 -> V6:
> -Addressed the comments by Alberto Garcia
>  -Removed the fsdev_throttle_timer_cb()
>  -Simplified the  fsdev_throttle_schedule_next_request() as suggested
> 
> V6 -> V7:
> -Addressed the comments by Alberto Garcia
>  -changed the  fsdev_throttle_schedule_next_request() as suggested
> 
> 
> diff --git a/fsdev/Makefile.objs b/fsdev/Makefile.objs
> index 1b120a4..2c6da2d 100644
> --- a/fsdev/Makefile.objs
> +++ b/fsdev/Makefile.objs
> @@ -7,6 +7,7 @@ common-obj-y = qemu-fsdev-dummy.o
>  endif
>  common-obj-y += qemu-fsdev-opts.o
>  
> +common-obj-y += qemu-fsdev-throttle.o
>  # Toplevel always builds this; targets without virtio will put it in
>  # common-obj-y
>  common-obj-$(CONFIG_ALL) += qemu-fsdev-dummy.o
> diff --git a/fsdev/file-op-9p.h b/fsdev/file-op-9p.h
> index 6db9fea..33fe822 100644
> --- a/fsdev/file-op-9p.h
> +++ b/fsdev/file-op-9p.h
> @@ -17,6 +17,7 @@
>  #include 
>  #include 
>  #include 
> +#include "qemu-fsdev-throttle.h"
>  
>  #define SM_LOCAL_MODE_BITS0600
>  #define SM_LOCAL_DIR_MODE_BITS0700
> @@ -74,6 +75,7 @@ typedef struct FsDriverEntry {
>  char *path;
>  int export_flags;
>  FileOperations *ops;
> +FsThrottle fst;
>  } FsDriverEntry;
>  
>  typedef struct FsContext
> @@ -83,6 +85,7 @@ typedef struct FsContext
>  int export_flags;
>  struct xattr_operations **xops;
>  struct extended_ops exops;
> +FsThrottle *fst;
>  /* fs driver specific data */
>  void *private;
>  } FsContext;
> diff --git a/fsdev/qemu-fsdev-opts.c b/fsdev/qemu-fsdev-opts.c
> index 1dd8c7a..395d497 100644
> --- a/fsdev/qemu-fsdev-opts.c
> +++ b/fsdev/qemu-fsdev-opts.c
> @@ -37,6 +37,82 @@ static QemuOptsList qemu_fsdev_opts = {
>  }, {
>  .name = "sock_fd",
>  .type = QEMU_OPT_NUMBER,
> +}, {
> +.name = "throttling.iops-total",
> +.type = QEMU_OPT_NUMBER,
> +.help = "limit total I/O operations per second",
> +},{
> +.name = "throttling.iops-read",
> +.type = QEMU_OPT_NUMBER,
> +.help = "limit read operations per second",
> +},{
> +.name =

Re: [Qemu-devel] [PATCH v9 06/12] vfio_pci: Update vfio_pci to use vfio_info_add_capability()

2016-10-24 Thread Kirti Wankhede



On 10/21/2016 12:54 AM, Alex Williamson wrote:
> On Tue, 18 Oct 2016 02:52:06 +0530
> Kirti Wankhede  wrote:
> 
>> Update msix_sparse_mmap_cap() to use vfio_info_add_capability()
>> Update region type capability to use vfio_info_add_capability()
>> Can't split this commit for MSIx and region_type cap since there is a
>> common code which need to be updated for both the cases.
>>
>> Signed-off-by: Kirti Wankhede 
>> Signed-off-by: Neo Jia 
>> Change-Id: I52bb28c7875a6da5a79ddad1843e6088aff58a45
>> ---
>>  drivers/vfio/pci/vfio_pci.c | 72 
>> +
>>  1 file changed, 27 insertions(+), 45 deletions(-)
>>
>> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
>> index d624a52f..1ec0565b48ea 100644
>> --- a/drivers/vfio/pci/vfio_pci.c
>> +++ b/drivers/vfio/pci/vfio_pci.c
>> @@ -556,12 +556,12 @@ static int vfio_pci_for_each_slot_or_bus(struct 
>> pci_dev *pdev,
>>  }
>>  
>>  static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev,
>> +struct vfio_region_info *info,
>>  struct vfio_info_cap *caps)
>>  {
>> -struct vfio_info_cap_header *header;
>>  struct vfio_region_info_cap_sparse_mmap *sparse;
>>  size_t end, size;
>> -int nr_areas = 2, i = 0;
>> +int nr_areas = 2, i = 0, ret;
>>  
>>  end = pci_resource_len(vdev->pdev, vdev->msix_bar);
>>  
>> @@ -572,13 +572,10 @@ static int msix_sparse_mmap_cap(struct vfio_pci_device 
>> *vdev,
>>  
>>  size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas));
>>  
>> -header = vfio_info_cap_add(caps, size,
>> -   VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
>> -if (IS_ERR(header))
>> -return PTR_ERR(header);
>> +sparse = kzalloc(size, GFP_KERNEL);
>> +if (!sparse)
>> +return -ENOMEM;
>>  
>> -sparse = container_of(header,
>> -  struct vfio_region_info_cap_sparse_mmap, header);
>>  sparse->nr_areas = nr_areas;
>>  
>>  if (vdev->msix_offset & PAGE_MASK) {
>> @@ -594,26 +591,11 @@ static int msix_sparse_mmap_cap(struct vfio_pci_device 
>> *vdev,
>>  i++;
>>  }
>>  
>> -return 0;
>> -}
>> -
>> -static int region_type_cap(struct vfio_pci_device *vdev,
>> -   struct vfio_info_cap *caps,
>> -   unsigned int type, unsigned int subtype)
>> -{
>> -struct vfio_info_cap_header *header;
>> -struct vfio_region_info_cap_type *cap;
>> -
>> -header = vfio_info_cap_add(caps, sizeof(*cap),
>> -   VFIO_REGION_INFO_CAP_TYPE, 1);
>> -if (IS_ERR(header))
>> -return PTR_ERR(header);
>> +ret = vfio_info_add_capability(info, caps,
>> +  VFIO_REGION_INFO_CAP_SPARSE_MMAP, sparse);
>> +kfree(sparse);
>>  
>> -cap = container_of(header, struct vfio_region_info_cap_type, header);
>> -cap->type = type;
>> -cap->subtype = subtype;
>> -
>> -return 0;
>> +return ret;
>>  }
>>  
>>  int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
>> @@ -704,7 +686,8 @@ static long vfio_pci_ioctl(void *device_data,
>>  if (vdev->bar_mmap_supported[info.index]) {
>>  info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
>>  if (info.index == vdev->msix_bar) {
>> -ret = msix_sparse_mmap_cap(vdev, );
>> +ret = msix_sparse_mmap_cap(vdev, ,
>> +   );
>>  if (ret)
>>  return ret;
>>  }
>> @@ -752,6 +735,9 @@ static long vfio_pci_ioctl(void *device_data,
>>  
>>  break;
>>  default:
>> +{
>> +struct vfio_region_info_cap_type cap_type;
>> +
>>  if (info.index >=
>>  VFIO_PCI_NUM_REGIONS + vdev->num_regions)
>>  return -EINVAL;
>> @@ -762,27 +748,23 @@ static long vfio_pci_ioctl(void *device_data,
>>  info.size = vdev->region[i].size;
>>  info.flags = vdev->region[i].flags;
>>  
>> -ret = region_type_cap(vdev, ,
>> -  vdev->region[i].type,
>> -  vdev->region[i].subtype);
>> +cap_type.type = vdev->region[i].type;
>> +cap_type.subtype = vdev->region[i].subtype;
>> +
>> +ret = vfio_info_add_capability(, ,
>> +  VFIO_REGION_INFO_CAP_TYPE,
>> +  _type);
>>  if (ret)
>>

Re: [Qemu-devel] Assertion failure on qcow2 disk with cluster_size != 64k

2016-10-24 Thread Eric Blake

On 10/20/2016 07:24 PM, Ed Swierk wrote:
> Shortly after I start qemu 2.7.0 with a qcow2 disk image created with
> -o cluster_size=1048576, it prints the following and dies:
> 
> block/qcow2.c:2451: qcow2_co_pwrite_zeroes: Assertion `head + count <=
> s->cluster_size' failed.
> 
> I narrowed the problem to bdrv_co_do_pwrite_zeroes(), called by
> bdrv_aligned_pwritev() with flags & BDRV_REQ_ZERO_WRITE set.
> 
> On the first loop iteration, offset=8003584, count=2093056,
> head=663552, tail=659456 and num=2093056. qcow2_co_pwrite_zeroes() is
> called with offset=8003584 and count=385024 and finds that the head
> portion is not already zero, so it returns -ENOTSUP.
> bdrv_co_do_pwrite_zeroes() falls back to a normal write, with
> max_transfer=65536.

How are you getting max_transfer == 65536?  I can't reproduce it with
the following setup:

$ qemu-img create -f qcow2 -o cluster_size=1M file 10M
$ qemu-io -f qcow2 -c 'w 7m 1k' file
$ qemu-io -f qcow2 -c 'w -z 8003584 2093056' file

although I did confirm that the above sequence was enough to get the
-ENOTSUP failure and fall into the code calculating max_transfer.

I'm guessing that you are using something other than a file system as
the backing protocol for your qcow2 image.  But do you really have a
protocol that takes AT MOST 64k per transaction, while still trying to use a
cluster size of 1M in the qcow2 format?  That's rather awkward, as it
means that you are required to do 16 transactions per cluster (the whole
point of using larger clusters is usually to get fewer transactions).  I
think we need to get to a root cause of why you are seeing such a small
max_transfer, before I can propose the right patch, since I haven't been
able to reproduce it locally yet (although I admit I haven't tried to
see if blkdebug could reliably introduce artificial limits to simulate
your setup).  And it may turn out that I just have to fix the
bdrv_co_do_pwrite_zeroes() code to loop multiple times if the size of
the unaligned head really does exceed the max_transfer size that the
underlying protocol is able to support, rather than assuming that the
unaligned head/tail always fit in a single fallback write.

Can you also try this patch? If I'm right, you'll still fail, but the
assertion will be slightly different.  (Again, I'm passing locally, but
that's because I'm using the file protocol, and my file system does not
impose a puny 64k max transfer).

diff --git i/block/io.c w/block/io.c
index b136c89..8757063 100644
--- i/block/io.c
+++ w/block/io.c
@@ -1179,6 +1179,8 @@ static int coroutine_fn
bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
 int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
 bs->bl.request_alignment);
+int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
+MAX_WRITE_ZEROES_BOUNCE_BUFFER);

 assert(alignment % bs->bl.request_alignment == 0);
 head = offset % alignment;
@@ -1197,6 +1199,8 @@ static int coroutine_fn
bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
 /* Make a small request up to the first aligned sector.  */
 num = MIN(count, alignment - head);
 head = 0;
+assert(num < max_write_zeroes);
+assert(num < max_transfer);
 } else if (tail && num > alignment) {
 /* Shorten the request to the last aligned sector.  */
 num -= tail;
@@ -1222,8 +1226,6 @@ static int coroutine_fn
bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,

 if (ret == -ENOTSUP) {
 /* Fall back to bounce buffer if write zeroes is unsupported */
-int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
-
MAX_WRITE_ZEROES_BOUNCE_BUFFER);
 BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

 if ((flags & BDRV_REQ_FUA) &&

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org

signature.asc
Description: OpenPGP digital signature

[Qemu-devel] [Bug 995758] Re: Possibly inaccurate statement in PC Platform Docs

2016-10-24 Thread Thomas Huth

As far as I can see, the wording on the page only says that the BIOS
ends at address 0xFFFFF, not that it starts execution at exactly that
address. So I think that page is ok.

** Changed in: qemu
   Status: New => Invalid

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/995758

Title:
  Possibly inaccurate statement in PC Platform Docs

Status in QEMU:
  Fix Released

Bug description:
  The documentation at:

  http://wiki.qemu.org/Documentation/Platforms/PC

  Contains the statement that the processor, after reset, executes code
  starting from address 0xFFFFF, corresponding to the last byte of the
  single megabyte of memory in the old 8086 address range.

  From my recollection of working in the microcomputer industry in the
  late 1980's, execution actually starts in real mode at the start of
  the last 16 bytes of addressable memory, at 0xFFFF0.  Think about it -
  if it's the last byte there's no room for an address operand to
  accompany a 1-byte opcode.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/995758/+subscriptions

Re: [Qemu-devel] vt-x support for qemu

2016-10-24 Thread Anand J

Hi,

I have the following setup. I'm running qemu in software mode, installed
KVM inside qemu. And on top of that I need to run qemu again but with kvm
enabled. But I'm getting the following error when I try to do that.
Can somebody please help me with this?

KVM internal error. Suberror: 1
emulation failure
EAX= EBX=404b ECX= EDX=000f5ea0
ESI= EDI= EBP= ESP=6fd0
EIP=4000 EFL=0086 [--S--P-] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =0010   00c09300 DPL=0 DS   [-WA]
CS =0008   00c09b00 DPL=0 CS32 [-RA]
SS =0010   00c09300 DPL=0 DS   [-WA]
DS =0010   00c09300 DPL=0 DS   [-WA]
FS =0010   00c09300 DPL=0 DS   [-WA]
GS =0010   00c09300 DPL=0 DS   [-WA]
LDT=   8200 DPL=0 LDT
TR =   8b00 DPL=0 TSS32-busy
GDT= 000f7180 0037
IDT= 000f71be 
CR0=0011 CR2= CR3= CR4=
DR0= DR1= DR2=
DR3=
DR6=0ff0 DR7=0400
EFER=
Code=00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 <00> 00 00
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
00 00

Thanks,
Anand

On Wed, Oct 12, 2016 at 5:35 PM, Paolo Bonzini  wrote:

>
>
> On 12/10/2016 14:03, Anand J wrote:
> > I'm doing this as part of my academic project on virtualization. I'm
> > ready to
> > spend some time on this in order to understand the entire code base.
> >
> > Also I'm assuming that qemu does not have support for vt-x yet.
>
> No, it doesn't.
>
> Paolo
>

[Qemu-devel] [PATCH v2 4/6] blockjob: add block_job_start

2016-10-24 Thread John Snow

Instead of automatically starting jobs at creation time via backup_start
et al, we'd like to return a job object pointer that can be started
manually at a later point in time.

For now, add the block_job_start mechanism and start the jobs
automatically as we have been doing, with conversions job-by-job coming
in later patches.

Of note: cancellation of unstarted jobs will perform all the normal
cleanup as if the job had started, particularly abort and clean. The
only difference is that we will not emit any events, because the job
never actually started.

Signed-off-by: John Snow 
---
 block/backup.c|  3 +--
 block/commit.c|  3 +--
 block/mirror.c|  3 +--
 block/stream.c|  3 +--
 blockjob.c| 51 ---
 include/block/blockjob.h  |  9 +
 tests/test-blockjob-txn.c | 12 +--
 7 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 622f64e..2ce5115 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -638,9 +638,8 @@ void backup_start(const char *job_id, BlockDriverState *bs,
 
 bdrv_op_block_all(target, job->common.blocker);
 job->common.len = len;
-job->common.co = qemu_coroutine_create(job->common.driver->start, job);
 block_job_txn_add_job(txn, >common);
-qemu_coroutine_enter(job->common.co);
+block_job_start(>common);
 return;
 
  error:
diff --git a/block/commit.c b/block/commit.c
index cc2030d..89820d7 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -275,10 +275,9 @@ void commit_start(const char *job_id, BlockDriverState *bs,
 s->backing_file_str = g_strdup(backing_file_str);
 
 s->on_error = on_error;
-s->common.co = qemu_coroutine_create(s->common.driver->start, s);
 
 trace_commit_start(bs, base, top, s, s->common.co);
-qemu_coroutine_enter(s->common.co);
+block_job_start(>common);
 }
 
 
diff --git a/block/mirror.c b/block/mirror.c
index 3a29b94..8130474 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -970,9 +970,8 @@ static void mirror_start_job(const char *job_id, 
BlockDriverState *bs,
 
 bdrv_op_block_all(target, s->common.blocker);
 
-s->common.co = qemu_coroutine_create(s->common.driver->start, s);
 trace_mirror_start(bs, s, s->common.co, opaque);
-qemu_coroutine_enter(s->common.co);
+block_job_start(>common);
 }
 
 void mirror_start(const char *job_id, BlockDriverState *bs,
diff --git a/block/stream.c b/block/stream.c
index 8ffed9c..3e3a7d3 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -231,7 +231,6 @@ void stream_start(const char *job_id, BlockDriverState *bs,
 s->backing_file_str = g_strdup(backing_file_str);
 
 s->on_error = on_error;
-s->common.co = qemu_coroutine_create(s->common.driver->start, s);
 trace_stream_start(bs, base, s, s->common.co);
-qemu_coroutine_enter(s->common.co);
+block_job_start(>common);
 }
diff --git a/blockjob.c b/blockjob.c
index 150b87e..f574bc8 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -171,7 +171,8 @@ void *block_job_create(const char *job_id, const 
BlockJobDriver *driver,
 job->blk   = blk;
 job->cb= cb;
 job->opaque= opaque;
-job->busy  = true;
+job->busy  = false;
+job->paused= true;
 job->refcnt= 1;
 bs->job = job;
 
@@ -199,6 +200,21 @@ bool block_job_is_internal(BlockJob *job)
 return (job->id == NULL);
 }
 
+static bool block_job_started(BlockJob *job)
+{
+return job->co;
+}
+
+void block_job_start(BlockJob *job)
+{
+assert(job && !block_job_started(job) && job->paused &&
+   !job->busy && job->driver->start);
+job->paused = false;
+job->busy = true;
+job->co = qemu_coroutine_create(job->driver->start, job);
+qemu_coroutine_enter(job->co);
+}
+
 void block_job_ref(BlockJob *job)
 {
 ++job->refcnt;
@@ -239,14 +255,18 @@ static void block_job_completed_single(BlockJob *job)
 if (job->cb) {
 job->cb(job->opaque, job->ret);
 }
-if (block_job_is_cancelled(job)) {
-block_job_event_cancelled(job);
-} else {
-const char *msg = NULL;
-if (job->ret < 0) {
-msg = strerror(-job->ret);
+
+/* Emit events only if we actually started */
+if (block_job_started(job)) {
+if (block_job_is_cancelled(job)) {
+block_job_event_cancelled(job);
+} else {
+const char *msg = NULL;
+if (job->ret < 0) {
+msg = strerror(-job->ret);
+}
+block_job_event_completed(job, msg);
 }
-block_job_event_completed(job, msg);
 }
 
 if (job->txn) {
@@ -354,7 +374,8 @@ void block_job_complete(BlockJob *job, Error **errp)
 {
 /* Should not be reachable via external interface for internal jobs */
 assert(job->id);
-if (job->pause_count || job->cancelled || !job->driver->complete) {
+if

[Qemu-devel] [PATCH v2 5/6] blockjob: refactor backup_start as backup_job_create

2016-10-24 Thread John Snow

Refactor backup_start as backup_job_create, which only creates the job,
but does not automatically start it. The old interface, 'backup_start',
is not kept in favor of limiting the number of nearly-identical interfaces
that would have to be edited to keep up with QAPI changes in the future.

Callers that wish to synchronously start the backup_block_job can
instead just call block_job_start immediately after calling
backup_job_create.

Transactions are updated to use the new interface, calling block_job_start
only during the .commit phase, which helps prevent race conditions where
jobs may finish before we even finish building the transaction. This may
happen, for instance, during empty block backup jobs.

Reported-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: John Snow 
---
 block/backup.c| 26 ---
 block/replication.c   | 12 ---
 blockdev.c| 83 ++-
 include/block/block_int.h | 23 ++---
 4 files changed, 87 insertions(+), 57 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 2ce5115..d7e5c48 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -527,7 +527,7 @@ static const BlockJobDriver backup_job_driver = {
 .attached_aio_context   = backup_attached_aio_context,
 };
 
-void backup_start(const char *job_id, BlockDriverState *bs,
+BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
   BlockDriverState *target, int64_t speed,
   MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
   bool compress,
@@ -547,52 +547,52 @@ void backup_start(const char *job_id, BlockDriverState 
*bs,
 
 if (bs == target) {
 error_setg(errp, "Source and target cannot be the same");
-return;
+return NULL;
 }
 
 if (!bdrv_is_inserted(bs)) {
 error_setg(errp, "Device is not inserted: %s",
bdrv_get_device_name(bs));
-return;
+return NULL;
 }
 
 if (!bdrv_is_inserted(target)) {
 error_setg(errp, "Device is not inserted: %s",
bdrv_get_device_name(target));
-return;
+return NULL;
 }
 
 if (compress && target->drv->bdrv_co_pwritev_compressed == NULL) {
 error_setg(errp, "Compression is not supported for this drive %s",
bdrv_get_device_name(target));
-return;
+return NULL;
 }
 
 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
-return;
+return NULL;
 }
 
 if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) {
-return;
+return NULL;
 }
 
 if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
 if (!sync_bitmap) {
 error_setg(errp, "must provide a valid bitmap name for "
  "\"incremental\" sync mode");
-return;
+return NULL;
 }
 
 /* Create a new bitmap, and freeze/disable this one. */
 if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) {
-return;
+return NULL;
 }
 } else if (sync_bitmap) {
 error_setg(errp,
"a sync_bitmap was provided to backup_run, "
"but received an incompatible sync_mode (%s)",
MirrorSyncMode_lookup[sync_mode]);
-return;
+return NULL;
 }
 
 len = bdrv_getlength(bs);
@@ -639,8 +639,8 @@ void backup_start(const char *job_id, BlockDriverState *bs,
 bdrv_op_block_all(target, job->common.blocker);
 job->common.len = len;
 block_job_txn_add_job(txn, >common);
-block_job_start(>common);
-return;
+
+return >common;
 
  error:
 if (sync_bitmap) {
@@ -650,4 +650,6 @@ void backup_start(const char *job_id, BlockDriverState *bs,
 backup_clean(>common);
 block_job_unref(>common);
 }
+
+return NULL;
 }
diff --git a/block/replication.c b/block/replication.c
index d4f4a7b..ca4a381 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -409,6 +409,7 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 int64_t active_length, hidden_length, disk_length;
 AioContext *aio_context;
 Error *local_err = NULL;
+BlockJob *job;
 
 aio_context = bdrv_get_aio_context(bs);
 aio_context_acquire(aio_context);
@@ -496,17 +497,18 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 bdrv_op_block_all(top_bs, s->blocker);
 bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
 
-backup_start(NULL, s->secondary_disk->bs, s->hidden_disk->bs, 0,
- MIRROR_SYNC_MODE_NONE, NULL, false,
- BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
- BLOCK_JOB_INTERNAL, backup_job_completed, s,
- NULL,

[Qemu-devel] [PATCH v2 0/6] jobs: fix transactional race condition

2016-10-24 Thread John Snow

Requires: [Qemu-devel] [PATCH 0/7] blockjobs: preliminary refactoring work, Pt 1

There are a few problems with transactional job completion right now.

First, if jobs complete so quickly they complete before remaining jobs
get a chance to join the transaction, the completion mode can leave well
known state and the QLIST can get corrupted and the transactional jobs
can complete in batches or phases instead of all together.

Second, if two or more jobs defer to the main loop at roughly the same
time, it's possible for one job's cleanup to directly invoke the other
job's cleanup from within the same thread, leading to a situation that
will deadlock the entire transaction.

Thanks to Vladimir for pointing out these modes of failure.

I have omitted the test for right now, but wanted to air the patches on-list.
It makes no attempt to change the locking mechanisms around qmp_transaction
right now, asserting instead that things are no more broken than they were,
especially in the case of dataplane. I will make further attempts to clarify
the locking mechanisms around qmp_transaction after Paolo's changes go in.

===
v2:
===

- Correct Vladimir's email (Sorry!)
- Add test as a variant of an existing test [Vladimir]



For convenience, this branch is available at:
https://github.com/jnsnow/qemu.git branch job-fix-race-condition
https://github.com/jnsnow/qemu/tree/job-fix-race-condition

This version is tagged job-fix-race-condition-v2:
https://github.com/jnsnow/qemu/releases/tag/job-fix-race-condition-v2

John Snow (5):
  blockjob: add .clean property
  blockjob: add .start field
  blockjob: add block_job_start
  blockjob: refactor backup_start as backup_job_create
  iotests: add transactional failure race test

Vladimir Sementsov-Ogievskiy (1):
  blockjob: fix dead pointer in txn list

 block/backup.c   | 59 +--
 block/commit.c   |  4 +--
 block/mirror.c   |  5 +--
 block/replication.c  | 12 ---
 block/stream.c   |  4 +--
 blockdev.c   | 83 
 blockjob.c   | 55 ++---
 include/block/block_int.h| 23 ++--
 include/block/blockjob.h |  9 +
 include/block/blockjob_int.h | 11 ++
 tests/qemu-iotests/124   | 53 ++--
 tests/qemu-iotests/124.out   |  4 +--
 tests/test-blockjob-txn.c| 12 +++
 13 files changed, 219 insertions(+), 115 deletions(-)

-- 
2.7.4

[Qemu-devel] [PATCH v2 6/6] iotests: add transactional failure race test

2016-10-24 Thread John Snow

Add a regression test for the case found by Vladimir.

Reported-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: John Snow 
---
 tests/qemu-iotests/124 | 53 ++
 tests/qemu-iotests/124.out |  4 ++--
 2 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/tests/qemu-iotests/124 b/tests/qemu-iotests/124
index 2f0bc24..bc855ed 100644
--- a/tests/qemu-iotests/124
+++ b/tests/qemu-iotests/124
@@ -395,19 +395,7 @@ class TestIncrementalBackup(TestIncrementalBackupBase):
 self.check_backups()
 
 
-def test_transaction_failure(self):
-'''Test: Verify backups made from a transaction that partially fails.
-
-Add a second drive with its own unique pattern, and add a bitmap to 
each
-drive. Use blkdebug to interfere with the backup on just one drive and
-attempt to create a coherent incremental backup across both drives.
-
-verify a failure in one but not both, then delete the failed stubs and
-re-run the same transaction.
-
-verify that both incrementals are created successfully.
-'''
-
+def do_transaction_failure_test(self, race=False):
 # Create a second drive, with pattern:
 drive1 = self.add_node('drive1')
 self.img_create(drive1['file'], drive1['fmt'])
@@ -451,9 +439,10 @@ class TestIncrementalBackup(TestIncrementalBackupBase):
 self.assertFalse(self.vm.get_qmp_events(wait=False))
 
 # Emulate some writes
-self.hmp_io_writes(drive0['id'], (('0xab', 0, 512),
-  ('0xfe', '16M', '256k'),
-  ('0x64', '32736k', '64k')))
+if not race:
+self.hmp_io_writes(drive0['id'], (('0xab', 0, 512),
+  ('0xfe', '16M', '256k'),
+  ('0x64', '32736k', '64k')))
 self.hmp_io_writes(drive1['id'], (('0xba', 0, 512),
   ('0xef', '16M', '256k'),
   ('0x46', '32736k', '64k')))
@@ -463,7 +452,8 @@ class TestIncrementalBackup(TestIncrementalBackupBase):
 target1 = self.prepare_backup(dr1bm0)
 
 # Ask for a new incremental backup per-each drive,
-# expecting drive1's backup to fail:
+# expecting drive1's backup to fail. In the 'race' test,
+# we expect drive1 to attempt to cancel the empty drive0 job.
 transaction = [
 transaction_drive_backup(drive0['id'], target0, sync='incremental',
  format=drive0['fmt'], mode='existing',
@@ -488,9 +478,15 @@ class TestIncrementalBackup(TestIncrementalBackupBase):
 self.assert_no_active_block_jobs()
 
 # Delete drive0's successful target and eliminate our record of the
-# unsuccessful drive1 target. Then re-run the same transaction.
+# unsuccessful drive1 target.
 dr0bm0.del_target()
 dr1bm0.del_target()
+if race:
+# Don't re-run the transaction, we only wanted to test the race.
+self.vm.shutdown()
+return
+
+# Re-run the same transaction:
 target0 = self.prepare_backup(dr0bm0)
 target1 = self.prepare_backup(dr1bm0)
 
@@ -511,6 +507,27 @@ class TestIncrementalBackup(TestIncrementalBackupBase):
 self.vm.shutdown()
 self.check_backups()
 
+def test_transaction_failure(self):
+'''Test: Verify backups made from a transaction that partially fails.
+
+Add a second drive with its own unique pattern, and add a bitmap to 
each
+drive. Use blkdebug to interfere with the backup on just one drive and
+attempt to create a coherent incremental backup across both drives.
+
+verify a failure in one but not both, then delete the failed stubs and
+re-run the same transaction.
+
+verify that both incrementals are created successfully.
+'''
+self.do_transaction_failure_test()
+
+def test_transaction_failure_race(self):
+'''Test: Verify that transactions with jobs that have no data to
+transfer do not cause race conditions in the cancellation of the entire
+transaction job group.
+'''
+self.do_transaction_failure_test(race=True)
+
 
 def test_sync_dirty_bitmap_missing(self):
 self.assert_no_active_block_jobs()
diff --git a/tests/qemu-iotests/124.out b/tests/qemu-iotests/124.out
index 36376be..e56cae0 100644
--- a/tests/qemu-iotests/124.out
+++ b/tests/qemu-iotests/124.out
@@ -1,5 +1,5 @@
-..
+...
 --
-Ran 10 tests
+Ran 11 tests
 
 OK
-- 
2.7.4

[Qemu-devel] [PATCH v2 1/6] blockjob: fix dead pointer in txn list

2016-10-24 Thread John Snow

From: Vladimir Sementsov-Ogievskiy 

Though it is not intended to be reached through normal circumstances,
if we do not gracefully deconstruct the transaction QLIST, we may wind
up with stale pointers in the list.

The rest of this series attempts to address the underlying issues,
but this should fix list inconsistencies.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Tested-by: John Snow 
Reviewed-by: John Snow 
[Rewrote commit message. --js]
Signed-off-by: John Snow 
Reviewed-by: Eric Blake 
Reviewed-by: Kevin Wolf 

Signed-off-by: John Snow 
---
 blockjob.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/blockjob.c b/blockjob.c
index e1d0382..f55bfec 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -247,6 +247,7 @@ static void block_job_completed_single(BlockJob *job)
 }
 
 if (job->txn) {
+QLIST_REMOVE(job, txn_list);
 block_job_txn_unref(job->txn);
 }
 block_job_unref(job);
-- 
2.7.4

[Qemu-devel] [PATCH v2 2/6] blockjob: add .clean property

2016-10-24 Thread John Snow

Cleaning up after we have deferred to the main thread but before the
transaction has converged can be dangerous and result in deadlocks
if the job cleanup invokes any BH polling loops.

A job may attempt to begin cleaning up, but may induce another job to
enter its cleanup routine. The second job, part of our same transaction,
will block waiting for the first job to finish, so neither job may now
make progress.

To rectify this, allow jobs to register a cleanup operation that will
always run regardless of if the job was in a transaction or not, and
if the transaction job group completed successfully or not.

Move sensitive cleanup to this callback instead which is guaranteed to
be run only after the transaction has converged, which removes sensitive
timing constraints from said cleanup.

Furthermore, in future patches these cleanup operations will be performed
regardless of whether or not we actually started the job. Therefore,
cleanup callbacks should essentially confine themselves to undoing create
operations, e.g. setup actions taken in what is now backup_start.

Reported-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: John Snow 
---
 block/backup.c   | 13 +
 blockjob.c   |  3 +++
 include/block/blockjob_int.h |  8 
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 6d12100..ed6d74a 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -242,6 +242,13 @@ static void backup_abort(BlockJob *job)
 }
 }
 
+static void backup_clean(BlockJob *job)
+{
+BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+assert(s->target);
+blk_unref(s->target);
+}
+
 static void backup_attached_aio_context(BlockJob *job, AioContext *aio_context)
 {
 BackupBlockJob *s = container_of(job, BackupBlockJob, common);
@@ -306,6 +313,7 @@ static const BlockJobDriver backup_job_driver = {
 .set_speed  = backup_set_speed,
 .commit = backup_commit,
 .abort  = backup_abort,
+.clean  = backup_clean,
 .attached_aio_context   = backup_attached_aio_context,
 };
 
@@ -327,11 +335,8 @@ typedef struct {
 
 static void backup_complete(BlockJob *job, void *opaque)
 {
-BackupBlockJob *s = container_of(job, BackupBlockJob, common);
 BackupCompleteData *data = opaque;
 
-blk_unref(s->target);
-
 block_job_completed(job, data->ret);
 g_free(data);
 }
@@ -642,7 +647,7 @@ void backup_start(const char *job_id, BlockDriverState *bs,
 bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
 }
 if (job) {
-blk_unref(job->target);
+backup_clean(>common);
 block_job_unref(>common);
 }
 }
diff --git a/blockjob.c b/blockjob.c
index f55bfec..150b87e 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -232,6 +232,9 @@ static void block_job_completed_single(BlockJob *job)
 job->driver->abort(job);
 }
 }
+if (job->driver->clean) {
+job->driver->clean(job);
+}
 
 if (job->cb) {
 job->cb(job->opaque, job->ret);
diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
index 10ebb38..1c4bc90 100644
--- a/include/block/blockjob_int.h
+++ b/include/block/blockjob_int.h
@@ -74,6 +74,14 @@ struct BlockJobDriver {
 void (*abort)(BlockJob *job);
 
 /**
+ * If the callback is not NULL, it will be invoked after a call to either
+ * .commit() or .abort(). Regardless of which callback is invoked after
+ * completion, .clean() will always be called, even if the job does not
+ * belong to a transaction group.
+ */
+void (*clean)(BlockJob *job);
+
+/**
  * If the callback is not NULL, it will be invoked when the job transitions
  * into the paused state.  Paused jobs must not perform any asynchronous
  * I/O or event loop activity.  This callback is used to quiesce jobs.
-- 
2.7.4

[Qemu-devel] [PATCH v2 3/6] blockjob: add .start field

2016-10-24 Thread John Snow

Add an explicit start field to specify the entrypoint. We already own
the coroutine itself AND manage its lifetime, so let's take control of
creating the coroutine, too.

This will allow us to delay creation of the actual coroutine until we
know we'll actually start a BlockJob in block_job_start. This avoids
the sticky question of how to "un-create" a Coroutine that hasn't been
started yet.

Signed-off-by: John Snow 
---
 block/backup.c   | 23 ---
 block/commit.c   |  3 ++-
 block/mirror.c   |  4 +++-
 block/stream.c   |  3 ++-
 include/block/blockjob_int.h |  3 +++
 5 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index ed6d74a..622f64e 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -307,16 +307,6 @@ void backup_cow_request_end(CowRequest *req)
 cow_request_end(req);
 }
 
-static const BlockJobDriver backup_job_driver = {
-.instance_size  = sizeof(BackupBlockJob),
-.job_type   = BLOCK_JOB_TYPE_BACKUP,
-.set_speed  = backup_set_speed,
-.commit = backup_commit,
-.abort  = backup_abort,
-.clean  = backup_clean,
-.attached_aio_context   = backup_attached_aio_context,
-};
-
 static BlockErrorAction backup_error_action(BackupBlockJob *job,
 bool read, int error)
 {
@@ -526,6 +516,17 @@ static void coroutine_fn backup_run(void *opaque)
 block_job_defer_to_main_loop(>common, backup_complete, data);
 }
 
+static const BlockJobDriver backup_job_driver = {
+.instance_size  = sizeof(BackupBlockJob),
+.job_type   = BLOCK_JOB_TYPE_BACKUP,
+.start  = backup_run,
+.set_speed  = backup_set_speed,
+.commit = backup_commit,
+.abort  = backup_abort,
+.clean  = backup_clean,
+.attached_aio_context   = backup_attached_aio_context,
+};
+
 void backup_start(const char *job_id, BlockDriverState *bs,
   BlockDriverState *target, int64_t speed,
   MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
@@ -637,7 +638,7 @@ void backup_start(const char *job_id, BlockDriverState *bs,
 
 bdrv_op_block_all(target, job->common.blocker);
 job->common.len = len;
-job->common.co = qemu_coroutine_create(backup_run, job);
+job->common.co = qemu_coroutine_create(job->common.driver->start, job);
 block_job_txn_add_job(txn, >common);
 qemu_coroutine_enter(job->common.co);
 return;
diff --git a/block/commit.c b/block/commit.c
index d555600..cc2030d 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -205,6 +205,7 @@ static const BlockJobDriver commit_job_driver = {
 .instance_size = sizeof(CommitBlockJob),
 .job_type  = BLOCK_JOB_TYPE_COMMIT,
 .set_speed = commit_set_speed,
+.start = commit_run,
 };
 
 void commit_start(const char *job_id, BlockDriverState *bs,
@@ -274,7 +275,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
 s->backing_file_str = g_strdup(backing_file_str);
 
 s->on_error = on_error;
-s->common.co = qemu_coroutine_create(commit_run, s);
+s->common.co = qemu_coroutine_create(s->common.driver->start, s);
 
 trace_commit_start(bs, base, top, s, s->common.co);
 qemu_coroutine_enter(s->common.co);
diff --git a/block/mirror.c b/block/mirror.c
index c81b5e0..3a29b94 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -891,6 +891,7 @@ static const BlockJobDriver mirror_job_driver = {
 .instance_size  = sizeof(MirrorBlockJob),
 .job_type   = BLOCK_JOB_TYPE_MIRROR,
 .set_speed  = mirror_set_speed,
+.start  = mirror_run,
 .complete   = mirror_complete,
 .pause  = mirror_pause,
 .attached_aio_context   = mirror_attached_aio_context,
@@ -900,6 +901,7 @@ static const BlockJobDriver commit_active_job_driver = {
 .instance_size  = sizeof(MirrorBlockJob),
 .job_type   = BLOCK_JOB_TYPE_COMMIT,
 .set_speed  = mirror_set_speed,
+.start  = mirror_run,
 .complete   = mirror_complete,
 .pause  = mirror_pause,
 .attached_aio_context   = mirror_attached_aio_context,
@@ -968,7 +970,7 @@ static void mirror_start_job(const char *job_id, 
BlockDriverState *bs,
 
 bdrv_op_block_all(target, s->common.blocker);
 
-s->common.co = qemu_coroutine_create(mirror_run, s);
+s->common.co = qemu_coroutine_create(s->common.driver->start, s);
 trace_mirror_start(bs, s, s->common.co, opaque);
 qemu_coroutine_enter(s->common.co);
 }
diff --git a/block/stream.c b/block/stream.c
index 906f7f3..8ffed9c 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -212,6 +212,7 @@

[Qemu-devel] [Bug 1017793] Re: S3 Trio64V+ support

2016-10-24 Thread Thomas Huth

** Changed in: qemu
   Importance: Undecided => Wishlist

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1017793

Title:
  S3 Trio64V+ support

Status in QEMU:
  New

Bug description:
  Is it possible to add S3 Trio emulation to QEMU at all? Since 0.12.3
  the Cirrus Logic seems no longer working properly (bad font
  render/corrupted video). Also, S3 is a widely supported device on many
  OSes and architectures, which will give more compatibility for QEMU.

  Thanks!

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1017793/+subscriptions

[Qemu-devel] [Bug 1013691] Re: ppc64 + virtio-scsi: only first scsi disk shows up in the guest

2016-10-24 Thread Thomas Huth

** Tags added: ppc

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1013691

Title:
  ppc64 + virtio-scsi: only first scsi disk shows up in the guest

Status in QEMU:
  New

Bug description:
  When adding two virtio-scsi targets to a single guest, only the first
  disk is seen inside the guest.  For some unknown reason the guest
  doesn't enumerate the second disk.

  For full qemu-system-ppc64 command line and 'dmesg' output, see:

  http://lists.nongnu.org/archive/html/qemu-devel/2012-06/msg02430.html

  I have also tried this with Linus's git tree (3.5.0-rc2+ at time of writing),
  same thing.

  In both cases I'm using qemu from git.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1013691/+subscriptions

[Qemu-devel] [Bug 1543057] Re: Warnings are treated as errors

2016-10-24 Thread Thomas Huth

Closing this as invalid - unless you can reproduce this with the latest
release version or the current master branch again, then please feel
free to open this ticket again.

** Changed in: qemu
   Status: New => Invalid

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1543057

Title:
  Warnings are treated as errors

Status in QEMU:
  Invalid

Bug description:
  System: Ubuntu 14.04, 32bit
  Kernel: 3.13.0-55-generic
  Qemu: v. 2.2.50

  Error msg:

  hw/acpi/pcihp.c: In function ‘acpi_pcihp_pc_no_hotplug’:
  hw/acpi/pcihp.c:117:34: error: ‘PCIDevice’ has no member named ‘qdev’
   return (pc->is_bridge && !dev->qdev.hotplugged) || !dc->hotpluggable;
^
  hw/acpi/pcihp.c:118:1: error: control reaches end of non-void function 
[-Werror=return-type]
   }
   ^
  cc1: all warnings being treated as errors

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1543057/+subscriptions

[Qemu-devel] [PATCH 7/7] acpi/ipmi: Initialize the fwinfo before fetching it

2016-10-24 Thread minyard

From: Corey Minyard 

The initialization was missed before, resulting in some
bad data in the smbus case.

Signed-off-by: Corey Minyard 
---
 hw/acpi/ipmi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/acpi/ipmi.c b/hw/acpi/ipmi.c
index 7e74ce4..651e2e9 100644
--- a/hw/acpi/ipmi.c
+++ b/hw/acpi/ipmi.c
@@ -99,6 +99,7 @@ void build_acpi_ipmi_devices(Aml *scope, BusState *bus)
 
 ii = IPMI_INTERFACE(obj);
 iic = IPMI_INTERFACE_GET_CLASS(obj);
+memset(, 0, sizeof(info));
 iic->get_fwinfo(ii, );
 aml_append(scope, aml_ipmi_device());
 }
-- 
2.7.4

[Qemu-devel] [PATCH 3/7] ipmi: chassis poweroff should use qemu_system_shutdown_request()

2016-10-24 Thread minyard

From: Cédric Le Goater 

When issuing a chassis 'powerdown' control command, the routine
qemu_system_shutdown_request() should be used to exit the guest.
qemu_system_powerdown_request() will initiate a soft shutdown which is
not what is required by the IPMI (28.3 Chassis Control Command):

0h = power down. Force system into soft off (S4/S45) state. This
is for 'emergency' management power down actions. The command does
not initiate a clean shut-down of the operating system prior to
powering down the system

Signed-off-by: Cédric Le Goater 
Signed-off-by: Corey Minyard 
---
 hw/ipmi/ipmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/ipmi/ipmi.c b/hw/ipmi/ipmi.c
index f09f217..f91c7b7 100644
--- a/hw/ipmi/ipmi.c
+++ b/hw/ipmi/ipmi.c
@@ -51,7 +51,7 @@ static int ipmi_do_hw_op(IPMIInterface *s, enum ipmi_op op, 
int checkonly)
 if (checkonly) {
 return 0;
 }
-qemu_system_powerdown_request();
+qemu_system_shutdown_request();
 return 0;
 
 case IPMI_SEND_NMI:
-- 
2.7.4

[Qemu-devel] [PATCH 2/7] ipmi_bmc_sim: Remove an unnecessary mutex

2016-10-24 Thread minyard

From: Corey Minyard 

Get rid of the unnecessary mutex, it was a vestige
of something else that was not done.  That way we don't
have to free it.

Signed-off-by: Corey Minyard 
Reviewed-by: Marc-André Lureau 
---
 hw/ipmi/ipmi_bmc_sim.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/hw/ipmi/ipmi_bmc_sim.c b/hw/ipmi/ipmi_bmc_sim.c
index a0282cb..c7883d6 100644
--- a/hw/ipmi/ipmi_bmc_sim.c
+++ b/hw/ipmi/ipmi_bmc_sim.c
@@ -217,7 +217,6 @@ struct IPMIBmcSim {
 /* Odd netfns are for responses, so we only need the even ones. */
 const IPMINetfn *netfns[MAX_NETFNS / 2];
 
-QemuMutex lock;
 /* We allow one event in the buffer */
 uint8_t evtbuf[16];
 
@@ -940,7 +939,6 @@ static void get_msg(IPMIBmcSim *ibs,
 {
 IPMIRcvBufEntry *msg;
 
-qemu_mutex_lock(&ibs->lock);
 if (QTAILQ_EMPTY(&ibs->rcvbufs)) {
 rsp_buffer_set_error(rsp, 0x80); /* Queue empty */
 goto out;
@@ -960,7 +958,6 @@ static void get_msg(IPMIBmcSim *ibs,
 }
 
 out:
-qemu_mutex_unlock(&ibs->lock);
 return;
 }
 
@@ -1055,11 +1052,9 @@ static void send_msg(IPMIBmcSim *ibs,
  end_msg:
 msg->buf[msg->len] = ipmb_checksum(msg->buf, msg->len, 0);
 msg->len++;
-qemu_mutex_lock(&ibs->lock);
 QTAILQ_INSERT_TAIL(&ibs->rcvbufs, msg, entry);
 ibs->msg_flags |= IPMI_BMC_MSG_FLAG_RCV_MSG_QUEUE;
 k->set_atn(s, 1, attn_irq_enabled(ibs));
-qemu_mutex_unlock(&ibs->lock);
 }
 
 static void do_watchdog_reset(IPMIBmcSim *ibs)
@@ -1753,7 +1748,6 @@ static void ipmi_sim_realize(DeviceState *dev, Error 
**errp)
 unsigned int i;
 IPMIBmcSim *ibs = IPMI_BMC_SIMULATOR(b);
 
-qemu_mutex_init(&ibs->lock);
 QTAILQ_INIT(&ibs->rcvbufs);
 
 ibs->bmc_global_enables = (1 << IPMI_BMC_EVENT_LOG_BIT);
-- 
2.7.4

[Qemu-devel] [PATCH 0/7] ipmi: Various little IPMI fixes

2016-10-24 Thread minyard

I've posted these before in various forms, but they haven't been
picked up.  These are little fixes noticed by others and myself,
nothing huge, but things that needed attention.

-corey

Re: [Qemu-devel] target-ppc: gdbstub breakpoints get stuck in an infinite loop on next/continue

2016-10-24 Thread Benjamin Herrenschmidt

On Mon, 2016-10-24 at 12:00 +1100, David Gibson wrote:
> Ben, does it look like the other extraneous changes in bd6fefe are at
> least correct, apart from being in the wrong patch?

It looks like part of my big rewrite of the exception stuff, so I'd
assume it's mostly correct minus a few bugs I fixed separately such
as the one we are just talking about :-)

Cheers,
Ben.

Re: [Qemu-devel] Assertion failure on qcow2 disk with cluster_size != 64k

2016-10-24 Thread Eric Blake

On 10/21/2016 08:14 AM, Ed Swierk wrote:
> On Thu, Oct 20, 2016 at 6:38 PM, Eric Blake  wrote:
>> On 10/20/2016 07:24 PM, Ed Swierk wrote:
>>> Changing max_transfer in the normal write case to
>>> MIN_NON_ZERO(alignment, MAX_WRITE_ZEROES_BOUNCE_BUFFER) appears to fix
>>> the problem, but I don't pretend to understand all the subtleties
>>> here.
>>
>> That actually sounds like the right fix.  But since the bug was probably
>> caused by my code, I'll formalize it into a patch and see if I can
>> modify the testsuite to give it coverage.
> 
> If alignment > MAX_WRITE_ZEROES_BOUNCE_BUFFER (however unlikely) we
> have the same problem, so maybe this would be better?

Our qcow2 support is currently limited to a maximum of 2M clusters;
while MAX_WRITE_ZEROES_BOUNCE_BUFFER is 32k * 512, or 16M.  The
maximum-size bounce buffer should not be the problem here; but for some
reason, it looks like alignment is larger than max_transfer which should
not normally be possible.  I'm still playing with what should be the
right patch, but hope to have something posted soon.

> 
> max_transfer = alignment > 0 ? alignment : MAX_WRITE_ZEROES_BOUNCE_BUFFER
> 
> --Ed
> 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org

signature.asc
Description: OpenPGP digital signature

[Qemu-devel] [PULL v2 02/16] pc: acpi: x2APIC support for SRAT table

2016-10-24 Thread Eduardo Habkost

From: Igor Mammedov 

Signed-off-by: Igor Mammedov 
Reviewed-by: Eduardo Habkost 
Signed-off-by: Eduardo Habkost 
---
 hw/i386/acpi-build.c| 34 --
 include/hw/acpi/acpi-defs.h | 11 +++
 2 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 385f9fc..93be96f 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2421,7 +2421,6 @@ static void
 build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
 {
 AcpiSystemResourceAffinityTable *srat;
-AcpiSratProcessorAffinity *core;
 AcpiSratMemoryAffinity *numamem;
 
 int i;
@@ -2441,18 +2440,33 @@ build_srat(GArray *table_data, BIOSLinker *linker, 
MachineState *machine)
 
 for (i = 0; i < apic_ids->len; i++) {
 int j = numa_get_node_for_cpu(i);
-int apic_id = apic_ids->cpus[i].arch_id;
+uint32_t apic_id = apic_ids->cpus[i].arch_id;
 
-core = acpi_data_push(table_data, sizeof *core);
-core->type = ACPI_SRAT_PROCESSOR_APIC;
-core->length = sizeof(*core);
-core->local_apic_id = apic_id;
-if (j < nb_numa_nodes) {
+if (apic_id < 255) {
+AcpiSratProcessorAffinity *core;
+
+core = acpi_data_push(table_data, sizeof *core);
+core->type = ACPI_SRAT_PROCESSOR_APIC;
+core->length = sizeof(*core);
+core->local_apic_id = apic_id;
+if (j < nb_numa_nodes) {
 core->proximity_lo = j;
+}
+memset(core->proximity_hi, 0, 3);
+core->local_sapic_eid = 0;
+core->flags = cpu_to_le32(1);
+} else {
+AcpiSratProcessorX2ApicAffinity *core;
+
+core = acpi_data_push(table_data, sizeof *core);
+core->type = ACPI_SRAT_PROCESSOR_x2APIC;
+core->length = sizeof(*core);
+core->x2apic_id = cpu_to_le32(apic_id);
+if (j < nb_numa_nodes) {
+core->proximity_domain = cpu_to_le32(j);
+}
+core->flags = cpu_to_le32(1);
 }
-memset(core->proximity_hi, 0, 3);
-core->local_sapic_eid = 0;
-core->flags = cpu_to_le32(1);
 }
 
 
diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h
index e94123c..fa89abc 100644
--- a/include/hw/acpi/acpi-defs.h
+++ b/include/hw/acpi/acpi-defs.h
@@ -503,6 +503,17 @@ struct AcpiSratProcessorAffinity
 } QEMU_PACKED;
 typedef struct AcpiSratProcessorAffinity AcpiSratProcessorAffinity;
 
+struct AcpiSratProcessorX2ApicAffinity {
+ACPI_SUB_HEADER_DEF
+uint16_treserved;
+uint32_tproximity_domain;
+uint32_tx2apic_id;
+uint32_tflags;
+uint32_tclk_domain;
+uint32_treserved2;
+} QEMU_PACKED;
+typedef struct AcpiSratProcessorX2ApicAffinity AcpiSratProcessorX2ApicAffinity;
+
 struct AcpiSratMemoryAffinity
 {
 ACPI_SUB_HEADER_DEF
-- 
2.7.4

[Qemu-devel] [PULL v2 01/16] pc: acpi: x2APIC support for MADT table and _MAT method

2016-10-24 Thread Eduardo Habkost

From: Igor Mammedov 

Signed-off-by: Igor Mammedov 
Reviewed-by: Eduardo Habkost 
Signed-off-by: Eduardo Habkost 
---
 hw/acpi/cpu.c   |  5 +++
 hw/i386/acpi-build.c| 78 +++--
 include/hw/acpi/acpi-defs.h | 18 +++
 3 files changed, 77 insertions(+), 24 deletions(-)

diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c
index 902f5c9..5ac89fe 100644
--- a/hw/acpi/cpu.c
+++ b/hw/acpi/cpu.c
@@ -531,6 +531,11 @@ void build_cpus_aml(Aml *table, MachineState *machine, 
CPUHotplugFeatures opts,
 apic->flags = cpu_to_le32(1);
 break;
 }
+case ACPI_APIC_LOCAL_X2APIC: {
+AcpiMadtProcessorX2Apic *apic = (void *)madt_buf->data;
+apic->flags = cpu_to_le32(1);
+break;
+}
 default:
 assert(0);
 }
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index e999654..385f9fc 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -340,24 +340,38 @@ build_fadt(GArray *table_data, BIOSLinker *linker, 
AcpiPmInfo *pm,
 void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid,
CPUArchIdList *apic_ids, GArray *entry)
 {
-int apic_id;
-AcpiMadtProcessorApic *apic = acpi_data_push(entry, sizeof *apic);
-
-apic_id = apic_ids->cpus[uid].arch_id;
-apic->type = ACPI_APIC_PROCESSOR;
-apic->length = sizeof(*apic);
-apic->processor_id = uid;
-apic->local_apic_id = apic_id;
-if (apic_ids->cpus[uid].cpu != NULL) {
-apic->flags = cpu_to_le32(1);
+uint32_t apic_id = apic_ids->cpus[uid].arch_id;
+
+/* ACPI spec says that LAPIC entry for non present
+ * CPU may be omitted from MADT or it must be marked
+ * as disabled. However omitting non present CPU from
+ * MADT breaks hotplug on linux. So possible CPUs
+ * should be put in MADT but kept disabled.
+ */
+if (apic_id < 255) {
+AcpiMadtProcessorApic *apic = acpi_data_push(entry, sizeof *apic);
+
+apic->type = ACPI_APIC_PROCESSOR;
+apic->length = sizeof(*apic);
+apic->processor_id = uid;
+apic->local_apic_id = apic_id;
+if (apic_ids->cpus[uid].cpu != NULL) {
+apic->flags = cpu_to_le32(1);
+} else {
+apic->flags = cpu_to_le32(0);
+}
 } else {
-/* ACPI spec says that LAPIC entry for non present
- * CPU may be omitted from MADT or it must be marked
- * as disabled. However omitting non present CPU from
- * MADT breaks hotplug on linux. So possible CPUs
- * should be put in MADT but kept disabled.
- */
-apic->flags = cpu_to_le32(0);
+AcpiMadtProcessorX2Apic *apic = acpi_data_push(entry, sizeof *apic);
+
+apic->type = ACPI_APIC_LOCAL_X2APIC;
+apic->length = sizeof(*apic);
+apic->uid = cpu_to_le32(uid);
+apic->x2apic_id = cpu_to_le32(apic_id);
+if (apic_ids->cpus[uid].cpu != NULL) {
+apic->flags = cpu_to_le32(1);
+} else {
+apic->flags = cpu_to_le32(0);
+}
 }
 }
 
@@ -369,11 +383,11 @@ build_madt(GArray *table_data, BIOSLinker *linker, 
PCMachineState *pcms)
 int madt_start = table_data->len;
 AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(pcms->acpi_dev);
 AcpiDeviceIf *adev = ACPI_DEVICE_IF(pcms->acpi_dev);
+bool x2apic_mode = false;
 
 AcpiMultipleApicTable *madt;
 AcpiMadtIoApic *io_apic;
 AcpiMadtIntsrcovr *intsrcovr;
-AcpiMadtLocalNmi *local_nmi;
 int i;
 
 madt = acpi_data_push(table_data, sizeof *madt);
@@ -382,6 +396,9 @@ build_madt(GArray *table_data, BIOSLinker *linker, 
PCMachineState *pcms)
 
 for (i = 0; i < apic_ids->len; i++) {
 adevc->madt_cpu(adev, i, apic_ids, table_data);
+if (apic_ids->cpus[i].arch_id > 254) {
+x2apic_mode = true;
+}
 }
 g_free(apic_ids);
 
@@ -414,12 +431,25 @@ build_madt(GArray *table_data, BIOSLinker *linker, 
PCMachineState *pcms)
 intsrcovr->flags  = cpu_to_le16(0xd); /* active high, level triggered 
*/
 }
 
-local_nmi = acpi_data_push(table_data, sizeof *local_nmi);
-local_nmi->type = ACPI_APIC_LOCAL_NMI;
-local_nmi->length   = sizeof(*local_nmi);
-local_nmi->processor_id = 0xff; /* all processors */
-local_nmi->flags= cpu_to_le16(0);
-local_nmi->lint = 1; /* ACPI_LINT1 */
+if (x2apic_mode) {
+AcpiMadtLocalX2ApicNmi *local_nmi;
+
+local_nmi = acpi_data_push(table_data, sizeof *local_nmi);
+local_nmi->type   = ACPI_APIC_LOCAL_X2APIC_NMI;
+local_nmi->length = sizeof(*local_nmi);
+local_nmi->uid= 0xFFFFFFFF; /* all processors */
+local_nmi->flags  = cpu_to_le16(0);
+local_nmi->lint   = 1; /* ACPI_LINT1 */
+} else

[Qemu-devel] [PULL v2 00/16] x86 and CPU queue, 2016-10-24

2016-10-24 Thread Eduardo Habkost

Change in v2:
* Removed patch: "target-i386: Print warning when mixing [+-]foo
  and foo=(on|off)"

The following changes since commit a3ae21ec3fe036f536dc94cad735931777143103:

  Merge remote-tracking branch 'remotes/bonzini/tags/for-upstream' into staging 
(2016-10-24 15:03:09 +0100)

are available in the git repository at:

  git://github.com/ehabkost/qemu.git tags/x86-pull-request

for you to fetch changes up to 7bbc124e7e8fb544288ccd1f5185643a7d0554b8:

  exec: call cpu_exec_exit() from a CPU unrealize common function (2016-10-24 
17:29:16 -0200)


x86 and CPU queue, 2016-10-24

x2APIC support to APIC code, cpu_exec_init() refactor on all
architectures, and other x86 changes.



Igor Mammedov (13):
  pc: acpi: x2APIC support for MADT table and _MAT method
  pc: acpi: x2APIC support for SRAT table
  acpi: cphp: Force switch to modern cpu hotplug if APIC ID > 254
  pc: Leave max apic_id_limit only in legacy cpu hotplug code
  pc: apic_common: Extend APIC ID property to 32bit
  pc: apic_common: Restore APIC ID to initial ID on reset
  pc: apic_common: Reset APIC ID to initial ID when switching into
x2APIC mode
  pc: kvm_apic: Pass APIC ID depending on xAPIC/x2APIC mode
  pc: Clarify FW_CFG_MAX_CPUS usage comment
  Increase MAX_CPUMASK_BITS from 255 to 288
  pc: Add 'etc/boot-cpus' fw_cfg file for machine with more than 255
CPUs
  pc: Require IRQ remapping and EIM if there could be x2APIC CPUs
  pc: q35: Bump max_cpus to 288

Laurent Vivier (3):
  exec: split cpu_exec_init()
  exec: move cpu_exec_init() calls to realize functions
  exec: call cpu_exec_exit() from a CPU unrealize common function

 exec.c  |  12 +++--
 hw/acpi/cpu.c   |   5 ++
 hw/acpi/cpu_hotplug.c   |  17 --
 hw/arm/virt.c   |   2 +-
 hw/i386/acpi-build.c| 112 
 hw/i386/kvm/apic.c  |  12 -
 hw/i386/pc.c|  82 ++---
 hw/i386/pc_q35.c|   2 +
 hw/intc/apic_common.c   |  52 ++-
 hw/ppc/spapr.c  |   2 +-
 include/exec/exec-all.h |   1 -
 include/hw/acpi/acpi-defs.h |  29 +++
 include/hw/i386/apic_internal.h |   3 +-
 include/hw/i386/pc.h|   2 +
 include/qom/cpu.h   |   4 +-
 include/sysemu/sysemu.h |   2 +-
 qom/cpu.c   |  10 +++-
 target-alpha/cpu.c  |  15 +++---
 target-arm/cpu-qom.h|   2 +
 target-arm/cpu.c|  45 +++-
 target-cris/cpu.c   |  15 +++---
 target-i386/cpu-qom.h   |   1 +
 target-i386/cpu.c   |  17 +-
 target-i386/cpu.h   |   1 +
 target-i386/kvm.c   |  13 +++--
 target-i386/kvm_i386.h  |   1 +
 target-lm32/cpu.c   |  15 +++---
 target-m68k/cpu.c   |  15 +++---
 target-microblaze/cpu.c |  14 ++---
 target-mips/cpu.c   |  15 +++---
 target-moxie/cpu.c  |  15 +++---
 target-openrisc/cpu.c   |  15 +++---
 target-ppc/cpu-qom.h|   1 +
 target-ppc/translate_init.c |  11 +++-
 target-s390x/cpu.c  |   8 +--
 target-sh4/cpu.c|  15 +++---
 target-sparc/cpu.c  |  18 +++
 target-tilegx/cpu.c |  15 +++---
 target-tricore/cpu.c|  15 +++---
 target-unicore32/cpu.c  |  18 +++
 target-xtensa/cpu.c |  15 +++---
 41 files changed, 427 insertions(+), 237 deletions(-)

-- 
2.7.4

[Qemu-devel] [PATCH 6/7] ipmi: Add graceful shutdown handling to the external BMC

2016-10-24 Thread minyard

From: Corey Minyard 

I misunderstood the workings of the power settings, the power off
is a force off operation and there needs to be a separate graceful
shutdown operation.  So replace the force off operation with a
graceful shutdown.

Signed-off-by: Corey Minyard 
---
 hw/ipmi/ipmi_bmc_extern.c | 11 ---
 tests/ipmi-bt-test.c  |  2 +-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/hw/ipmi/ipmi_bmc_extern.c b/hw/ipmi/ipmi_bmc_extern.c
index d30b286..e8e3d25 100644
--- a/hw/ipmi/ipmi_bmc_extern.c
+++ b/hw/ipmi/ipmi_bmc_extern.c
@@ -54,7 +54,8 @@
 #define   VM_CAPABILITIES_IRQ  0x04
 #define   VM_CAPABILITIES_NMI  0x08
 #define   VM_CAPABILITIES_ATTN 0x10
-#define VM_CMD_FORCEOFF0x09
+#define   VM_CAPABILITIES_GRACEFUL_SHUTDOWN 0x20
+#define VM_CMD_GRACEFUL_SHUTDOWN   0x09
 
 #define TYPE_IPMI_BMC_EXTERN "ipmi-bmc-extern"
 #define IPMI_BMC_EXTERN(obj) OBJECT_CHECK(IPMIBmcExtern, (obj), \
@@ -276,8 +277,8 @@ static void handle_hw_op(IPMIBmcExtern *ibe, unsigned char 
hw_op)
 k->do_hw_op(s, IPMI_SEND_NMI, 0);
 break;
 
-case VM_CMD_FORCEOFF:
-qemu_system_shutdown_request();
+case VM_CMD_GRACEFUL_SHUTDOWN:
+k->do_hw_op(s, IPMI_SHUTDOWN_VIA_ACPI_OVERTEMP, 0);
 break;
 }
 }
@@ -401,6 +402,10 @@ static void chr_event(void *opaque, int event)
 if (k->do_hw_op(ibe->parent.intf, IPMI_POWEROFF_CHASSIS, 1) == 0) {
 v |= VM_CAPABILITIES_POWER;
 }
+if (k->do_hw_op(ibe->parent.intf, IPMI_SHUTDOWN_VIA_ACPI_OVERTEMP, 1)
+== 0) {
+v |= VM_CAPABILITIES_GRACEFUL_SHUTDOWN;
+}
 if (k->do_hw_op(ibe->parent.intf, IPMI_RESET_CHASSIS, 1) == 0) {
 v |= VM_CAPABILITIES_RESET;
 }
diff --git a/tests/ipmi-bt-test.c b/tests/ipmi-bt-test.c
index be9005e..65d05b3 100644
--- a/tests/ipmi-bt-test.c
+++ b/tests/ipmi-bt-test.c
@@ -309,7 +309,7 @@ static void test_connect(void)
 uint8_t msg[100];
 unsigned int msglen;
 static uint8_t exp1[] = { 0xff, 0x01, 0xa1 }; /* A protocol version */
-static uint8_t exp2[] = { 0x08, 0x1f, 0xa1 }; /* A capabilities cmd */
+static uint8_t exp2[] = { 0x08, 0x3f, 0xa1 }; /* A capabilities cmd */
 
 FD_ZERO();
 FD_SET(emu_lfd, );
-- 
2.7.4

[Qemu-devel] [PATCH 5/7] ipmi: fix build config variable name for ipmi_bmc_extern.o

2016-10-24 Thread minyard

From: "Daniel P. Berrange" 

The original commit:

  commit 67aa56fc03bea44ccf384ea400515a8a58844a50
  Author: Corey Minyard 
  Date:   Thu Dec 17 12:50:06 2015 -0600

ipmi: Add an external connection simulation interface

defined a new variable CONFIG_IPMI_EXTERN, but then went
on to mistakenly use the pre-existing CONFIG_IPMI_LOCAL
variable.

Signed-off-by: Daniel P. Berrange 
Signed-off-by: Corey Minyard 
---
 hw/ipmi/Makefile.objs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/ipmi/Makefile.objs b/hw/ipmi/Makefile.objs
index a90318d..1b422bb 100644
--- a/hw/ipmi/Makefile.objs
+++ b/hw/ipmi/Makefile.objs
@@ -1,5 +1,5 @@
 common-obj-$(CONFIG_IPMI) += ipmi.o
 common-obj-$(CONFIG_IPMI_LOCAL) += ipmi_bmc_sim.o
-common-obj-$(CONFIG_IPMI_LOCAL) += ipmi_bmc_extern.o
+common-obj-$(CONFIG_IPMI_EXTERN) += ipmi_bmc_extern.o
 common-obj-$(CONFIG_ISA_IPMI_KCS) += isa_ipmi_kcs.o
 common-obj-$(CONFIG_ISA_IPMI_BT) += isa_ipmi_bt.o
-- 
2.7.4

1 2 3 4 5 >

1 - 100 of 476 matches

Mail list logo