Re: [PATCH v2 06/11] ui/sdl2: fix handling of AltGr key on Windows

2020-05-19 Thread Gerd Hoffmann
On Sat, May 16, 2020 at 09:20:09AM +0200, Volker Rümelin wrote:
> Wire up the keyboard hooking code on Windows to fix the AltGr
> key and improve keyboard grabbing.

Ah, I see, you've kept any gtk dependency out of win32-kbd-hook for
sharing with sdl.  Hmm.  I guess there is no easy way around the #ifdefs
then.

take care,
  Gerd




Re: [PATCH v3] block: make BlockConf.*_size properties 32-bit

2020-05-19 Thread Roman Kagan
On Wed, Apr 29, 2020 at 12:18:13PM +0300, Roman Kagan wrote:
> Devices (virtio-blk, scsi, etc.) and the block layer are happy to use
> 32-bit for logical_block_size, physical_block_size, and min_io_size.
> However, the properties in BlockConf are defined as uint16_t limiting
> the values to 32768.
> 
> This appears unnecessarily tight, and we've seen bigger block sizes handy
> at times.
> 
> Make them 32 bit instead and lift the limitation up to 2 MiB which
> appears to be good enough for everybody, and matches the qcow2 cluster
> size limit.
> 
> As the values can now be fairly big and awkward to type, make the
> property setter accept common size suffixes (k, m).
> 
> Signed-off-by: Roman Kagan 
> Reviewed-by: Eric Blake 
> ---
> v2 -> v3:
> - mention qcow2 cluster size limit in the log and comment [Eric]
> 
> v1 -> v2:
> - cap the property at 2 MiB [Eric]
> - accept size suffixes
> 
>  include/hw/block/block.h |  8 
>  include/hw/qdev-properties.h |  2 +-
>  hw/core/qdev-properties.c| 34 --
>  3 files changed, 29 insertions(+), 15 deletions(-)
> 
> diff --git a/include/hw/block/block.h b/include/hw/block/block.h
> index d7246f3862..9dd6bba56a 100644
> --- a/include/hw/block/block.h
> +++ b/include/hw/block/block.h
> @@ -18,9 +18,9 @@
>  
>  typedef struct BlockConf {
>  BlockBackend *blk;
> -uint16_t physical_block_size;
> -uint16_t logical_block_size;
> -uint16_t min_io_size;
> +uint32_t physical_block_size;
> +uint32_t logical_block_size;
> +uint32_t min_io_size;
>  uint32_t opt_io_size;
>  int32_t bootindex;
>  uint32_t discard_granularity;
> @@ -51,7 +51,7 @@ static inline unsigned int get_physical_block_exp(BlockConf 
> *conf)
>_conf.logical_block_size),\
>  DEFINE_PROP_BLOCKSIZE("physical_block_size", _state,\
>_conf.physical_block_size),   \
> -DEFINE_PROP_UINT16("min_io_size", _state, _conf.min_io_size, 0),\
> +DEFINE_PROP_UINT32("min_io_size", _state, _conf.min_io_size, 0),\
>  DEFINE_PROP_UINT32("opt_io_size", _state, _conf.opt_io_size, 0),\
>  DEFINE_PROP_UINT32("discard_granularity", _state,   \
> _conf.discard_granularity, -1),  \
> diff --git a/include/hw/qdev-properties.h b/include/hw/qdev-properties.h
> index f161604fb6..f9e0f8c041 100644
> --- a/include/hw/qdev-properties.h
> +++ b/include/hw/qdev-properties.h
> @@ -197,7 +197,7 @@ extern const PropertyInfo qdev_prop_pcie_link_width;
>  #define DEFINE_PROP_BIOS_CHS_TRANS(_n, _s, _f, _d) \
>  DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_bios_chs_trans, int)
>  #define DEFINE_PROP_BLOCKSIZE(_n, _s, _f) \
> -DEFINE_PROP_UNSIGNED(_n, _s, _f, 0, qdev_prop_blocksize, uint16_t)
> +DEFINE_PROP_UNSIGNED(_n, _s, _f, 0, qdev_prop_blocksize, uint32_t)
>  #define DEFINE_PROP_PCI_HOST_DEVADDR(_n, _s, _f) \
>  DEFINE_PROP(_n, _s, _f, qdev_prop_pci_host_devaddr, PCIHostDeviceAddress)
>  #define DEFINE_PROP_OFF_AUTO_PCIBAR(_n, _s, _f, _d) \
> diff --git a/hw/core/qdev-properties.c b/hw/core/qdev-properties.c
> index 2047114fca..e673f3c43f 100644
> --- a/hw/core/qdev-properties.c
> +++ b/hw/core/qdev-properties.c
> @@ -14,6 +14,7 @@
>  #include "qapi/visitor.h"
>  #include "chardev/char.h"
>  #include "qemu/uuid.h"
> +#include "qemu/units.h"
>  
>  void qdev_prop_set_after_realize(DeviceState *dev, const char *name,
>Error **errp)
> @@ -729,30 +730,42 @@ const PropertyInfo qdev_prop_pci_devfn = {
>  
>  /* --- blocksize --- */
>  
> +/* lower limit is sector size */
> +#define MIN_BLOCK_SIZE  512
> +#define MIN_BLOCK_SIZE_STR  "512 B"
> +/*
> + * upper limit is arbitrary, 2 MiB looks sufficient for all sensible uses, 
> and
> + * matches qcow2 cluster size limit
> + */
> +#define MAX_BLOCK_SIZE  (2 * MiB)
> +#define MAX_BLOCK_SIZE_STR  "2 MiB"
> +
>  static void set_blocksize(Object *obj, Visitor *v, const char *name,
>void *opaque, Error **errp)
>  {
>  DeviceState *dev = DEVICE(obj);
>  Property *prop = opaque;
> -uint16_t value, *ptr = qdev_get_prop_ptr(dev, prop);
> +uint32_t *ptr = qdev_get_prop_ptr(dev, prop);
> +uint64_t value;
>  Error *local_err = NULL;
> -const int64_t min = 512;
> -const int64_t max = 32768;
>  
>  if (dev->realized) {
>  qdev_prop_set_after_realize(dev, name, errp);
>  return;
>  }
>  
> -visit_type_uint16(v, name, &value, &local_err);
> +visit_type_size(v, name, &value, &local_err);
>  if (local_err) {
>  error_propagate(errp, local_err);
>  return;
>  }
>  /* value of 0 means "unset" */
> -if (value && (value < min || value > max)) {
> -error_setg(errp, QERR_PROPERTY_VALUE_OUT_OF_RANGE,
> -   dev->id ? : "", name, (int64_t)value, min, max);
> +if 
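
[The quoted hunk is cut off in the archive.  Based on the constants and
variables introduced earlier in the patch, the remaining range check
presumably looks something like the sketch below; the exact error
message wording is a guess, not the posted code.]

    if (value && (value < MIN_BLOCK_SIZE || value > MAX_BLOCK_SIZE)) {
        error_setg(errp, "Property %s.%s doesn't take value %" PRIu64
                   " (minimum: " MIN_BLOCK_SIZE_STR
                   ", maximum: " MAX_BLOCK_SIZE_STR ")",
                   dev->id ? : "", name, value);
        return;
    }
    *ptr = value;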

Re: [PATCH 15/24] macio: Fix macio-bus to be a subtype of System bus

2020-05-19 Thread Markus Armbruster
Mark Cave-Ayland  writes:

> On 18/05/2020 06:03, Markus Armbruster wrote:
>
>> The devices we plug into the macio-bus are all sysbus devices
>> (DeviceClass member bus_type is TYPE_SYSTEM_BUS), but macio-bus does
>> not derive from TYPE_SYSTEM_BUS.  Fix that.
>> 
>> "info qtree" now shows the devices' mmio ranges, as it should
>> 
>> Cc: Mark Cave-Ayland 
>> Cc: David Gibson 
>> Cc: qemu-...@nongnu.org
>> Signed-off-by: Markus Armbruster 
>> ---
>>  hw/misc/macio/macio.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>> 
>> diff --git a/hw/misc/macio/macio.c b/hw/misc/macio/macio.c
>> index ebc96cc8f6..53a9fd5696 100644
>> --- a/hw/misc/macio/macio.c
>> +++ b/hw/misc/macio/macio.c
>> @@ -492,7 +492,7 @@ static void macio_class_init(ObjectClass *klass, void 
>> *data)
>>  
>>  static const TypeInfo macio_bus_info = {
>>  .name = TYPE_MACIO_BUS,
>> -.parent = TYPE_BUS,
>> +.parent = TYPE_SYSTEM_BUS,
>>  .instance_size = sizeof(MacIOBusState),
>>  };
>
> Here I learned something new: a device that has a class TYPE_SYS_BUS_DEVICE 
> should be
> attached to a bus that derives from TYPE_SYSTEM_BUS. I have a feeling that 
> there are
> going to be quite a few instances of this around, particularly in places where
> existing sysbus devices have been borrowed from the PC world and reused.

Not that many.  I clean them up in this series, and "[PATCH 22/24] qdev:
Assert devices are plugged into a bus that can take them" should ensure
we stay clean.




Re: [PATCH 13/24] ppc4xx: Drop redundant device realization

2020-05-19 Thread Markus Armbruster
BALATON Zoltan  writes:

> On Mon, 18 May 2020, Markus Armbruster wrote:
>> object_property_set_bool(OBJECT(dev), true, "realized", ...) right
>> after qdev_init_nofail(dev) does nothing, because qdev_init_nofail()
>> already realizes.  Drop.
>>
>> Cc: BALATON Zoltan 
>
> Shouldn't this Cc line come after the --- so it's not included in the
> final commit? Thanks.

We routinely include it in git history.

> Reviewed-by: BALATON Zoltan 

Thanks!




Re: [PATCH 18/24] display/sm501 display/ati: Fix to realize "i2c-ddc"

2020-05-19 Thread Markus Armbruster
BALATON Zoltan  writes:

> On Mon, 18 May 2020, Markus Armbruster wrote:
>> sm501_init() and ati_vga_realize() create an "i2c-ddc" device, but
>> neglect to realize it.  Affects machines sam460ex, shix, r2d, and
>> fulong2e.
>>
>> I wonder how this ever worked.  If the "device becomes real only on
>> realize" thing actually works, then we've always been missing the
>> device, yet nobody noticed.
>
> No idea why it worked but guests can read EDID info fine with or
> without this patch, so
>
> Tested-by: BALATON Zoltan 

Thanks!

> Maybe device is created and working after init as it has nothing
> special to do at realize (it doesn't even have a realize method) so
> all realize would do is to link it in qtree?

Plausible.




Re: [PATCH 17/24] pnv/psi: Correct the pnv-psi* devices not to be sysbus devices

2020-05-19 Thread Markus Armbruster
Cédric Le Goater  writes:

> On 5/18/20 7:04 AM, Markus Armbruster wrote:
>> pnv_chip_power8_instance_init() creates a "pnv-psi-POWER8" sysbus
>> device in a way that leaves it unplugged.
>> pnv_chip_power9_instance_init() and pnv_chip_power10_instance_init()
>> do the same for "pnv-psi-POWER9" and "pnv-psi-POWER10", respectively.
>> 
>> These devices aren't actually sysbus devices.  Correct that.
>
> I might have done things wrong regarding sysbus in the PowerNV machine.
>
> For some devices (PHBs), I have added :
>
>   qdev_set_parent_bus(DEVICE(...), sysbus_get_default());

It's not wrong.

My next series will rework how devices get plugged into their buses.

> Should we do the same for the PSI device ?

No, because the PSI device is not a sysbus device.




Re: [PATCH 10/24] macio: Bury unwanted "macio-gpio" devices

2020-05-19 Thread Markus Armbruster
Mark Cave-Ayland  writes:

> On 18/05/2020 06:03, Markus Armbruster wrote:
>
>> These devices go with the "via-pmu" device, which is controlled by
>> property "has-pmu".  macio_newworld_init() creates it unconditionally,
>> because the property has not been set then.  macio_newworld_realize()
>> realizes it only when the property is true.  Works, although it can
>> leave an unrealized device hanging around in the QOM composition tree.
>> Affects machine mac99 with via=cuda (default).
>> 
>> Bury the unwanted device by making macio_newworld_realize() unparent
>> it.  Visible in "info qom-tree":
>> 
>>  /machine (mac99-machine)
>>[...]
>>/unattached (container)
>>  /device[9] (macio-newworld)
>>[...]
>>/escc-legacy-port[8] (qemu:memory-region)
>>/escc-legacy-port[9] (qemu:memory-region)
>>/escc-legacy[0] (qemu:memory-region)
>> -  /gpio (macio-gpio)
>> -/gpio[0] (qemu:memory-region)
>>/ide[0] (macio-ide)
>>  /ide.0 (IDE)
>>  /pmac-ide[0] (qemu:memory-region)
>> 
>> Cc: Mark Cave-Ayland 
>> Cc: David Gibson 
>> Cc: qemu-...@nongnu.org
>> Signed-off-by: Markus Armbruster 
>> ---
>>  hw/misc/macio/macio.c | 2 ++
>>  1 file changed, 2 insertions(+)
>> 
>> diff --git a/hw/misc/macio/macio.c b/hw/misc/macio/macio.c
>> index 3779865ab2..b3dddf8be7 100644
>> --- a/hw/misc/macio/macio.c
>> +++ b/hw/misc/macio/macio.c
>> @@ -368,6 +368,8 @@ static void macio_newworld_realize(PCIDevice *d, Error 
>> **errp)
>>  memory_region_add_subregion(&s->bar, 0x16000,
>>  sysbus_mmio_get_region(sysbus_dev, 0));
>>  } else {
>> +object_unparent(OBJECT(&s->gpio));
>> +
>>  /* CUDA */
>>  object_initialize_child(OBJECT(s), "cuda", &s->cuda, 
>> sizeof(s->cuda),
>>  TYPE_CUDA, &error_abort, NULL);
>
> This one is a little more interesting because it comes back to the previous
> discussions around if you have a device that contains other devices, should 
> you init
> all the children in your container device init, and the realize all your 
> children in
> your container device realize?

You have to initialize them in the container's instance_init method to
make their properties accessible.

You have to realize them in the container's realize method if
realization can fail, or if it has visible side effects.

Many, many places keep initialization and realization together.
Historical reasons, ignorance, laziness, all excusable.

Doing both in realize is safe (I think), but you'll have to refactor
when you need to expose the properties for configuration.  Cleaning that
up proactively feels unnecessary.

Doing both in instance_init necessitates a fragile, non-local
correctness argument around "can't fail" and "doesn't do anything
untoward".  Best avoided, I think.

> If so I guess this patch isn't technically wrong, but it is somewhat 
> misleading given
> that the existing init/realize pattern here is incorrect. Perhaps it should 
> go ahead
> and make everything work the "right way"?

The code being patched here works the nice way: instance_init method
macio_newworld_init() initializes ns->gpio, and realize method
macio_realize_ide() realizes it.  Let's keep it that way.




Re: [PATCH 16/24] ppc/pnv: Put "*-pnv-chip" and "pnv-xive" on the main system bus

2020-05-19 Thread Markus Armbruster
Cédric Le Goater  writes:

> On 5/18/20 7:04 AM, Markus Armbruster wrote:
>> pnv_init() creates "power10_v1.0-pnv-chip", "power8_v2.0-pnv-chip",
>> "power8e_v2.1-pnv-chip", "power8nvl_v1.0-pnv-chip", or
>> "power9_v2.0-pnv-chip" sysbus devices in a way that leaves them
>> unplugged.
>> 
>> pnv_chip_power9_instance_init() creates a "pnv-xive" sysbus device in
>> a way that leaves it unplugged.
>> 
>> Create them the common way that puts them into the main system bus.
>> Affects machines powernv8, powernv9, and powernv10.  Visible in "info
>> qtree".  Here's the change for powernv9:
>> 
>>  bus: main-system-bus
>>type System
>> +  dev: power9_v2.0-pnv-chip, id ""
>> +chip-id = 0 (0x0)
>> +ram-start = 0 (0x0)
>> +ram-size = 1879048192 (0x7000)
>> +nr-cores = 1 (0x1)
>> +cores-mask = 72057594037927935 (0xff)
>> +nr-threads = 1 (0x1)
>> +num-phbs = 6 (0x6)
>> +mmio 000603fc/0004
>> [...]
>> +  dev: pnv-xive, id ""
>> +ic-bar = 1692157036462080 (0x603020310)
>> +vc-bar = 1689949371891712 (0x60100)
>> +pc-bar = 1690499127705600 (0x60180)
>> +tm-bar = 1692157036986368 (0x603020318)
>> 
>> Cc: "Cédric Le Goater" 
>> Cc: David Gibson 
>> Cc: qemu-...@nongnu.org
>> Signed-off-by: Markus Armbruster 
>> ---
>>  hw/ppc/pnv.c | 6 +++---
>>  1 file changed, 3 insertions(+), 3 deletions(-)
>> 
>> diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
>> index da637822f9..8d4fc8109a 100644
>> --- a/hw/ppc/pnv.c
>> +++ b/hw/ppc/pnv.c
>> @@ -818,7 +818,7 @@ static void pnv_init(MachineState *machine)
>>  pnv->chips = g_new0(PnvChip *, pnv->num_chips);
>>  for (i = 0; i < pnv->num_chips; i++) {
>>  char chip_name[32];
>> -Object *chip = object_new(chip_typename);
>> +Object *chip = OBJECT(qdev_create(NULL, chip_typename));
>>  
>>  pnv->chips[i] = PNV_CHIP(chip);
>>  
>> @@ -1317,8 +1317,8 @@ static void pnv_chip_power9_instance_init(Object *obj)
>>  PnvChipClass *pcc = PNV_CHIP_GET_CLASS(obj);
>>  int i;
>>  
>> -object_initialize_child(obj, "xive", &chip9->xive, sizeof(chip9->xive),
>> -TYPE_PNV_XIVE, &error_abort, NULL);
>> +sysbus_init_child_obj(obj, "xive", &chip9->xive, sizeof(chip9->xive),
>> +  TYPE_PNV_XIVE);
>>  object_property_add_alias(obj, "xive-fabric", OBJECT(&chip9->xive),
>>"xive-fabric");
>
> OK. But why only XIVE and not all sub-devices of the PnvChip device ? 
>
> Shouldn't they be initialized in the same way, calling sysbus_init_child_obj 
> ? 
No, your code is just fine there.

sysbus_init_child_obj() is a convenience wrapper around
object_initialize_child() and qdev_set_parent_bus().  Only sysbus
devices may use it.  The other sub-devices are not sysbus devices:

* TYPE_PNV8_PSI, TYPE_PNV9_PSI, TYPE_PNV10_PSI

  Subtypes of TYPE_PNV_PSI, which is a subtype of TYPE_DEVICE.

* TYPE_PNV8_LPC, TYPE_PNV9_LPC, TYPE_PNV10_LPC

  Subtypes of TYPE_PNV_LPC, which is a subtype of TYPE_DEVICE.

* TYPE_PNV8_OCC, TYPE_PNV9_OCC

  Subtypes of TYPE_PNV_OCC, which is a subtype of TYPE_DEVICE.

* TYPE_PNV8_HOMER, TYPE_PNV9_HOMER

  Subtypes of TYPE_PNV_HOMER, which is a subtype of TYPE_DEVICE.

* TYPE_PNV_PHB4_PEC

  Subtype of TYPE_DEVICE.

* TYPE_PNV_QUAD

  Subtype of TYPE_DEVICE.

Except for:

* TYPE_PNV_PHB3

  Subtype of TYPE_PCIE_HOST_BRIDGE, which is a subtype of
  TYPE_PCI_HOST_BRIDGE, which is a subtype of TYPE_SYS_BUS_DEVICE.

where you use object_initialize_child() and qdev_set_parent_bus()
directly.  Works.  We could perhaps change it to use
sysbus_init_child_obj(), but it would be a waste; my next series will
kill that helper :)
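
For reference, sysbus_init_child_obj() boils down to roughly this
(simplified sketch, not the literal implementation):

    static void sysbus_init_child_obj_sketch(Object *parent, const char *name,
                                             void *childobj, size_t size,
                                             const char *type)
    {
        object_initialize_child(parent, name, childobj, size, type,
                                &error_abort, NULL);
        /* parent the new device onto the main system bus */
        qdev_set_parent_bus(DEVICE(childobj), sysbus_get_default());
    }

which is why it is only suitable for devices whose bus_type is
TYPE_SYSTEM_BUS.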




Re: [PATCH 18/24] display/sm501 display/ati: Fix to realize "i2c-ddc"

2020-05-19 Thread Markus Armbruster
Philippe Mathieu-Daudé  writes:

> On 5/18/20 12:39 PM, BALATON Zoltan wrote:
>> On Mon, 18 May 2020, Markus Armbruster wrote:
>>> sm501_init() and ati_vga_realize() create an "i2c-ddc" device, but
>>> neglect to realize it.  Affects machines sam460ex, shix, r2d, and
>>> fulong2e.
>>>
>>> I wonder how this ever worked.  If the "device becomes real only on
>>> realize" thing actually works, then we've always been missing the
>>> device, yet nobody noticed.
>>>
>>> Fix by realizing it right away.  Visible in "info qom-tree"; here's
>>> the change for sam460ex:
>>>
>>>     /machine (sam460ex-machine)
>>>   [...]
>>>   /unattached (container)
>>>     [...]
>>>    -    /device[14] (sii3112)
>>>    +    /device[14] (i2c-ddc)
>>>    +    /device[15] (sii3112)
>>>     [rest of device[*] renumbered...]
>>>
>>> Fixes: 4a1f253adb45ac6019971193d5077c4d5d55886a
>>> Fixes: 4a1f253adb45ac6019971193d5077c4d5d55886a
>>
>> One of these is probably meant to be
>> c82c7336de58876862e6b4dccbda29e9240fd388

Pasto, thanks!

> :)
>
>> although I'm not sure having a Fixes tag makes sense for this commit.
>
> AFAIK the 'Fixes' tag is not well defined in QEMU.

True.

> I personally find it handy to navigate between commits in gitk, not
> having to go via unrelated commits, which is why I use it.
> Linux kernel seems to have a stricter approach, only using it for
> security bug fixes. For this QEMU uses 'Cc: qemu-stable'.

We cc: qemu-stable for show-stoppers without security impact, too.

> Do we need to clarify its use on the wiki?

If we can build rough consensus on how we want it used, yes.




Re: [PATCH not-for-merge 2/5] qom: Make "info qom-tree" show children sorted

2020-05-19 Thread Markus Armbruster
Eric Blake  writes:

> On 5/18/20 12:19 AM, Markus Armbruster wrote:
>> "info qom-tree" prints children in unstable order.  This is a pain
>> when diffing output for different versions to find change.  Print it
>> sorted.
>
> Yes, this does seem reasonable to include even without the rest of the
> series.

Noted.

>> Signed-off-by: Markus Armbruster 
>> ---
>>   qom/qom-hmp-cmds.c | 40 +++-
>>   1 file changed, 39 insertions(+), 1 deletion(-)
>>
>> diff --git a/qom/qom-hmp-cmds.c b/qom/qom-hmp-cmds.c
>> index 4a61ee1b8c..cf0af8f6b5 100644
>> --- a/qom/qom-hmp-cmds.c
>> +++ b/qom/qom-hmp-cmds.c
>> @@ -78,6 +78,35 @@ static int print_qom_composition_child(Object *obj, void 
>> *opaque)
>>   return 0;
>>   }
>>   +static int qom_composition_compare(const void *a, const void *b,
>> void *ignore)
>> +{
>> +Object *obja = (void *)a, *objb = (void *)b;
>
> Casting away const...
>
>> +const char *namea, *nameb;
>> +
>> +if (obja == object_get_root()) {
>> +namea = g_strdup("");
>> +} else {
>> +namea = object_get_canonical_path_component(obja);
>
> ...should we instead improve object_get_canonical_path_component to
> work with 'const Object *'?

Go right ahead :)

I need to sit on my hands to have a chance getting my task queue back
under control.

>> +}
>> +
>> +if (objb == object_get_root()) {
>> +nameb = g_strdup("");
>> +} else {
>> +nameb = object_get_canonical_path_component(objb);
>> +}
>> +
>> +
>> +return strcmp(namea, nameb);
>
> Why the two blank lines?  This leaks namea and/or nameb if either
> object is the object root.  Should you instead use g_strcmp0 here,
> with namea/b set to NULL instead of g_strdup("") above?

My not-for-merge proves prudent ;)
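
For what it's worth, a comparator along the lines you suggest (NULL for
the root, g_strcmp0(), and freeing the path components) could look like
this; just a sketch, assuming object_get_canonical_path_component()
keeps returning an allocated string the caller must free:

    static int qom_composition_compare(const void *a, const void *b,
                                       void *ignore)
    {
        char *namea = a == object_get_root() ? NULL
            : object_get_canonical_path_component((Object *)a);
        char *nameb = b == object_get_root() ? NULL
            : object_get_canonical_path_component((Object *)b);
        int ret = g_strcmp0(namea, nameb);

        g_free(namea);
        g_free(nameb);
        return ret;
    }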

>> @@ -105,7 +134,16 @@ static void print_qom_composition(Monitor *mon, Object 
>> *obj, int indent)
>>   monitor_printf(mon, "%*s/%s (%s)\n", indent, "", name,
>>  object_get_typename(obj));
>>   g_free(name);
>> -object_child_foreach(obj, print_qom_composition_child, &s);
>> +
>> +GQueue children;
>> +Object *child;
>
> Mid-function declarations - I assume you'd clean this up if we want
> this for real?

Yes.  I prioritized diff over maintainability, because not-for-merge.

>> +g_queue_init(&children);
>> +object_child_foreach(obj, insert_qom_composition_child, &children);
>> +while ((child = g_queue_pop_head(&children))) {
>> +print_qom_composition(mon, child, indent + 2);
>> +}
>> +(void)s;
>> +(void)print_qom_composition_child;
>
> Also, this looks like leftover debugger aids?

Shut up the compiler so I don't have to remove code.  Shorter diff,
not-for-merge.

>>   }
>> void hmp_info_qom_tree(Monitor *mon, const QDict *dict)
>>

Thanks!




Re: [PATCH v2 02/11] ui/gtk: fix handling of AltGr key on Windows

2020-05-19 Thread Gerd Hoffmann
> +static void *gd_win32_get_hwnd(VirtualConsole *vc)
> +{
> +#ifdef G_OS_WIN32
> +return gdk_win32_window_get_impl_hwnd(
> +gtk_widget_get_window(vc->window ? vc->window : vc->s->window));

Can we move the gdk_win32_window_get_impl_hwnd() call to win32_kbd_set_window()?
That should remove the G_OS_WIN32 #ifdefs completely.

thanks,
  Gerd




Re: [PATCH Kernel v22 5/8] vfio iommu: Implementation of ioctl for dirty pages tracking

2020-05-19 Thread Kirti Wankhede




On 5/19/2020 3:23 AM, Alex Williamson wrote:

On Mon, 18 May 2020 11:26:34 +0530
Kirti Wankhede  wrote:


VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active
- Stop dirty pages tracking.
- Get dirty pages bitmap. It's the user space application's responsibility to
   copy content of dirty pages from source to destination during migration.

To prevent DoS attack, memory for bitmap is allocated per vfio_dma
structure. Bitmap size is calculated considering smallest supported page
size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled

Bitmap is populated for already pinned pages when bitmap is allocated for
a vfio_dma with the smallest supported page size. Update bitmap from
pinning functions when tracking is enabled. When user application queries
bitmap, check if requested page size is same as page size used to
populated bitmap. If it is equal, copy bitmap, but if not equal, return
error.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 

Fixed error reported by build bot by changing pgsize type from uint64_t
to size_t.
Reported-by: kbuild test robot 
---
  drivers/vfio/vfio_iommu_type1.c | 313 +++-
  1 file changed, 307 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index de17787ffece..bf740fef196f 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,6 +72,7 @@ struct vfio_iommu {
	uint64_t		pgsize_bitmap;
	bool			v2;
	bool			nesting;
+	bool			dirty_page_tracking;
  };
  
  struct vfio_domain {

@@ -92,6 +93,7 @@ struct vfio_dma {
	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
+   unsigned long   *bitmap;
  };
  
  struct vfio_group {

@@ -126,6 +128,19 @@ struct vfio_regions {
  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)   \
	(!list_empty(&iommu->domain_list))
  
+#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)

+
+/*
+ * Input argument of number of bits to bitmap_set() is unsigned integer, which
+ * further casts to signed integer for unaligned multi-bit operation,
+ * __bitmap_set().
+ * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
+ * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
+ * system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX  ((u64)INT_MAX)
+#define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
  static int put_pfn(unsigned long pfn, int prot);
  
  /*

@@ -176,6 +191,74 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
	rb_erase(&old->node, &iommu->dma_list);
  }
  
+

+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
+{
+   uint64_t npages = dma->size / pgsize;
+
+   if (npages > DIRTY_BITMAP_PAGES_MAX)
+   return -EINVAL;
+
+   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);


Curious that the extra 8-bytes are added in the next patch, but they're
just as necessary here.



Yes, moving it in this patch.
While resolving patches, I had to update 6/8 and 8/8 patches also. So 
updating 3 patches.



We also have the explanation above about why we have the signed int
size limitation, but we sort of ignore that when adding the bytes here.
That limitation is derived from __bitmap_set(), whereas we only need
these extra bits for bitmap_shift_left(), where I can't spot a signed
int limitation.  Do you come to the same conclusion?  


That's right.


Maybe worth a
comment why we think we can exceed DIRTY_BITMAP_PAGES_MAX for that
extra padding.



ok.


+   if (!dma->bitmap)
+   return -ENOMEM;
+
+   return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+   kfree(dma->bitmap);
+   dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
+{
+   struct rb_node *p;
+
+   for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
+   struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+   bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
+   }
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
+{
+   struct rb_node *n = rb_first(&iommu->dma_list);
+
+   for (; n; n = rb_next(n)) {


Nit, the previous function above sets the initial value in the for()
statement, it looks like it would fit in 80 columns here too.  We have
examples either way in the code, so not a must fix.


+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+   int ret;
+
+   ret = vfio_dma_bitmap_alloc(dma, pgsize);
+   if 

[PATCH Kernel v22 5/8] vfio iommu: Implementation of ioctl for dirty pages tracking

2020-05-19 Thread Kirti Wankhede
VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active
- Stop dirty pages tracking.
- Get dirty pages bitmap. It's the user space application's responsibility to
  copy content of dirty pages from source to destination during migration.

To prevent DoS attack, memory for bitmap is allocated per vfio_dma
structure. Bitmap size is calculated considering smallest supported page
size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled

Bitmap is populated for already pinned pages when bitmap is allocated for
a vfio_dma with the smallest supported page size. Update bitmap from
pinning functions when tracking is enabled. When user application queries
bitmap, check if requested page size is same as page size used to
populated bitmap. If it is equal, copy bitmap, but if not equal, return
error.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 

Fixed error reported by build bot by changing pgsize type from uint64_t
to size_t.
Reported-by: kbuild test robot 
---
 drivers/vfio/vfio_iommu_type1.c | 313 +++-
 1 file changed, 307 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index de17787ffece..0a420594483a 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,6 +72,7 @@ struct vfio_iommu {
	uint64_t		pgsize_bitmap;
	bool			v2;
	bool			nesting;
+	bool			dirty_page_tracking;
 };
 
 struct vfio_domain {
@@ -92,6 +93,7 @@ struct vfio_dma {
	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
+   unsigned long   *bitmap;
 };
 
 struct vfio_group {
@@ -126,6 +128,19 @@ struct vfio_regions {
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)\
	(!list_empty(&iommu->domain_list))
 
+#define DIRTY_BITMAP_BYTES(n)  (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
+
+/*
+ * Input argument of number of bits to bitmap_set() is unsigned integer, which
+ * further casts to signed integer for unaligned multi-bit operation,
+ * __bitmap_set().
+ * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
+ * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
+ * system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX  ((u64)INT_MAX)
+#define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
 static int put_pfn(unsigned long pfn, int prot);
 
 /*
@@ -176,6 +191,80 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
	rb_erase(&old->node, &iommu->dma_list);
 }
 
+
+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
+{
+   uint64_t npages = dma->size / pgsize;
+
+   if (npages > DIRTY_BITMAP_PAGES_MAX)
+   return -EINVAL;
+
+   /*
+* Allocate extra 64 bits that are used to calculate shift required for
+* bitmap_shift_left() to manipulate and club unaligned number of pages
+* in adjacent vfio_dma ranges.
+*/
+   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
+  GFP_KERNEL);
+   if (!dma->bitmap)
+   return -ENOMEM;
+
+   return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+   kfree(dma->bitmap);
+   dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
+{
+   struct rb_node *p;
+
+   for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
+   struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+   bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
+   }
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
+{
+   struct rb_node *n;
+
+   for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+   int ret;
+
+   ret = vfio_dma_bitmap_alloc(dma, pgsize);
+   if (ret) {
+   struct rb_node *p;
+
+   for (p = rb_prev(n); p; p = rb_prev(p)) {
+   struct vfio_dma *dma = rb_entry(n,
+   struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+   return ret;
+   }
+   vfio_dma_populate_bitmap(dma, pgsize);
+   }
+   return 0;
+}
+
+static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
+{
+   struct rb_node *n;
+
+   for (n = rb_first(>dma_list); n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+

Re: [PATCH not-for-merge 0/5] Instrumentation for "Fixes around device realization"

2020-05-19 Thread Markus Armbruster
Mark Cave-Ayland  writes:

> On 18/05/2020 06:19, Markus Armbruster wrote:
>
>> This is the instrumentation mentioned in "[PATCH 00/24] Fixes around
>> device realization".
>> 
>> PATCH 2/5 might have value on its own.  You tell me.
>> 
>> Shell script to smoke-test all machines:
>> 
>> #!/bin/sh
>> success=0
>> fail=0
>> ulimit -c 0
>> git-describe --dirty --match v\*
>> git-log --oneline -1
>> for i in bld/*-softmmu
>> do
>> t=${i%-softmmu}
>> t=${t##*/}
>> q=$i/qemu-system-$t
>> echo "= $t ="
>> 
>> for m in `$q -M help | sed -n '/(alias of/d;2,$s/ .*//p'`
>> do
>>  echo "== $m =="
>>  echo -e 'info qom-tree\ninfo qtree\nq' | $q -S -accel qtest -display 
>> none -L smoke-mon-roms -M $m -monitor stdio
>>  if [ $? -eq 0 ]
>>  then echo "*** Success: $m ***"; let success++
>>  else echo "*** Fail: $m"; let fail++
>>  fi
>> done
>> done
>> echo $success succeeded, $fail failed
>> 
>> 
>> Markus Armbruster (5):
>>   qom: Instrument to detect missed realize
>>   qom: Make "info qom-tree" show children sorted
>>   qdev: Make "info qtree" show child devices sorted by QOM path
>>   qdev: Instrument to detect missed QOM parenting
>>   qdev: Instrument to detect bus mismatch
>> 
>>  hw/core/qdev.c | 17 
>>  qdev-monitor.c | 32 -
>>  qom/qom-hmp-cmds.c | 51 +-
>>  3 files changed, 98 insertions(+), 2 deletions(-)
>
> Thanks for sharing these patches! I certainly think that they have value and 
> after a
> quick read through I'm thinking:
>
> - Patch 1 I assume is no longer needed once you previous series is merged

Correct, "[PATCH 24/24] qdev: Assert onboard devices all get realized
properly" supersedes.

> - Patches 2 & 3 would be really useful at the start of your previous series 
> (as
> someone who has gone crossed-eyed enough trying to spot these differences, 
> this is
> really helpful)

It's where they sat while I developed my fixes, so I don't go
cross-eyed, too :)

> - Patches 4 and 5 are good sanity checks for developers but I'm wondering 
> what is the
> extent of work that needs to be done? Could existing failures be whitelisted 
> with the
> aim of removal which would then at least prevent new devices being added that 
> aren't
> correct?

Since realization assigns a QOM parent to orphans, "[PATCH 24/24] qdev:
Assert onboard devices all get realized properly" supersedes PATCH 4,
too.  I needed both PATCH 1 and 4 to track down missing realizes,
because some were visible only in "info qtree" (PATCH 4), and some only
in "info qom-tree" (PATCH 1).

"[PATCH 22/24] qdev: Assert devices are plugged into a bus that can take
them" supersedes PATCH 5.

Thanks!




[PATCH Kernel v22 6/8] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-05-19 Thread Kirti Wankhede
DMA mapped pages, including those pinned by mdev vendor drivers, might
get unpinned and unmapped while migration is active and device is still
running. For example, in pre-copy phase while guest driver could access
those pages, host device or vendor driver can dirty these mapped pages.
Such pages should be marked dirty so as to maintain memory consistency
for a user making use of dirty page tracking.

To get bitmap during unmap, user should allocate memory for bitmap, set
it all zeros, set size of allocated memory, set page size to be
considered for bitmap and set flag VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP.
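
[For illustration, the user-space side of the sequence described above
looks roughly like the sketch below.  The struct layout is a simplified
stand-in assembled from the fields used by the code in this patch, not
the actual uapi definition; container_fd, iova, size and page_size are
assumed to come from earlier VFIO setup.]

    uint64_t npages = size / page_size;
    uint64_t bitmap_bytes = ((npages + 63) / 64) * 8;  /* one bit per page */
    uint64_t *bits = calloc(1, bitmap_bytes);          /* zero-filled */

    struct {
        uint32_t argsz, flags;
        uint64_t iova, size;            /* range being unmapped */
        uint64_t pgsize, bitmap_size;   /* bitmap description */
        uint64_t *data;                 /* user buffer to fill */
    } arg = {
        .argsz       = sizeof(arg),
        .flags       = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP,
        .iova        = iova,
        .size        = size,
        .pgsize      = page_size,       /* must be the IOMMU minimum page size */
        .bitmap_size = bitmap_bytes,
        .data        = bits,
    };

    if (ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &arg))
        perror("VFIO_IOMMU_UNMAP_DMA");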

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio_iommu_type1.c | 62 +
 include/uapi/linux/vfio.h   | 10 +++
 2 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 0a420594483a..963ae4348b3c 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1018,23 +1018,25 @@ static int verify_bitmap_size(uint64_t npages, uint64_t 
bitmap_size)
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
-struct vfio_iommu_type1_dma_unmap *unmap)
+struct vfio_iommu_type1_dma_unmap *unmap,
+struct vfio_bitmap *bitmap)
 {
-   uint64_t mask;
struct vfio_dma *dma, *dma_last = NULL;
-   size_t unmapped = 0;
+   size_t unmapped = 0, pgsize;
int ret = 0, retries = 0;
+   unsigned long pgshift;
 
	mutex_lock(&iommu->lock);
 
-   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
+   pgshift = __ffs(iommu->pgsize_bitmap);
+   pgsize = (size_t)1 << pgshift;
 
-   if (unmap->iova & mask) {
+   if (unmap->iova & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
 
-   if (!unmap->size || unmap->size & mask) {
+   if (!unmap->size || unmap->size & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
@@ -1045,9 +1047,15 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
goto unlock;
}
 
-   WARN_ON(mask & PAGE_MASK);
-again:
+   /* When dirty tracking is enabled, allow only min supported pgsize */
+   if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+   (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
+   WARN_ON((pgsize - 1) & PAGE_MASK);
+again:
/*
 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
 * avoid tracking individual mappings.  This means that the granularity
@@ -1085,6 +1093,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
ret = -EINVAL;
goto unlock;
}
+
dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
ret = -EINVAL;
@@ -1128,6 +1137,14 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
	mutex_lock(&iommu->lock);
goto again;
}
+
+   if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
+   ret = update_user_bitmap(bitmap->data, dma,
+unmap->iova, pgsize);
+   if (ret)
+   break;
+   }
+
unmapped += dma->size;
vfio_remove_dma(iommu, dma);
}
@@ -2466,17 +2483,40 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
struct vfio_iommu_type1_dma_unmap unmap;
-   long ret;
+   struct vfio_bitmap bitmap = { 0 };
+   int ret;
 
minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
 
	if (copy_from_user(&unmap, (void __user *)arg, minsz))
return -EFAULT;
 
-   if (unmap.argsz < minsz || unmap.flags)
+   if (unmap.argsz < minsz ||
+   unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
return -EINVAL;
 
-   ret = vfio_dma_do_unmap(iommu, &unmap);
+   if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
+   unsigned long pgshift;
+
+   if (unmap.argsz < (minsz + sizeof(bitmap)))
+   return -EINVAL;
+
+   if (copy_from_user(&bitmap,
+  (void __user *)(arg + minsz),
+  sizeof(bitmap)))
+   return -EFAULT;
+
+   if (!access_ok((void __user 

[PATCH Kernel v22 8/8] vfio: Selective dirty page tracking if IOMMU backed device pins pages

2020-05-19 Thread Kirti Wankhede
Added a check such that only singleton IOMMU groups can pin pages.
From the point when vendor driver pins any pages, consider IOMMU group
dirty page scope to be limited to pinned pages.

To optimize to avoid walking list often, added flag
pinned_page_dirty_scope to indicate if all of the vfio_groups for each
vfio_domain in the domain_list dirty page scope is limited to pinned
pages. This flag is updated on first pinned pages request for that IOMMU
group and on attaching/detaching group.
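
[Sketch of the update described above, for illustration only: the
function name is declared in the patch, but the diff is cut off before
its implementation, so the list and member names used below
(group_list, next) are assumptions.]

	static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu)
	{
		struct vfio_domain *domain;
		struct vfio_group *group;

		/* one group without pinned-page scope widens the whole container */
		list_for_each_entry(domain, &iommu->domain_list, next) {
			list_for_each_entry(group, &domain->group_list, next) {
				if (!group->pinned_page_dirty_scope) {
					iommu->pinned_page_dirty_scope = false;
					return;
				}
			}
		}
		iommu->pinned_page_dirty_scope = true;
	}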

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio.c |  13 +++--
 drivers/vfio/vfio_iommu_type1.c | 103 +---
 include/linux/vfio.h|   4 +-
 3 files changed, 109 insertions(+), 11 deletions(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 765e0e5d83ed..580099afeaff 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -85,6 +85,7 @@ struct vfio_group {
	atomic_t			opened;
	wait_queue_head_t		container_q;
	bool				noiommu;
+	unsigned int			dev_counter;
struct kvm  *kvm;
struct blocking_notifier_head   notifier;
 };
@@ -555,6 +556,7 @@ struct vfio_device *vfio_group_create_device(struct 
vfio_group *group,
 
	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
+	group->dev_counter++;
	mutex_unlock(&group->device_lock);
 
return device;
@@ -567,6 +569,7 @@ static void vfio_device_release(struct kref *kref)
struct vfio_group *group = device->group;
 
	list_del(&device->group_next);
+	group->dev_counter--;
	mutex_unlock(&group->device_lock);
 
dev_set_drvdata(device->dev, NULL);
@@ -1945,6 +1948,9 @@ int vfio_pin_pages(struct device *dev, unsigned long 
*user_pfn, int npage,
if (!group)
return -ENODEV;
 
+   if (group->dev_counter > 1)
+   return -EINVAL;
+
ret = vfio_group_add_container_user(group);
if (ret)
goto err_pin_pages;
@@ -1952,7 +1958,8 @@ int vfio_pin_pages(struct device *dev, unsigned long 
*user_pfn, int npage,
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
-   ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
+   ret = driver->ops->pin_pages(container->iommu_data,
+group->iommu_group, user_pfn,
 npage, prot, phys_pfn);
else
ret = -ENOTTY;
@@ -2050,8 +2057,8 @@ int vfio_group_pin_pages(struct vfio_group *group,
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
ret = driver->ops->pin_pages(container->iommu_data,
-user_iova_pfn, npage,
-prot, phys_pfn);
+group->iommu_group, user_iova_pfn,
+npage, prot, phys_pfn);
else
ret = -ENOTTY;
 
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index d74b76919cbb..f5b79a71e9f7 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -73,6 +73,7 @@ struct vfio_iommu {
	bool			v2;
	bool			nesting;
	bool			dirty_page_tracking;
+	bool			pinned_page_dirty_scope;
 };
 
 struct vfio_domain {
@@ -100,6 +101,7 @@ struct vfio_group {
struct iommu_group  *iommu_group;
struct list_headnext;
	bool			mdev_group;	/* An mdev group */
+	bool			pinned_page_dirty_scope;
 };
 
 struct vfio_iova {
@@ -143,6 +145,10 @@ struct vfio_regions {
 
 static int put_pfn(unsigned long pfn, int prot);
 
+static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
+  struct iommu_group *iommu_group);
+
+static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu);
 /*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
@@ -592,11 +598,13 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, 
dma_addr_t iova,
 }
 
 static int vfio_iommu_type1_pin_pages(void *iommu_data,
+ struct iommu_group *iommu_group,
  unsigned long *user_pfn,
  int npage, int prot,
  unsigned long *phys_pfn)
 {
struct vfio_iommu *iommu = iommu_data;
+   struct vfio_group *group;
int i, j, ret;
unsigned long remote_vaddr;
struct vfio_dma *dma;
@@ 

[PULL 4/8] tests/fuzz: Remove unuseful/unused typedefs

2020-05-19 Thread Stefan Hajnoczi
From: Philippe Mathieu-Daudé 

These typedefs are not used. Use a simple structure,
remove the typedefs.

Signed-off-by: Philippe Mathieu-Daudé 
Message-id: 20200514143433.18569-5-phi...@redhat.com
Signed-off-by: Stefan Hajnoczi 
---
 tests/qtest/fuzz/i440fx_fuzz.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/tests/qtest/fuzz/i440fx_fuzz.c b/tests/qtest/fuzz/i440fx_fuzz.c
index 96fed9ff12..c197b026db 100644
--- a/tests/qtest/fuzz/i440fx_fuzz.c
+++ b/tests/qtest/fuzz/i440fx_fuzz.c
@@ -45,12 +45,11 @@ static void i440fx_fuzz_qtest(QTestState *s,
  * loop over the Data, breaking it up into actions. each action has an
  * opcode, address offset and value
  */
-typedef struct QTestFuzzAction {
+struct {
 uint8_t opcode;
 uint8_t addr;
 uint32_t value;
-} QTestFuzzAction;
-QTestFuzzAction a;
+} a;
 
 while (Size >= sizeof(a)) {
 /* make a copy of the action so we can normalize the values in-place */
@@ -91,19 +90,18 @@ static void i440fx_fuzz_qos(QTestState *s,
  * Same as i440fx_fuzz_qtest, but using QOS. devfn is incorporated into the
  * value written over Port IO
  */
-typedef struct QOSFuzzAction {
+struct {
 uint8_t opcode;
 uint8_t offset;
 int devfn;
 uint32_t value;
-} QOSFuzzAction;
+} a;
 
 static QPCIBus *bus;
 if (!bus) {
 bus = qpci_new_pc(s, fuzz_qos_alloc);
 }
 
-QOSFuzzAction a;
 while (Size >= sizeof(a)) {
 memcpy(&a, Data, sizeof(a));
 switch (a.opcode % ACTION_MAX) {
-- 
2.25.3



[PULL 6/8] tests/fuzz: Extract ioport_fuzz_qtest() method

2020-05-19 Thread Stefan Hajnoczi
From: Philippe Mathieu-Daudé 

Extract generic ioport_fuzz_qtest() method from
i440fx_fuzz_qtest(). This will help to write tests
not specific to the i440FX controller.

Signed-off-by: Philippe Mathieu-Daudé 
Message-id: 20200514143433.18569-7-phi...@redhat.com
Signed-off-by: Stefan Hajnoczi 
---
 tests/qtest/fuzz/i440fx_fuzz.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/tests/qtest/fuzz/i440fx_fuzz.c b/tests/qtest/fuzz/i440fx_fuzz.c
index 558fa17c93..bcd6769b4c 100644
--- a/tests/qtest/fuzz/i440fx_fuzz.c
+++ b/tests/qtest/fuzz/i440fx_fuzz.c
@@ -39,7 +39,7 @@ enum action_id {
 ACTION_MAX
 };
 
-static void i440fx_fuzz_qtest(QTestState *s,
+static void ioport_fuzz_qtest(QTestState *s,
 const unsigned char *Data, size_t Size) {
 /*
  * loop over the Data, breaking it up into actions. each action has an
@@ -84,10 +84,17 @@ static void i440fx_fuzz_qtest(QTestState *s,
 flush_events(s);
 }
 
+static void i440fx_fuzz_qtest(QTestState *s,
+  const unsigned char *Data,
+  size_t Size)
+{
+ioport_fuzz_qtest(s, Data, Size);
+}
+
 static void pciconfig_fuzz_qos(QTestState *s, QPCIBus *bus,
 const unsigned char *Data, size_t Size) {
 /*
- * Same as i440fx_fuzz_qtest, but using QOS. devfn is incorporated into the
+ * Same as ioport_fuzz_qtest, but using QOS. devfn is incorporated into the
  * value written over Port IO
  */
 struct {
-- 
2.25.3



Re: [PATCH] linux-user/mmap.c: fix integer underflow in target_mremap

2020-05-19 Thread Stefano Garzarella
Hi Jonathan,
thanks for the patch!

CCing Riku and Laurent.

On Mon, May 18, 2020 at 12:13:41PM -0600, Jonathan Marler wrote:
> Been a few more days.  Not sure how often I should be pinging.  If this is
> too much to ping every few days let me know.

It's not too much, but next time it's better to CC the maintainers.
You can use 'scripts/get_maintainer.pl' to get the list of maintainers
and reviewers.

Please take a look at https://wiki.qemu.org/Contribute/SubmitAPatch

> 
> On Fri, May 15, 2020 at 7:36 AM Jonathan Marler 
> wrote:
> 
> > Been a couple weeks, checking to see if anyone has looked at this.
> >
> > On Sat, May 2, 2020 at 5:43 PM Jonathan Marler 
> > wrote:
> >
> >> FYI, I applied this patch to the qemu build that zig uses to run
> >> non-native tests (
> >> https://github.com/ziglang/qemu-static/blob/master/patch/mremap-underflow.diff
> >> )
> >>
> >> After applying it, my new code that calls mremap now passes,
> >> whereas before the fix I was getting a segfault.
> >>
> >> On Sat, May 2, 2020 at 10:12 AM Jonathan Marler 
> >> wrote:
> >>
> >>> Fixes: https://bugs.launchpad.net/bugs/1876373

should be "Buglink: https://bugs.launchpad.net/bugs/1876373;

> >>>
> >>> This code path in mmap occurs when a page size is decreased with
> >>> mremap.  When a section of pages is shrunk, qemu calls mmap_reserve on the
> >>> pages that were released.  However, it has the diff operation reversed,
> >>> subtracting the larger old_size from the smaller new_size.  Instead, it
> >>> should be subtracting the smaller new_size from the larger old_size.  You
> >>> can also see in the previous line of the change that this mmap_reserve 
> >>> call
> >>> only occurs when old_size > new_size.

Please break the lines of the commit message (max 76 characters per line):
https://wiki.qemu.org/Contribute/SubmitAPatch#Write_a_meaningful_commit_message

Thanks,
Stefano

> >>>
> >>> Signed-off-by: Jonathan Marler 
> >>> ---
> >>>  linux-user/mmap.c | 2 +-
> >>>  1 file changed, 1 insertion(+), 1 deletion(-)
> >>>
> >>> diff --git a/linux-user/mmap.c b/linux-user/mmap.c
> >>> index e378033797..caab62909e 100644
> >>> --- a/linux-user/mmap.c
> >>> +++ b/linux-user/mmap.c
> >>> @@ -708,7 +708,7 @@ abi_long target_mremap(abi_ulong old_addr, abi_ulong
> >>> old_size,
> >>>  if (prot == 0) {
> >>>  host_addr = mremap(g2h(old_addr), old_size, new_size,
> >>> flags);
> >>>  if (host_addr != MAP_FAILED && reserved_va && old_size >
> >>> new_size) {
> >>> -mmap_reserve(old_addr + old_size, new_size - old_size);
> >>> +mmap_reserve(old_addr + old_size, old_size - new_size);
> >>>  }
> >>>  } else {
> >>>  errno = ENOMEM;
> >>> --
> >>> 2.23.1
> >>>
> >>>
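
[For context, a stand-alone illustration of the unsigned underflow this
patch fixes; the values are arbitrary and this is not QEMU code.]

    #include <stdio.h>

    int main(void)
    {
        unsigned long old_size = 0x4000;   /* mapping shrunk from 16 KiB... */
        unsigned long new_size = 0x2000;   /* ...down to 8 KiB */

        /* old code: wraps around to a huge length instead of 8 KiB */
        unsigned long wrong = new_size - old_size;
        /* fixed code: the size of the tail that was released */
        unsigned long right = old_size - new_size;

        printf("wrong = %#lx, right = %#lx\n", wrong, right);
        return 0;
    }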




Re: [PATCH 02/10] softfloat: Replace flag with bool

2020-05-19 Thread Alex Bennée


Richard Henderson  writes:

> We have had this on the to-do list for quite some time.
>
> Signed-off-by: Richard Henderson 

Reviewed-by: Alex Bennée 

-- 
Alex Bennée



Re: [PATCH v2 5/5] vhost: add device started check in migration set log

2020-05-19 Thread Dima Stepanov
On Mon, May 18, 2020 at 10:53:59AM +0100, Dr. David Alan Gilbert wrote:
> * Dima Stepanov (dimas...@yandex-team.ru) wrote:
> > On Mon, May 18, 2020 at 10:50:39AM +0800, Jason Wang wrote:
> > > 
> > > On 2020/5/16 上午12:54, Dima Stepanov wrote:
> > > >On Thu, May 14, 2020 at 03:34:24PM +0800, Jason Wang wrote:
> > > >>On 2020/5/13 下午5:47, Dima Stepanov wrote:
> > > > case CHR_EVENT_CLOSED:
> > > > /* a close event may happen during a read/write, but vhost
> > > >  * code assumes the vhost_dev remains setup, so delay the
> > > >  * stop & clear to idle.
> > > >  * FIXME: better handle failure in vhost code, remove bh
> > > >  */
> > > > if (s->watch) {
> > > > AioContext *ctx = qemu_get_current_aio_context();
> > > >
> > > > g_source_remove(s->watch);
> > > > s->watch = 0;
> > > > qemu_chr_fe_set_handlers(&s->chr, NULL, NULL, NULL, 
> > > > NULL,
> > > >  NULL, NULL, false);
> > > >
> > > > aio_bh_schedule_oneshot(ctx, chr_closed_bh, opaque);
> > > > }
> > > > break;
> > > >
> > > >I think it's time we dropped the FIXME and moved the handling to 
> > > >common
> > > >code. Jason? Marc-André?
> > > I agree. Just to confirm, do you prefer bh or doing changes like what 
> > > is
> > > done in this series? It looks to me bh can have more easier codes.
> > > >>>Could it be a good idea just to make disconnect in the char device but
> > > >>>postpone clean up in the vhost-user-blk (or any other vhost-user
> > > >>>device) itself? So we are moving the postphone logic and decision from
> > > >>>the char device to vhost-user device. One of the idea i have is as
> > > >>>follows:
> > > >>>   - Put ourself in the INITIALIZATION state
> > > >>>   - Start these vhost-user "handshake" commands
> > > >>>   - If we got a disconnect error, perform disconnect, but don't clean 
> > > >>> up
> > > >>> device (it will be cleaned up on the roll back). It can be done by
> > > >>> checking the state in vhost_user_..._disconnect routine or smth 
> > > >>> like it
> > > >>
> > > >>Any issue you saw just using the aio bh as Michael posted above.
> > > >>
> > > >>Then we don't need to deal with the silent vhost_dev_stop() and we will 
> > > >>have
> > > >>codes that is much more easier to understand.
> > > >I've implemented this solution inside
> > > >hw/block/vhost-user-blk.c:vhost_user_blk_event() in the similar way by
> > > >using the s->connected field. Looks good and more correct fix ). I have
> > > >two questions here before i'll rework the fixes:
> > > >1. Is it okay to make the similar fix inside vhost_user_blk_event() or
> > > >we are looking for more generic vhost-user solution? What do you think?
> > > 
> > > 
> > > I think I agree with Michael, it's better to have a generic vhost-user
> > > solution. But if it turns out to be not easy, we can start from fixing
> > > vhost-user-blk.
> > I also agree, but as i see it right now the connect/disconnect events
> > are handled inside each vhost-user device implementation file. So it will
> > need some global refactoring. So i suggest having this fix first and
> > after it refactoring the code:
> >  - more devices will be involved
> >  - i see there is some difference in device handling
> 
> I'm following bits of this discussion, some thoughts;
> if your device doesn't support reconnect, then if, at the start of
> migration you find that you can't start the log what is the correct
> behaviour?
I'm not sure here, but it looks like that in this case the device state
will be:
  disconnect -> stopped (will not be changed during migration, because
  reconnect isn't supported)
And because of it the device state will not be changed during migration,
so there is no need for log and migration could be completed
successfully.
So as i see it (i could be wrong here) that:
 - it is okay: if device is not started and we will not change this
   state during migration + log start is failed
 - it is not okay: if device is started + log start is failed (because
   we can't handle the dirty pages and so on during migration)

> You can't carry on with the migration because you'd have an
> inconsistent migration state; so I guess that's why the abort() is there
> - but I think I'd generally prefer to fail the migration and hope the
> vhsot device is still working for anything other than the log.
> 
> You're going to have to be pretty careful with the ordering of reconect
> - reconnecting on the source during a migration sounds pretty hairy, but
> a migration can take many minutes, so if you really want to survive this
> I guess you have to.
Maybe if we get a disconnect during migration then we could postpone or
just skip the reconnect until the end of migration on the source
side. This will leave the device in the stopped state.


[PATCH] arm/aspeed: Compute the number of CPUs from the SoC definition

2020-05-19 Thread Cédric Le Goater
Commit ece09beec457 ("aspeed: introduce a configurable number of CPU
per machine") was a convient change during bringup but the Aspeed SoCs
have a fixed number of CPUs : one for the AST2400 and AST2500, and two
for the AST2600.

Remove the "num-cpu" property from the SoC state and use the fixed
number of CPUs defined in the SoC class instead. Compute the default,
min, max number of CPUs of the machine directly from the SoC class
definition.

Signed-off-by: Cédric Le Goater 
---
 include/hw/arm/aspeed_soc.h |  1 -
 hw/arm/aspeed.c | 29 -
 hw/arm/aspeed_ast2600.c | 20 +++-
 hw/arm/aspeed_soc.c |  9 +
 4 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/include/hw/arm/aspeed_soc.h b/include/hw/arm/aspeed_soc.h
index 78b9f6ae532f..914115f3ef77 100644
--- a/include/hw/arm/aspeed_soc.h
+++ b/include/hw/arm/aspeed_soc.h
@@ -40,7 +40,6 @@ typedef struct AspeedSoCState {
 
 /*< public >*/
 ARMCPU cpu[ASPEED_CPUS_NUM];
-uint32_t num_cpus;
 A15MPPrivState a7mpcore;
 MemoryRegion *dram_mr;
 MemoryRegion sram;
diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
index 4d57d1e4363b..6f8f4b88f8ab 100644
--- a/hw/arm/aspeed.c
+++ b/hw/arm/aspeed.c
@@ -283,8 +283,6 @@ static void aspeed_machine_init(MachineState *machine)
 &error_abort);
 object_property_set_int(OBJECT(&bmc->soc), amc->num_cs, "num-cs",
 &error_abort);
-object_property_set_int(OBJECT(&bmc->soc), machine->smp.cpus, "num-cpus",
-&error_abort);
+object_property_set_link(OBJECT(&bmc->soc), OBJECT(&bmc->ram_container),
  "dram", &error_abort);
 if (machine->kernel_filename) {
@@ -337,7 +335,7 @@ static void aspeed_machine_init(MachineState *machine)
 }
 }
 
-if (machine->kernel_filename && bmc->soc.num_cpus > 1) {
+if (machine->kernel_filename && sc->num_cpus > 1) {
 /* With no u-boot we must set up a boot stub for the secondary CPU */
 MemoryRegion *smpboot = g_new(MemoryRegion, 1);
 memory_region_init_ram(smpboot, OBJECT(bmc), "aspeed.smpboot",
@@ -352,7 +350,7 @@ static void aspeed_machine_init(MachineState *machine)
 
 aspeed_board_binfo.ram_size = ram_size;
 aspeed_board_binfo.loader_start = sc->memmap[ASPEED_SDRAM];
-aspeed_board_binfo.nb_cpus = bmc->soc.num_cpus;
+aspeed_board_binfo.nb_cpus = sc->num_cpus;
 
 if (amc->i2c_init) {
 amc->i2c_init(bmc);
@@ -549,12 +547,17 @@ static void aspeed_machine_class_props_init(ObjectClass 
*oc)
"boot directly from CE0 flash device", 
_abort);
 }
 
+static int aspeed_soc_num_cpus(const char *soc_name)
+{
+   AspeedSoCClass *sc = ASPEED_SOC_CLASS(object_class_by_name(soc_name));
+   return sc->num_cpus;
+}
+
 static void aspeed_machine_class_init(ObjectClass *oc, void *data)
 {
 MachineClass *mc = MACHINE_CLASS(oc);
 
 mc->init = aspeed_machine_init;
-mc->max_cpus = ASPEED_CPUS_NUM;
 mc->no_floppy = 1;
 mc->no_cdrom = 1;
 mc->no_parallel = 1;
@@ -576,6 +579,8 @@ static void aspeed_machine_palmetto_class_init(ObjectClass 
*oc, void *data)
 amc->num_cs= 1;
 amc->i2c_init  = palmetto_bmc_i2c_init;
 mc->default_ram_size   = 256 * MiB;
+mc->default_cpus = mc->min_cpus = mc->max_cpus =
+aspeed_soc_num_cpus(amc->soc_name);
 };
 
 static void aspeed_machine_ast2500_evb_class_init(ObjectClass *oc, void *data)
@@ -591,6 +596,8 @@ static void 
aspeed_machine_ast2500_evb_class_init(ObjectClass *oc, void *data)
 amc->num_cs= 1;
 amc->i2c_init  = ast2500_evb_i2c_init;
 mc->default_ram_size   = 512 * MiB;
+mc->default_cpus = mc->min_cpus = mc->max_cpus =
+aspeed_soc_num_cpus(amc->soc_name);
 };
 
 static void aspeed_machine_romulus_class_init(ObjectClass *oc, void *data)
@@ -606,6 +613,8 @@ static void aspeed_machine_romulus_class_init(ObjectClass 
*oc, void *data)
 amc->num_cs= 2;
 amc->i2c_init  = romulus_bmc_i2c_init;
 mc->default_ram_size   = 512 * MiB;
+mc->default_cpus = mc->min_cpus = mc->max_cpus =
+aspeed_soc_num_cpus(amc->soc_name);
 };
 
 static void aspeed_machine_sonorapass_class_init(ObjectClass *oc, void *data)
@@ -621,6 +630,8 @@ static void 
aspeed_machine_sonorapass_class_init(ObjectClass *oc, void *data)
 amc->num_cs= 2;
 amc->i2c_init  = sonorapass_bmc_i2c_init;
 mc->default_ram_size   = 512 * MiB;
+mc->default_cpus = mc->min_cpus = mc->max_cpus =
+aspeed_soc_num_cpus(amc->soc_name);
 };
 
 static void aspeed_machine_swift_class_init(ObjectClass *oc, void *data)
@@ -636,6 +647,8 @@ static void aspeed_machine_swift_class_init(ObjectClass 
*oc, void *data)
 amc->num_cs= 2;
 amc->i2c_init  = swift_bmc_i2c_init;
 mc->default_ram_size   = 512 * MiB;
+mc->default_cpus = mc->min_cpus = mc->max_cpus =
+aspeed_soc_num_cpus(amc->soc_name);
 };
 
 

Re: [PATCH 06/10] softfloat: Inline float32 compare specializations

2020-05-19 Thread Alex Bennée


Richard Henderson  writes:

> Replace the float32 compare specializations with inline functions
> that call the standard float32_compare{,_quiet} functions.
> Use bool as the return type.
>
> Signed-off-by: Richard Henderson 
> ---
>  include/fpu/softfloat.h |  49 +++--
>  fpu/softfloat.c | 216 
>  2 files changed, 41 insertions(+), 224 deletions(-)

\o/

Reviewed-by: Alex Bennée 



-- 
Alex Bennée



[PULL 0/8] Block patches

2020-05-19 Thread Stefan Hajnoczi
The following changes since commit 013a18edbbc59cdad019100c7d03c0494642b74c:

  Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20200514' into staging (2020-05-14 16:17:55 +0100)

are available in the Git repository at:

  https://github.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to ba607ca8bff4d2c2062902f8355657c865ac7c29:

  aio-posix: disable fdmon-io_uring when GSource is used (2020-05-18 18:16:00 +0100)


Pull request



Philippe Mathieu-Daudé (6):
  tests/fuzz/Makefile: Do not link code using unavailable devices
  Makefile: List fuzz targets in 'make help'
  tests/fuzz: Add missing space in test description
  tests/fuzz: Remove unuseful/unused typedefs
  tests/fuzz: Extract pciconfig_fuzz_qos() method
  tests/fuzz: Extract ioport_fuzz_qtest() method

Stefan Hajnoczi (2):
  aio-posix: don't duplicate fd handler deletion in
fdmon_io_uring_destroy()
  aio-posix: disable fdmon-io_uring when GSource is used

 Makefile  |  6 +++-
 tests/qtest/fuzz/Makefile.include |  6 ++--
 include/block/aio.h   |  3 ++
 tests/qtest/fuzz/i440fx_fuzz.c| 47 ---
 util/aio-posix.c  | 13 +
 util/aio-win32.c  |  4 +++
 util/async.c  |  1 +
 util/fdmon-io_uring.c | 13 +++--
 8 files changed, 69 insertions(+), 24 deletions(-)

-- 
2.25.3



[PULL 5/8] tests/fuzz: Extract pciconfig_fuzz_qos() method

2020-05-19 Thread Stefan Hajnoczi
From: Philippe Mathieu-Daudé 

Extract the generic pciconfig_fuzz_qos() method from
i440fx_fuzz_qos(). This will help to write tests not
specific to the i440FX controller.

Signed-off-by: Philippe Mathieu-Daudé 
Message-id: 20200514143433.18569-6-phi...@redhat.com
Signed-off-by: Stefan Hajnoczi 
---
 tests/qtest/fuzz/i440fx_fuzz.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/tests/qtest/fuzz/i440fx_fuzz.c b/tests/qtest/fuzz/i440fx_fuzz.c
index c197b026db..558fa17c93 100644
--- a/tests/qtest/fuzz/i440fx_fuzz.c
+++ b/tests/qtest/fuzz/i440fx_fuzz.c
@@ -84,7 +84,7 @@ static void i440fx_fuzz_qtest(QTestState *s,
 flush_events(s);
 }
 
-static void i440fx_fuzz_qos(QTestState *s,
+static void pciconfig_fuzz_qos(QTestState *s, QPCIBus *bus,
 const unsigned char *Data, size_t Size) {
 /*
  * Same as i440fx_fuzz_qtest, but using QOS. devfn is incorporated into the
@@ -97,11 +97,6 @@ static void i440fx_fuzz_qos(QTestState *s,
 uint32_t value;
 } a;
 
-static QPCIBus *bus;
-if (!bus) {
-bus = qpci_new_pc(s, fuzz_qos_alloc);
-}
-
 while (Size >= sizeof(a)) {
memcpy(&a, Data, sizeof(a));
 switch (a.opcode % ACTION_MAX) {
@@ -130,6 +125,19 @@ static void i440fx_fuzz_qos(QTestState *s,
 flush_events(s);
 }
 
+static void i440fx_fuzz_qos(QTestState *s,
+const unsigned char *Data,
+size_t Size)
+{
+static QPCIBus *bus;
+
+if (!bus) {
+bus = qpci_new_pc(s, fuzz_qos_alloc);
+}
+
+pciconfig_fuzz_qos(s, bus, Data, Size);
+}
+
 static void i440fx_fuzz_qos_fork(QTestState *s,
 const unsigned char *Data, size_t Size) {
 if (fork() == 0) {
-- 
2.25.3



Re: [PATCH RFC v2 1/5] block: add bitmap-populate job

2020-05-19 Thread Peter Krempa
On Mon, May 18, 2020 at 15:49:02 -0500, Eric Blake wrote:
> On 5/13/20 10:49 PM, John Snow wrote:

[...]

> > +
> > +/* NB: new bitmap is anonymous and enabled */
> > +cluster_size = bdrv_dirty_bitmap_granularity(target_bitmap);
> > +new_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp);
> > +if (!new_bitmap) {
> > +return NULL;
> > +}
> 
> This means if the guest writes to the disk while the job is ongoing, the
> bitmap will be updated to mark that portion of the bitmap as set, even if it
> was not allocated at the time the job started.  But then again, the guest
> writes are causing allocation, so this seems like the right thing to do.

Well, this could be made the caller's problem by not writing any newly
allocated sectors to the bitmap. The caller can then decide whether a
snapshot of the allocation map is needed, and thus a new inactive bitmap
should be used as the destination, or whether new writes should be
tracked by using an active bitmap.

> Do we need to worry about the converse case where the job started with
> something allocated but runs in parallel with the guest trimming, such that
> our bitmap marks something as set even though at the conclusion of our job
> it is no longer allocated?

Given the semantics above this would conveniently not be a problem of
the population job. If you create a snapshot of the allocation map at
any point, that is the state we'd care about.

Anyways, from the point of view of the bitmap code any write to a sector
sets the bit so the trimming should not be treated differently.

Specifically, libvirt plans to use it on overlay (snapshot) images where
the bitmaps are not present, so in that case even trimmed sectors need to
mask the data in the backing image and can technically be considered as
allocated too.




Re: [PATCH 01/10] softfloat: Use post test for floatN_mul

2020-05-19 Thread Alex Bennée


Richard Henderson  writes:

> The existing f{32,64}_addsub_post test, which checks for zero
> inputs, is identical to f{32,64}_mul_fast_test.  Which means
> we can eliminate the fast_test/fast_op hooks in favor of
> reusing the same post hook.
>
> This means we have one fewer test along the fast path for multiply.

I was worried that we were missing an opportunity for the early fast
path but fp-bench disagrees.

Before:
  ./fp-bench -o mul   
  139.01 MFlops

After:
  ./fp-bench -o mul
  155.28 MFlops

So:

Tested-by: Alex Bennée 
Reviewed-by: Alex Bennée 

>
> Signed-off-by: Richard Henderson 
> ---
>  fpu/softfloat.c | 65 +++--
>  1 file changed, 14 insertions(+), 51 deletions(-)
>
> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> index a362bf89ca..5fb4ef75bb 100644
> --- a/fpu/softfloat.c
> +++ b/fpu/softfloat.c
> @@ -339,12 +339,10 @@ static inline bool f64_is_inf(union_float64 a)
>  return float64_is_infinity(a.s);
>  }
>  
> -/* Note: @fast_test and @post can be NULL */
>  static inline float32
>  float32_gen2(float32 xa, float32 xb, float_status *s,
>   hard_f32_op2_fn hard, soft_f32_op2_fn soft,
> - f32_check_fn pre, f32_check_fn post,
> - f32_check_fn fast_test, soft_f32_op2_fn fast_op)
> + f32_check_fn pre, f32_check_fn post)
>  {
>  union_float32 ua, ub, ur;
>  
> @@ -359,17 +357,12 @@ float32_gen2(float32 xa, float32 xb, float_status *s,
>  if (unlikely(!pre(ua, ub))) {
>  goto soft;
>  }
> -if (fast_test && fast_test(ua, ub)) {
> -return fast_op(ua.s, ub.s, s);
> -}
>  
>  ur.h = hard(ua.h, ub.h);
>  if (unlikely(f32_is_inf(ur))) {
>  s->float_exception_flags |= float_flag_overflow;
> -} else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
> -if (post == NULL || post(ua, ub)) {
> -goto soft;
> -}
> +} else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
> +goto soft;
>  }
>  return ur.s;
>  
> @@ -380,8 +373,7 @@ float32_gen2(float32 xa, float32 xb, float_status *s,
>  static inline float64
>  float64_gen2(float64 xa, float64 xb, float_status *s,
>   hard_f64_op2_fn hard, soft_f64_op2_fn soft,
> - f64_check_fn pre, f64_check_fn post,
> - f64_check_fn fast_test, soft_f64_op2_fn fast_op)
> + f64_check_fn pre, f64_check_fn post)
>  {
>  union_float64 ua, ub, ur;
>  
> @@ -396,17 +388,12 @@ float64_gen2(float64 xa, float64 xb, float_status *s,
>  if (unlikely(!pre(ua, ub))) {
>  goto soft;
>  }
> -if (fast_test && fast_test(ua, ub)) {
> -return fast_op(ua.s, ub.s, s);
> -}
>  
>  ur.h = hard(ua.h, ub.h);
>  if (unlikely(f64_is_inf(ur))) {
>  s->float_exception_flags |= float_flag_overflow;
> -} else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
> -if (post == NULL || post(ua, ub)) {
> -goto soft;
> -}
> +} else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
> +goto soft;
>  }
>  return ur.s;
>  
> @@ -1115,7 +1102,7 @@ static double hard_f64_sub(double a, double b)
>  return a - b;
>  }
>  
> -static bool f32_addsub_post(union_float32 a, union_float32 b)
> +static bool f32_addsubmul_post(union_float32 a, union_float32 b)
>  {
>  if (QEMU_HARDFLOAT_2F32_USE_FP) {
>  return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
> @@ -1123,7 +1110,7 @@ static bool f32_addsub_post(union_float32 a, 
> union_float32 b)
>  return !(float32_is_zero(a.s) && float32_is_zero(b.s));
>  }
>  
> -static bool f64_addsub_post(union_float64 a, union_float64 b)
> +static bool f64_addsubmul_post(union_float64 a, union_float64 b)
>  {
>  if (QEMU_HARDFLOAT_2F64_USE_FP) {
>  return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
> @@ -1136,14 +1123,14 @@ static float32 float32_addsub(float32 a, float32 b, 
> float_status *s,
>hard_f32_op2_fn hard, soft_f32_op2_fn soft)
>  {
>  return float32_gen2(a, b, s, hard, soft,
> -f32_is_zon2, f32_addsub_post, NULL, NULL);
> +f32_is_zon2, f32_addsubmul_post);
>  }
>  
>  static float64 float64_addsub(float64 a, float64 b, float_status *s,
>hard_f64_op2_fn hard, soft_f64_op2_fn soft)
>  {
>  return float64_gen2(a, b, s, hard, soft,
> -f64_is_zon2, f64_addsub_post, NULL, NULL);
> +f64_is_zon2, f64_addsubmul_post);
>  }
>  
>  float32 QEMU_FLATTEN
> @@ -1258,42 +1245,18 @@ static double hard_f64_mul(double a, double b)
>  return a * b;
>  }
>  
> -static bool f32_mul_fast_test(union_float32 a, union_float32 b)
> -{
> -return float32_is_zero(a.s) || float32_is_zero(b.s);
> -}
> -
> -static bool f64_mul_fast_test(union_float64 a, union_float64 b)
> -{
> -

Re: [PATCH 03/10] softfloat: Change tininess_before_rounding to bool

2020-05-19 Thread Alex Bennée


Richard Henderson  writes:

> Slightly tidies the usage within softfloat.c and the
> representation in float_status.
>
> Signed-off-by: Richard Henderson 

Reviewed-by: Alex Bennée 

-- 
Alex Bennée



Re: [PATCH v4 9/9] iotests: rename and move 169 and 199 tests

2020-05-19 Thread Kevin Wolf
Am 18.05.2020 um 18:12 hat Thomas Huth geschrieben:
> On 15/05/2020 23.15, Vladimir Sementsov-Ogievskiy wrote:
> > Rename bitmaps migration tests and move them to tests subdirectory to
> > demonstrate new human-friendly test naming.
> > 
> > Signed-off-by: Vladimir Sementsov-Ogievskiy 
> > ---
> >  tests/qemu-iotests/{199 => tests/migrate-bitmaps-postcopy-test}   | 0
> >  .../{199.out => tests/migrate-bitmaps-postcopy-test.out}  | 0
> >  tests/qemu-iotests/{169 => tests/migrate-bitmaps-test}| 0
> >  tests/qemu-iotests/{169.out => tests/migrate-bitmaps-test.out}| 0
> >  4 files changed, 0 insertions(+), 0 deletions(-)
> >  rename tests/qemu-iotests/{199 => tests/migrate-bitmaps-postcopy-test} 
> > (100%)
> >  rename tests/qemu-iotests/{199.out => 
> > tests/migrate-bitmaps-postcopy-test.out} (100%)
> >  rename tests/qemu-iotests/{169 => tests/migrate-bitmaps-test} (100%)
> >  rename tests/qemu-iotests/{169.out => tests/migrate-bitmaps-test.out} 
> > (100%)
> > 
> > diff --git a/tests/qemu-iotests/199 
> > b/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test
> > similarity index 100%
> > rename from tests/qemu-iotests/199
> > rename to tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test
> > diff --git a/tests/qemu-iotests/199.out 
> > b/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test.out
> > similarity index 100%
> > rename from tests/qemu-iotests/199.out
> > rename to tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test.out
> > diff --git a/tests/qemu-iotests/169 
> > b/tests/qemu-iotests/tests/migrate-bitmaps-test
> > similarity index 100%
> > rename from tests/qemu-iotests/169
> > rename to tests/qemu-iotests/tests/migrate-bitmaps-test
> > diff --git a/tests/qemu-iotests/169.out 
> > b/tests/qemu-iotests/tests/migrate-bitmaps-test.out
> > similarity index 100%
> > rename from tests/qemu-iotests/169.out
> > rename to tests/qemu-iotests/tests/migrate-bitmaps-test.out
> 
> I like the idea ... but the path name + file names get now quite long.
> While you're at it, what about renaming the "qemu-iotests" directory to
> just "iotests" or even just "io" now?

Renames are always kind of painful. Do we have a real reason for the
rename except that the paths feel a bit long subjectively?

Of course, if we're renaming all files anyway, changing the directory
name at the same time shouldn't give any additional pain, so it would be
completely reasonable then. We're not renaming the test harness files,
though, and even only two test cases in this patch.

Maybe this final patch should stay RFC until we have the infrastructure
in and then we can have a single series that moves all tests and also
renames the directory? Maybe a not strictly necessary rename of the
tooling would be bearable in the context of a mass rename of tests.

Kevin




Re: [PATCH 04/10] softfloat: Name rounding mode enum

2020-05-19 Thread Alex Bennée


Richard Henderson  writes:

> Give the previously unnamed enum a typedef name.  Use the packed
> attribute so that we do not affect the layout of the float_status
> struct.  Use it in the prototypes of relevant functions.
>
> Adjust switch statements as necessary to avoid compiler warnings.
>
> Signed-off-by: Richard Henderson 

Reviewed-by: Alex Bennée 

-- 
Alex Bennée



Re: [PATCH 05/10] softfloat: Name compare relation enum

2020-05-19 Thread Alex Bennée


Richard Henderson  writes:

> Give the previously unnamed enum a typedef name.  Use it in the
> prototypes of compare functions.  Use it to hold the results
> of the compare functions.
>
> Signed-off-by: Richard Henderson 

Reviewed-by: Alex Bennée 

-- 
Alex Bennée



Re: Migration with ``drive-mirror`` + NBD will let quorum qcow2 image become larger

2020-05-19 Thread Kevin Wolf
Am 19.05.2020 um 10:32 hat Tao Xu geschrieben:
> Hi,
> 
> I am using ``drive-mirror`` + NBD for live storage migration. But I find
> that if I use a qcow2 image(virtual size: 10 GiB, disk size: 1.8 GiB) as a
> child of quorum, then the destination image become larger(virtual size: 10
> GiB, disk size: 10 GiB). However if I use a qcow2 image directly, then the
> destination image(virtual size: 10 GiB, disk size: 1.8 GiB) will be equal to
> the source.
> 
> So I am wondering if my usage is wrong or it is expected with
> quorum+drive-mirror?

This seems to be because the quorum block driver doesn't implement the
.bdrv_co_block_status interface, so the mirror block job can't know that
some blocks are unallocated/zeroed, but will copy everything.

I'm not sure if quorum can even implement this interface because the
allocation status in different quorum children may differ and voting
might not make sense. But maybe it could return a limited set of flags
at least so that the mirror job can get the BDRV_BLOCK_ZERO information
if the quorum children agree on it.
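
To illustrate what such a limited callback could look like, here is a minimal
sketch (not the actual implementation, and only lightly checked): it assumes
quorum's usual BDRVQuorumState layout (children/num_children) and the generic
.bdrv_co_block_status callback signature, and reports BDRV_BLOCK_ZERO only
when every child agrees, claiming nothing otherwise:

/* Illustrative fragment for block/quorum.c, not real code from this thread. */
static int coroutine_fn quorum_co_block_status(BlockDriverState *bs,
                                               bool want_zero,
                                               int64_t offset, int64_t bytes,
                                               int64_t *pnum, int64_t *map,
                                               BlockDriverState **file)
{
    BDRVQuorumState *s = bs->opaque;
    int64_t common = bytes;
    int result = BDRV_BLOCK_ZERO;
    int i;

    for (i = 0; i < s->num_children; i++) {
        int64_t num;
        int ret = bdrv_block_status(s->children[i]->bs, offset, common,
                                    &num, NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        common = MIN(common, num);      /* only report the range all agree on */
        if (!(ret & BDRV_BLOCK_ZERO)) {
            result = 0;                 /* children disagree: make no claim */
        }
    }

    *pnum = common;
    return result;
}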

tl;dr: With quorum + drive-mirror it is currently expected.

Kevin




Re: sharing intention for developing per-target, dynamically loadable accelerator modules

2020-05-19 Thread Claudio Fontana
On 5/18/20 8:18 PM, Alex Bennée wrote:
> 
> Claudio Fontana  writes:
> 
>> Hello all,
>>
>> my intention would be to develop per-target, dynamically loadable 
>> accelerator modules.
>>
>> This would allow to distribute a single QEMU base binary, and then provide 
>> accelerators as optional additional binary packages to install,
>> with the first separate optional package being TCG.
>>
>> CONFIG_TCG would become 'm' as a result, but then also CONFIG_KVM, 
>> CONFIG_HAX, CONFIG_WHPX, CONFIG_HVF.
>>
>> Here are some elements that seem to be needed:
>>
>> 1 - The module CONFIG_MODULE part of the build system would need some 
>> extension to add per-target modules. I have some tentative results that 
>> shows that this is possible (but a bit clunky atm).
>> There is some existing instability in the existing Makefile 
>> infrastructure of modules that shows up when trying to extend it.
>>
>> 2 - new "accelerator drivers" seems to be needed, either in addition or as 
>> additional functionality inside the current AccelState.
>>
>> 3 - for target/i386 in particular, there is some refactoring work needed to 
>> split even more different unrelated bits and pieces.
>> dependencies of hw/i386 machine stuff with accelerator-specific
>> stuff are also painful.
> 
> There are a couple of per-arch hacks in the main TCG cpu loops it would
> be good to excise from the code.
> 
>>
>> 4 - CPU Arch Classes could be extended with per-accelerator methods. Initial 
>> fooling around shows it should probably work.
>> One alternative would be trying to play with the dynamic linker (weak 
>> symbols, creative use of dlsym etc), but I have not sorted out the details 
>> of this option.
>>
>> 5 - cputlb, in particular tlb_flush and friends is a separate problem
>> since it is not part of the cpuclass. Should it be?
> 
> tlb_flush and friends are TCG implementation details for softmmu that
> ensure a lookup for any address will always return to a guest specific
> tlb_fill to rebuild the cache. The behaviour is not guest-specific
> per-se although we do partition the table entries based on the guest
> size.
> 
> Perhaps we can make it more dynamic but it would have to ensure both the
> slow path and the fast path are using the same mask and shifts when
> walking the table.
> 
>> 6 - a painpoint is represented by the fact that in USER mode, the accel 
>> class is not applied, which causes a lot of uncleanliness all around
>> (tcg_allowed outside of the AccelClass).
> 
> The user-mode run loops are a whole separate chunk of code. I don't know
> if it is worth trying to make them plugable as you will only ever have
> one per linux-user binary.
> 
>> 7 - I have not really thought about the KConfig aspects because I am not 
>> super-familiar
>>
>> 8 - cpus.c needs some good splitting
> 
> Although there are several different run loops in there I think they
> share a lot of commonality which is why they are bundled together. They
> all share the same backend for dealing with ioevents and generic
> pause/unpause machinery. But feel free to prove me wrong ;-)

Hi Alex, I got my first compile and it is currently on GitHub; I still need to
split the series though, and there is still cleanup needed.

https://github.com/hw-claudio/qemu.git
branch "cpus-refactoring"

just in case you are interested in a peek.

The idea results currently in:

 cpu-throttle.c|  154 +
 cpu-timers.c  |  784 
+
 cpus-tcg.c|  515 ++
 cpus.c| 1405 
+

New interfaces are in:

include/sysemu/cpu-throttle.h |   50 +++
include/sysemu/cpu-timers.h   |   73 +
include/sysemu/cpus.h |   44 ++-

cpu-throttle (new) is self-explanatory; it contains the CPU throttling code from cpus.c
cpu-timers (new) contains the icount, ticks and clock timer code from cpus.c

cpus.h adds an interface to per-accel vcpu threads:

qemu_register_start_vcpu(void (*f)(CPUState *cpu));
bool all_cpu_threads_idle(void);
bool cpu_can_run(CPUState *cpu);
void qemu_wait_io_event_common(CPUState *cpu);
void qemu_wait_io_event(CPUState *cpu);
void cpu_thread_signal_created(CPUState *cpu);
void cpu_thread_signal_destroyed(CPUState *cpu);
void cpu_handle_guest_debug(CPUState *cpu);
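
As a purely illustrative sketch of how an accelerator might hook into that
interface (every "my_accel_*" name is invented; assumes "qemu/osdep.h",
"qemu/thread.h", "hw/core/cpu.h" and "sysemu/cpus.h"):

static void *my_accel_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    cpu_thread_signal_created(cpu);
    while (!cpu->unplug) {
        if (cpu_can_run(cpu)) {
            /* enter the accelerator's execution loop here */
        }
        qemu_wait_io_event(cpu);
    }
    cpu_thread_signal_destroyed(cpu);
    return NULL;
}

static void my_accel_start_vcpu(CPUState *cpu)
{
    cpu->thread = g_new0(QemuThread, 1);
    qemu_thread_create(cpu->thread, "my-accel-vcpu",
                       my_accel_cpu_thread_fn, cpu, QEMU_THREAD_JOINABLE);
}

static void my_accel_setup(void)
{
    qemu_register_start_vcpu(my_accel_start_vcpu);
}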

Very much still all WIP...

Ciao,

C


> 
>> ... more things to find out and think about ...
>>
>> Overall, I think that the activity has the potential to provide benefits 
>> overall beyond the actual goal, in the form of cleanups, leaner 
>> configurations,
>> minor fixes, maybe improving the CONFIG_MODULE instabilities if any
>> etc.
> 
> There are certainly some ugly bits we could shave down in such an
> exercise.
> 
>> As an example, the first activity I would plan to submit as RFC is point 8 
>> above,
>> there is the split between cpus.c and cpus-tcg.c that results in lots of 
>> 

Re: [PATCH v2 4/7] hw/elf_ops: Do not ignore write failures when loading ELF

2020-05-19 Thread Stefano Garzarella
On Mon, May 18, 2020 at 05:53:05PM +0200, Philippe Mathieu-Daudé wrote:
> Do not ignore the MemTxResult error type returned by
> address_space_write().
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  include/hw/elf_ops.h | 11 ---
>  1 file changed, 8 insertions(+), 3 deletions(-)

Reviewed-by: Stefano Garzarella 

> 
> diff --git a/include/hw/elf_ops.h b/include/hw/elf_ops.h
> index 398a4a2c85..6fdff3dced 100644
> --- a/include/hw/elf_ops.h
> +++ b/include/hw/elf_ops.h
> @@ -553,9 +553,14 @@ static int glue(load_elf, SZ)(const char *name, int fd,
>  rom_add_elf_program(label, mapped_file, data, file_size,
>  mem_size, addr, as);
>  } else {
> -address_space_write(as ? as : &address_space_memory,
> -addr, MEMTXATTRS_UNSPECIFIED,
> -data, file_size);
> +MemTxResult res;
> +
> +res = address_space_write(as ? as : &address_space_memory,
> +  addr, MEMTXATTRS_UNSPECIFIED,
> +  data, file_size);
> +if (res != MEMTX_OK) {
> +goto fail;
> +}
>  }
>  }
>  
> -- 
> 2.21.3
> 




[PULL 8/8] aio-posix: disable fdmon-io_uring when GSource is used

2020-05-19 Thread Stefan Hajnoczi
The glib event loop does not call fdmon_io_uring_wait() so fd handlers
waiting to be submitted build up in the list. There is no benefit in
using io_uring when the glib GSource is being used, so disable it
instead of implementing a more complex fix.

This fixes a memory leak where AioHandlers would build up and increasing
amounts of CPU time were spent iterating them in aio_pending(). The
symptom is that guests become slow when QEMU is built with io_uring
support.

Buglink: https://bugs.launchpad.net/qemu/+bug/1877716
Fixes: 73fd282e7b6dd4e4ea1c3bbb3d302c8db51e4ccf ("aio-posix: add io_uring fd 
monitoring implementation")
Signed-off-by: Stefan Hajnoczi 
Tested-by: Oleksandr Natalenko 
Message-id: 20200511183630.279750-3-stefa...@redhat.com
Signed-off-by: Stefan Hajnoczi 
---
 include/block/aio.h |  3 +++
 util/aio-posix.c| 12 
 util/aio-win32.c|  4 
 util/async.c|  1 +
 4 files changed, 20 insertions(+)

diff --git a/include/block/aio.h b/include/block/aio.h
index 62ed954344..b2f703fa3f 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -701,6 +701,9 @@ void aio_context_setup(AioContext *ctx);
  */
 void aio_context_destroy(AioContext *ctx);
 
+/* Used internally, do not call outside AioContext code */
+void aio_context_use_g_source(AioContext *ctx);
+
 /**
  * aio_context_set_poll_params:
  * @ctx: the aio context
diff --git a/util/aio-posix.c b/util/aio-posix.c
index 8af334ab19..1b2a3af65b 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -682,6 +682,18 @@ void aio_context_destroy(AioContext *ctx)
 aio_free_deleted_handlers(ctx);
 }
 
+void aio_context_use_g_source(AioContext *ctx)
+{
+/*
+ * Disable io_uring when the glib main loop is used because it doesn't
+ * support mixed glib/aio_poll() usage. It relies on aio_poll() being
+ * called regularly so that changes to the monitored file descriptors are
+ * submitted, otherwise a list of pending fd handlers builds up.
+ */
+fdmon_io_uring_destroy(ctx);
+aio_free_deleted_handlers(ctx);
+}
+
 void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
  int64_t grow, int64_t shrink, Error **errp)
 {
diff --git a/util/aio-win32.c b/util/aio-win32.c
index 729d533faf..953c56ab48 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -414,6 +414,10 @@ void aio_context_destroy(AioContext *ctx)
 {
 }
 
+void aio_context_use_g_source(AioContext *ctx)
+{
+}
+
 void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
  int64_t grow, int64_t shrink, Error **errp)
 {
diff --git a/util/async.c b/util/async.c
index 3165a28f2f..1319eee3bc 100644
--- a/util/async.c
+++ b/util/async.c
@@ -362,6 +362,7 @@ static GSourceFuncs aio_source_funcs = {
 
 GSource *aio_get_g_source(AioContext *ctx)
 {
+aio_context_use_g_source(ctx);
g_source_ref(&ctx->source);
return &ctx->source;
 }
-- 
2.25.3



[PULL 7/8] aio-posix: don't duplicate fd handler deletion in fdmon_io_uring_destroy()

2020-05-19 Thread Stefan Hajnoczi
The io_uring file descriptor monitoring implementation has an internal
list of fd handlers that are pending submission to io_uring.
fdmon_io_uring_destroy() deletes all fd handlers on the list.

Don't delete fd handlers directly in fdmon_io_uring_destroy() for two
reasons:
1. This duplicates the aio-posix.c AioHandler deletion code and could
   become outdated if the struct changes.
2. Only handlers with the FDMON_IO_URING_REMOVE flag set are safe to
   remove. If the flag is not set then something still has a pointer to
   the fd handler. Let aio-posix.c and its user worry about that. In
   practice this isn't an issue because fdmon_io_uring_destroy() is only
   called when shutting down so all users have removed their fd
   handlers, but the next patch will need this!

Signed-off-by: Stefan Hajnoczi 
Tested-by: Oleksandr Natalenko 
Message-id: 20200511183630.279750-2-stefa...@redhat.com
Signed-off-by: Stefan Hajnoczi 
---
 util/aio-posix.c  |  1 +
 util/fdmon-io_uring.c | 13 ++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index c3613d299e..8af334ab19 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -679,6 +679,7 @@ void aio_context_destroy(AioContext *ctx)
 {
 fdmon_io_uring_destroy(ctx);
 fdmon_epoll_disable(ctx);
+aio_free_deleted_handlers(ctx);
 }
 
 void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
index d5a80ed6fb..1d14177df0 100644
--- a/util/fdmon-io_uring.c
+++ b/util/fdmon-io_uring.c
@@ -342,11 +342,18 @@ void fdmon_io_uring_destroy(AioContext *ctx)
 
io_uring_queue_exit(&ctx->fdmon_io_uring);
 
-/* No need to submit these anymore, just free them. */
+/* Move handlers due to be removed onto the deleted list */
while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
+unsigned flags = atomic_fetch_and(&node->flags,
+~(FDMON_IO_URING_PENDING |
+  FDMON_IO_URING_ADD |
+  FDMON_IO_URING_REMOVE));
+
+if (flags & FDMON_IO_URING_REMOVE) {
+QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, 
node_deleted);
+}
+
QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
-QLIST_REMOVE(node, node);
-g_free(node);
 }
 
ctx->fdmon_ops = &fdmon_poll_ops;
-- 
2.25.3



Re: [PATCH] replay: synchronize on every virtual timer callback

2020-05-19 Thread Alex Bennée


Pavel Dovgalyuk  writes:

> On 18.05.2020 18:56, Alex Bennée wrote:
>> Philippe Mathieu-Daudé  writes:
>>
>>> + Alex
>>>
>>> On 5/6/20 10:17 AM, Pavel Dovgalyuk wrote:
 Sometimes virtual timer callbacks depend on order
 of virtual timer processing and warping of virtual clock.
 Therefore every callback should be logged to make replay deterministic.
 This patch creates a checkpoint before every virtual timer callback.
 With these checkpoints virtual timers processing and clock warping
 events order is completely deterministic.
 Signed-off-by: Pavel Dovgalyuk 
 ---
util/qemu-timer.c |5 +
1 file changed, 5 insertions(+)
 diff --git a/util/qemu-timer.c b/util/qemu-timer.c
 index d548d3c1ad..47833f338f 100644
 --- a/util/qemu-timer.c
 +++ b/util/qemu-timer.c
 @@ -588,6 +588,11 @@ bool timerlist_run_timers(QEMUTimerList *timer_list)
qemu_mutex_lock(&timer_list->active_timers_lock);
  progress = true;
 +/*
 + * Callback may insert new checkpoints, therefore add new 
 checkpoint
 + * for the virtual timers.
 + */
 +need_replay_checkpoint = timer_list->clock->type == 
 QEMU_CLOCK_VIRTUAL;
}
qemu_mutex_unlock(&timer_list->active_timers_lock);
>> So the problem I have with this as with all the record/replay stuff I
>> need want to review is it's very hard to see things in action. I added a
>> *very* basic record/replay test to the aarch64 softmmu tests but they
>> won't exercise any of this code because no timers get fired. I'm
>> assuming the sort of tests that is really needed is something that not
>> only causes QEMU_CLOCK_VIRTUAL timers to fire and trigger logged HW
>> events and ensure that things don't get confused in the process.
>
> I encounter most of the bugs in different OS boot scenarios.
>
> We also have internal tests that include some computational, disk, and
> network interaction tasks.
>
> Is it possible to add a test like booting a "real" OS and replaying
> it?

Yes - for these bigger more complex setups we should use the acceptance
tests that run under Avocado. See "make check-acceptance".

>> If I read up the file I just get more questions than answers. For
>> example why do we release the qemu_timers lock before processing the
>> replay event? Is it that the replay event could cause another timer to
>
> We release the lock, because accessing the replay module may process
> some events and add more timers.

OK. I guess the adding of the timer is a side effect of processing the
event rather than something that gets added directly?


-- 
Alex Bennée



Re: [PATCH 11/24] pnv/phb4: Bury unwanted "pnv-phb4-pec-stack" devices

2020-05-19 Thread Cédric Le Goater
On 5/18/20 7:03 AM, Markus Armbruster wrote:
> The number of stacks is controlled by property "num-stacks".
> pnv_pec_instance_init() creates the maximum supported number, because
> the property has not been set then.  pnv_pec_realize() realizes only
> the wanted number.  Works, although it can leave unrealized devices
> hanging around in the QOM composition tree.  Affects machine powernv9.

I have used this pattern in many models. Is there a better one ?

Thanks,

C.



Migration with ``drive-mirror`` + NBD will let quorum qcow2 image become larger

2020-05-19 Thread Tao Xu

Hi,

I am using ``drive-mirror`` + NBD for live storage migration. But I find
that if I use a qcow2 image (virtual size: 10 GiB, disk size: 1.8 GiB) as
a child of quorum, then the destination image becomes larger (virtual
size: 10 GiB, disk size: 10 GiB). However, if I use a qcow2 image
directly, then the destination image (virtual size: 10 GiB, disk size:
1.8 GiB) will be equal to the source.


So I am wondering if my usage is wrong or it is expected with 
quorum+drive-mirror?


P.S. Detail:

1) [On *destination* Host]: qemu-img create -f qcow2 fedora32.qcow2 10G
Formatting 'fedora32.qcow2', fmt=qcow2 size=10737418240 
cluster_size=65536 lazy_refcounts=off refcount_bits=16


qemu-img info fedora32.qcow2
image: fedora32.qcow2
file format: qcow2
virtual size: 10 GiB (10737418240 bytes)
disk size: 196 KiB
cluster_size: 65536
Format specific information:
compat: 1.1
lazy refcounts: false
refcount bits: 16
corrupt: false

Boot the QEMU using:

disk_path=fedora32.qcow2
net_param="-netdev 
tap,id=hn0,vhost=off,br=br0,helper=/usr/local/libexec/qemu-bridge-helper 
-device rtl8139,id=e0,netdev=hn0"

cmdline="qemu-system-x86_64 \
-enable-kvm \
-m 2G -smp 4 -qmp stdio -bios OVMF.fd \
-monitor telnet:127.0.0.1:,nowait,server -vnc :7 -rtc base=utc \
-cpu host -device cirrus-vga,id=video0,bus=pci.0,addr=0x2 \
-device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 \
-device usb-tablet,id=input0,bus=usb.0,port=1 $net_param \
-drive if=none,id=parent0,file.filename=$disk_path,driver=qcow2 \
-incoming tcp:0:"
exec $cmdline

[On *destination* QEMU]:
{'execute':'qmp_capabilities'}
{'execute': 'nbd-server-start', 'arguments': {'addr': {'type': 'inet', 
'data': {'host': '192.168.0.33', 'port': '8889'} } } }
{'execute': 'nbd-server-add', 'arguments': {'device': 'parent0', 
'writable': true } }


2) [On *source* Host]:

Boot the QEMU using:

disk_path=fedora32.qcow2
net_param="-netdev 
tap,id=hn0,vhost=off,br=br0,helper=/usr/local/libexec/qemu-bridge-helper 
-device rtl8139,id=e0,netdev=hn0"

cmdline="qemu-system-x86_64 \
-enable-kvm \
-m 2G -smp 4 -qmp stdio -bios OVMF.fd \
-monitor telnet:127.0.0.1:,nowait,server -vnc :7 -rtc base=utc \
-cpu host -device cirrus-vga,id=video0,bus=pci.0,addr=0x2 \
-device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 \
-device usb-tablet,id=input0,bus=usb.0,port=1 $net_param \
-drive 
if=virtio,id=colo-disk0,driver=quorum,vote-threshold=1,children.0.file.filename=$disk_path,children.0.driver=qcow2"

exec $cmdline

[On *source* QEMU]:

{'execute':'qmp_capabilities'}
{'execute': 'drive-mirror', 'arguments':{ 'device': 'colo-disk0', 
'job-id': 'resync', 'target': 'nbd://192.168.0.33:8889/parent0', 'mode': 
'existing', 'format': 'nbd', 'sync': 'full'} }


{"timestamp": {"seconds": 1589902560, "microseconds": 107418}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "created", "id": "resync"}}
{"timestamp": {"seconds": 1589902560, "microseconds": 107487}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "running", "id": "resync"}}

{"return": {}}
{"timestamp": {"seconds": 1589902721, "microseconds": 439095}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "resync"}}
{"timestamp": {"seconds": 1589902721, "microseconds": 439194}, "event": 
"BLOCK_JOB_READY", "data": {"device": "resync", "len": 10739253248, 
"offset": 10739253248, "speed": 0, "type": "mirror"}}


3)[On *destination* Host]:
qemu-img info fedora32.qcow2
image: fedora32.qcow2
file format: qcow2
virtual size: 10 GiB (10737418240 bytes)
disk size: 10 GiB
cluster_size: 65536
Format specific information:
compat: 1.1
lazy refcounts: false
refcount bits: 16
corrupt: false
4)But if [On *source* Host] boot qemu using:

disk_path=fedora32.qcow2
net_param="-netdev 
tap,id=hn0,vhost=off,br=br0,helper=/usr/local/libexec/qemu-bridge-helper 
-device rtl8139,id=e0,netdev=hn0"

cmdline="qemu-system-x86_64 \
-enable-kvm \
-m 2G -smp 4 -qmp stdio -bios OVMF.fd \
-monitor telnet:127.0.0.1:,nowait,server -vnc :7 -rtc base=utc \
-cpu host -device cirrus-vga,id=video0,bus=pci.0,addr=0x2 \
-device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 \
-device usb-tablet,id=input0,bus=usb.0,port=1 $net_param \
-drive if=virtio,id=parent0,file.filename=$disk_path,driver=qcow2"
exec $cmdline

Then [On *destination* Host]:

qemu-img info fedora32.qcow2
image: fedora32.qcow2
file format: qcow2
virtual size: 10 GiB (10737418240 bytes)
disk size: 1.8 GiB
cluster_size: 65536
Format specific information:
compat: 1.1
lazy refcounts: false
refcount bits: 16
corrupt: false



Re: [PATCH] hw/ide: Make IDEDMAOps handlers take a const IDEDMA pointer

2020-05-19 Thread Kevin Wolf
Am 18.05.2020 um 20:26 hat John Snow geschrieben:
> 
> 
> On 5/15/20 4:48 AM, Kevin Wolf wrote:
> > Am 14.05.2020 um 22:21 hat John Snow geschrieben:
> >>
> >>
> >> On 5/12/20 3:49 PM, Philippe Mathieu-Daudé wrote:
> >>> Handlers don't need to modify the IDEDMA structure.
> >>> Make it const.
> >>>
> >>> Signed-off-by: Philippe Mathieu-Daudé 
> >>
> >> I'll trust your judgment. As long as it still compiles and passes
> >> qtests, I'm happy if you're happy.
> >>
> >> Acked-by: John Snow 
> > 
> > Does this mean you assume someone else will merge it? If that
> > someone is me, please let me know.
> > 
> > Kevin
> > 
> 
> I think I had thought this was part of a larger set, and didn't
> realize it wasn't.
> 
> Yes, if you could please stage this for inclusion, that would be
> helpful.

No problem, applied to the block branch.

Kevin




Re: [PATCH v5 1/4] hw/net/can: Introduce Xilinx ZynqMP CAN controller

2020-05-19 Thread Edgar E. Iglesias
On Sun, May 17, 2020 at 12:24:01AM -0700, Vikram Garhwal wrote:
> The Xilinx ZynqMP CAN controller is developed based on SocketCAN and the QEMU
> CAN bus implementation. The bus connection and the SocketCAN connection for
> each CAN module can be set through the command line.

Hi Vikram,

Have a look at hw/char/cadence_uart.c, the *_reset_hold() function to
see an example on how to fix the reset issue.
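
For reference, a rough sketch of that hold-phase pattern, modelled on
hw/char/cadence_uart.c; the device type, state struct and register array
names below are assumed for illustration, not taken from this patch:

/* Illustrative fragment only; all xlnx_zynqmp_can_* details are assumed. */
static void xlnx_zynqmp_can_reset_hold(Object *obj)
{
    XlnxZynqMPCANState *s = XLNX_ZYNQMP_CAN(obj);
    unsigned int i;

    /* Put the registers back to their reset values while reset is asserted */
    for (i = 0; i < ARRAY_SIZE(s->regs_info); i++) {
        register_reset(&s->regs_info[i]);
    }
}

static void xlnx_zynqmp_can_class_init(ObjectClass *klass, void *data)
{
    ResettableClass *rc = RESETTABLE_CLASS(klass);

    /* other class_init contents omitted */
    rc->phases.hold = xlnx_zynqmp_can_reset_hold;
}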

Cheers,
Edgar


> 
> Signed-off-by: Vikram Garhwal 
> ---
>  hw/net/can/Makefile.objs |1 +
>  hw/net/can/xlnx-zynqmp-can.c | 1116 
> ++
>  include/hw/net/xlnx-zynqmp-can.h |   75 +++
>  3 files changed, 1192 insertions(+)
>  create mode 100644 hw/net/can/xlnx-zynqmp-can.c
>  create mode 100644 include/hw/net/xlnx-zynqmp-can.h
> 
> diff --git a/hw/net/can/Makefile.objs b/hw/net/can/Makefile.objs
> index 9f0c4ee..0fe87dd 100644
> --- a/hw/net/can/Makefile.objs
> +++ b/hw/net/can/Makefile.objs
> @@ -2,3 +2,4 @@ common-obj-$(CONFIG_CAN_SJA1000) += can_sja1000.o
>  common-obj-$(CONFIG_CAN_PCI) += can_kvaser_pci.o
>  common-obj-$(CONFIG_CAN_PCI) += can_pcm3680_pci.o
>  common-obj-$(CONFIG_CAN_PCI) += can_mioe3680_pci.o
> +common-obj-$(CONFIG_XLNX_ZYNQMP) += xlnx-zynqmp-can.o
> diff --git a/hw/net/can/xlnx-zynqmp-can.c b/hw/net/can/xlnx-zynqmp-can.c
> new file mode 100644
> index 000..751972e
> --- /dev/null
> +++ b/hw/net/can/xlnx-zynqmp-can.c
> @@ -0,0 +1,1116 @@
> +/*
> + * QEMU model of the Xilinx ZynqMP CAN controller.
> + *
> + * Copyright (c) 2020 Xilinx Inc.
> + *
> + * Written-by: Vikram Garhwal
> + *
> + * Based on QEMU CAN Device emulation implemented by Jin Yang, Deniz Eren and
> + * Pavel Pisa
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a 
> copy
> + * of this software and associated documentation files (the "Software"), to 
> deal
> + * in the Software without restriction, including without limitation the 
> rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
> FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "hw/sysbus.h"
> +#include "hw/register.h"
> +#include "hw/irq.h"
> +#include "qapi/error.h"
> +#include "qemu/bitops.h"
> +#include "qemu/log.h"
> +#include "qemu/cutils.h"
> +#include "sysemu/sysemu.h"
> +#include "migration/vmstate.h"
> +#include "hw/qdev-properties.h"
> +#include "net/can_emu.h"
> +#include "net/can_host.h"
> +#include "qemu/event_notifier.h"
> +#include "qom/object_interfaces.h"
> +#include "hw/net/xlnx-zynqmp-can.h"
> +
> +#ifndef XLNX_ZYNQMP_CAN_ERR_DEBUG
> +#define XLNX_ZYNQMP_CAN_ERR_DEBUG 0
> +#endif
> +
> +#define DB_PRINT(...) do { \
> +if (XLNX_ZYNQMP_CAN_ERR_DEBUG) { \
> +qemu_log(__VA_ARGS__); \
> +} \
> +} while (0)
> +
> +#define MAX_DLC8
> +#undef ERROR
> +
> +REG32(SOFTWARE_RESET_REGISTER, 0x0)
> +FIELD(SOFTWARE_RESET_REGISTER, CEN, 1, 1)
> +FIELD(SOFTWARE_RESET_REGISTER, SRST, 0, 1)
> +REG32(MODE_SELECT_REGISTER, 0x4)
> +FIELD(MODE_SELECT_REGISTER, SNOOP, 2, 1)
> +FIELD(MODE_SELECT_REGISTER, LBACK, 1, 1)
> +FIELD(MODE_SELECT_REGISTER, SLEEP, 0, 1)
> +REG32(ARBITRATION_PHASE_BAUD_RATE_PRESCALER_REGISTER, 0x8)
> +FIELD(ARBITRATION_PHASE_BAUD_RATE_PRESCALER_REGISTER, BRP, 0, 8)
> +REG32(ARBITRATION_PHASE_BIT_TIMING_REGISTER, 0xc)
> +FIELD(ARBITRATION_PHASE_BIT_TIMING_REGISTER, SJW, 7, 2)
> +FIELD(ARBITRATION_PHASE_BIT_TIMING_REGISTER, TS2, 4, 3)
> +FIELD(ARBITRATION_PHASE_BIT_TIMING_REGISTER, TS1, 0, 4)
> +REG32(ERROR_COUNTER_REGISTER, 0x10)
> +FIELD(ERROR_COUNTER_REGISTER, REC, 8, 8)
> +FIELD(ERROR_COUNTER_REGISTER, TEC, 0, 8)
> +REG32(ERROR_STATUS_REGISTER, 0x14)
> +FIELD(ERROR_STATUS_REGISTER, ACKER, 4, 1)
> +FIELD(ERROR_STATUS_REGISTER, BERR, 3, 1)
> +FIELD(ERROR_STATUS_REGISTER, STER, 2, 1)
> +FIELD(ERROR_STATUS_REGISTER, FMER, 1, 1)
> +FIELD(ERROR_STATUS_REGISTER, CRCER, 0, 1)
> +REG32(STATUS_REGISTER, 0x18)
> +FIELD(STATUS_REGISTER, SNOOP, 12, 1)
> +FIELD(STATUS_REGISTER, ACFBSY, 11, 1)
> +FIELD(STATUS_REGISTER, TXFLL, 10, 1)
> +FIELD(STATUS_REGISTER, TXBFLL, 9, 1)
> +FIELD(STATUS_REGISTER, ESTAT, 7, 2)
> +FIELD(STATUS_REGISTER, ERRWRN, 6, 1)
> +

[PATCH] ARM: PL061: Introduce N_GPIOS

2020-05-19 Thread Geert Uytterhoeven
Add a definition for the number of GPIO lines controlled by a PL061
instance, and use it instead of the hardcoded magic value 8.

Suggested-by: Philippe Mathieu-Daudé 
Signed-off-by: Geert Uytterhoeven 
---
 hw/gpio/pl061.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/hw/gpio/pl061.c b/hw/gpio/pl061.c
index 2a828260bdb0b946..6d3c36bc16cf9e0d 100644
--- a/hw/gpio/pl061.c
+++ b/hw/gpio/pl061.c
@@ -36,6 +36,8 @@ static const uint8_t pl061_id_luminary[12] =
 #define TYPE_PL061 "pl061"
 #define PL061(obj) OBJECT_CHECK(PL061State, (obj), TYPE_PL061)
 
+#define N_GPIOS 8
+
 typedef struct PL061State {
 SysBusDevice parent_obj;
 
@@ -62,7 +64,7 @@ typedef struct PL061State {
 uint32_t cr;
 uint32_t amsel;
 qemu_irq irq;
-qemu_irq out[8];
+qemu_irq out[N_GPIOS];
 const unsigned char *id;
 uint32_t rsvd_start; /* reserved area: [rsvd_start, 0xfcc] */
 } PL061State;
@@ -112,7 +114,7 @@ static void pl061_update(PL061State *s)
 changed = s->old_out_data ^ out;
 if (changed) {
 s->old_out_data = out;
-for (i = 0; i < 8; i++) {
+for (i = 0; i < N_GPIOS; i++) {
 mask = 1 << i;
 if (changed & mask) {
 DPRINTF("Set output %d = %d\n", i, (out & mask) != 0);
@@ -125,7 +127,7 @@ static void pl061_update(PL061State *s)
 changed = (s->old_in_data ^ s->data) & ~s->dir;
 if (changed) {
 s->old_in_data = s->data;
-for (i = 0; i < 8; i++) {
+for (i = 0; i < N_GPIOS; i++) {
 mask = 1 << i;
 if (changed & mask) {
 DPRINTF("Changed input %d = %d\n", i, (s->data & mask) != 0);
@@ -364,8 +366,8 @@ static void pl061_init(Object *obj)
memory_region_init_io(&s->iomem, obj, &pl061_ops, s, "pl061", 0x1000);
sysbus_init_mmio(sbd, &s->iomem);
sysbus_init_irq(sbd, &s->irq);
-qdev_init_gpio_in(dev, pl061_set_irq, 8);
-qdev_init_gpio_out(dev, s->out, 8);
+qdev_init_gpio_in(dev, pl061_set_irq, N_GPIOS);
+qdev_init_gpio_out(dev, s->out, N_GPIOS);
 }
 
 static void pl061_class_init(ObjectClass *klass, void *data)
-- 
2.17.1




Re: QEMU 5.1: Can we require each new device/machine to provided a test?

2020-05-19 Thread Daniel P . Berrangé
On Mon, May 18, 2020 at 03:56:36PM -0400, John Snow wrote:
> 
> 
> On 5/15/20 6:23 AM, Daniel P. Berrangé wrote:
> > On Fri, May 15, 2020 at 12:11:17PM +0200, Thomas Huth wrote:
> >> On 07/04/2020 12.59, Philippe Mathieu-Daudé wrote:
> >>> Hello,
> >>>
> >>> Following Markus thread on deprecating unmaintained (untested) code
> >>> (machines) [1] and the effort done to gather the information shared in
> >>> the replies [2], and the various acceptance tests added, is it
> >>> feasible to require for the next release that each new device/machine
> >>> is provided a test covering it?
> >>>
> >>> If no, what is missing?
> >>
> >> If a qtest is feasible, yes, I think we should require one for new
> >> devices. But what about machines - you normally need a test image for
> >> this. In that case, there is still the question where testing images
> >> could be hosted. Not every developer has a web space where they could
> >> put their test images onto. And what about images that contain non-free
> >> code?
> > 
> > Yep, it isn't feasible to make this a hard rule.
> > 
> > IMHO this is where a support tier classification comes into play
> > 
> >  - Tier 1: actively maintained, qtest coverage available. Expected
> >to work reliably at all times since every commit is CI
> >tested
> > 
> >   - Tier 2: actively maintained, no qtest coverage. Should usually
> >work but regression may creep in due to reliance on the
> >maintainer to manually test on adhoc basis
> > 
> >   - Tier 3: not actively maintained, unknown state but liable to
> > be broken indefinitely
> > 
> > Tier 1 is obviously the most desirable state we would like everthing to
> > be at. Contributors will have to fix problems their patches cause as
> > they will be blocked by CI.
> > 
> > Tier 2 is an admission that reality gets in the way. Ideally stuff in
> > this tier will graduate to Tier 1 at some point. Even if it doesn't
> > though, it is still valid to keep it in QEMU long term. Contributors
> > shouldn't gratuitously break stuff in these board, but if they do,
> > then the maintainer is ultimately responsible for fixing it, as the
> > contributors don't have a test rig for it.
> > 
> > Tier 3 is abandonware. If a maintainer doesn't appear, users should
> > not expect it to continue to exist long term. Contributors are free
> > to send patches which break this, and are under no obligation to
> > fix problems in these boards. We may deprecate & delete it after a
> > while
> > 
> > 
> > Over time we'll likely add more criteria to stuff in Tier 1. This
> > could lead to some things dropping from Tier 1 to Tier 2. This is
> > OK, as it doesn't make those things worse than they already were.
> > We're just saying that Tier 2 isn't as thoroughly tested as we
> > would like it to be in an ideal world.
> 
> I really like the idea of device support tiers codified directly in the
> QEMU codebase, to give upstream users some idea of which devices we
> expect to work and which we ... don't, really.
> 
> Not every last device we offer is enterprise production ready, but we
> don't necessarily do a good job of explaining which devices fall into
> which categories, and we've got quite a few of them.
> 
> I wonder if a 2.5th tier would be useful; something like a "hobbyist"
> tier for pet project SoC boards and the like -- they're not abandoned,
> but we also don't expect them to work, exactly.
> 
> Mild semantic difference from Tier 3.

I guess I was thinking such hobbyist stuff would fall into tier 2  if the
hobbyist maintainer actually responds to fixing stuff, or tier 3 if they
largely aren't active on the mailing list responding to issues/questions.

We could then have a 4 tier system overall and put hobbyist stuff at tier 3,
and abandonware at tier 4.

Probably shouldn't go beyond 4 tiers though, as the more criteria we add
the harder it is to clearly decide which tier something should go into.

The tier 1 vs 2 divison is clearly split based on CI which is a simple
classification to decide on.

The tier 2 vs 3 division is moderately clearly split based on whether
there is a frequently active maintainer.

We can probably squeeze in the 4th tier without too much ambiguity in
the classification if we think it is adding something worthwhile either
from our POV as maintainers, or for users consuming it.

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH 2/9] ppc/xive: Export xive_presenter_notify()

2020-05-19 Thread Greg Kurz
On Wed, 13 May 2020 17:11:02 +0200
Cédric Le Goater  wrote:

> It's generic enough to be used from the XIVE2 router and avoid more
> duplication.
> 
> Signed-off-by: Cédric Le Goater 
> ---

Reviewed-by: Greg Kurz 

>  include/hw/ppc/xive.h | 4 
>  hw/intc/xive.c| 8 
>  2 files changed, 8 insertions(+), 4 deletions(-)
> 
> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> index 112fb6fb6dbe..88b0a2a3811f 100644
> --- a/include/hw/ppc/xive.h
> +++ b/include/hw/ppc/xive.h
> @@ -406,6 +406,10 @@ int xive_presenter_tctx_match(XivePresenter *xptr, 
> XiveTCTX *tctx,
>uint8_t format,
>uint8_t nvt_blk, uint32_t nvt_idx,
>bool cam_ignore, uint32_t logic_serv);
> +bool xive_presenter_notify(XiveFabric *xfb, uint8_t format,
> +   uint8_t nvt_blk, uint32_t nvt_idx,
> +   bool cam_ignore, uint8_t priority,
> +   uint32_t logic_serv);
>  
>  /*
>   * XIVE Fabric (Interface between Interrupt Controller and Machine)
> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> index b8825577f719..f08fcec1c039 100644
> --- a/hw/intc/xive.c
> +++ b/hw/intc/xive.c
> @@ -1481,10 +1481,10 @@ int xive_presenter_tctx_match(XivePresenter *xptr, 
> XiveTCTX *tctx,
>   *
>   * The parameters represent what is sent on the PowerBus
>   */
> -static bool xive_presenter_notify(XiveFabric *xfb, uint8_t format,
> -  uint8_t nvt_blk, uint32_t nvt_idx,
> -  bool cam_ignore, uint8_t priority,
> -  uint32_t logic_serv)
> +bool xive_presenter_notify(XiveFabric *xfb, uint8_t format,
> +   uint8_t nvt_blk, uint32_t nvt_idx,
> +   bool cam_ignore, uint8_t priority,
> +   uint32_t logic_serv)
>  {
>  XiveFabricClass *xfc = XIVE_FABRIC_GET_CLASS(xfb);
>  XiveTCTXMatch match = { .tctx = NULL, .ring = 0 };




Re: [PATCH 05/24] aspeed: Don't create unwanted "cortex-a7-arm-cpu" devices

2020-05-19 Thread Cédric Le Goater
On 5/19/20 7:46 AM, Markus Armbruster wrote:
> Joel Stanley  writes:
> 
>> On Mon, 18 May 2020 at 12:24, Cédric Le Goater  wrote:
>>>
>>> On 5/18/20 7:03 AM, Markus Armbruster wrote:
 The number of CPUs is controlled by property "num-cpus".
 aspeed_soc_ast2600_init() creates the maximum supported number.
 aspeed_soc_ast2600_realize() realizes only the wanted number.  Works,
 although it leaves unrealized devices hanging around in the QOM
 composition tree.  Affects machines ast2600-evb and tacoma-bmc.

 Make the init functions create only the wanted ones.  Visible in "info
 qom-tree"; here's the change for ast2600-evb:

  /machine (ast2600-evb-machine)
[...]
/soc (ast2600-a1)
  [...]
  /cpu[0] (cortex-a7-arm-cpu)
/unnamed-gpio-in[0] (irq)
/unnamed-gpio-in[1] (irq)
/unnamed-gpio-in[2] (irq)
/unnamed-gpio-in[3] (irq)
 -/cpu[1] (cortex-a7-arm-cpu)
 -  /unnamed-gpio-in[0] (irq)
 -  /unnamed-gpio-in[1] (irq)
 -  /unnamed-gpio-in[2] (irq)
 -  /unnamed-gpio-in[3] (irq)
  /ehci[0] (platform-ehci-usb)

 Cc: "Cédric Le Goater" 
 Cc: Peter Maydell 
 Cc: Andrew Jeffery 
 Cc: Joel Stanley 
 Cc: qemu-...@nongnu.org
 Signed-off-by: Markus Armbruster 
>>>
>>> Reviewed-by: Cédric Le Goater 
>>>
>>> Joel, Andrew,
>>>
>>> Shouldn't we enforce a default/min/max number of CPUs of 2 for the AST2600 ?
>>> That's the SoC definition. The fact it is configurable in the Aspeed model
>>> was nice to have during bringup but we are now done.
>>
>> Agreed, we want there to always be two CPUs for the 2600.
> 
> Follow-up patch welcome!

I just sent a patch on this topic.

C.
 




[PULL 2/8] Makefile: List fuzz targets in 'make help'

2020-05-19 Thread Stefan Hajnoczi
From: Philippe Mathieu-Daudé 

List softmmu fuzz targets in 'make help' output:

  $ make help
  ...
  Architecture specific targets:
  aarch64-softmmu/all- Build for aarch64-softmmu
  aarch64-softmmu/fuzz   - Build fuzzer for aarch64-softmmu
  alpha-softmmu/all  - Build for alpha-softmmu
  alpha-softmmu/fuzz - Build fuzzer for alpha-softmmu
  arm-softmmu/all- Build for arm-softmmu
  arm-softmmu/fuzz   - Build fuzzer for arm-softmmu
  ...

Signed-off-by: Philippe Mathieu-Daudé 
Message-id: 20200514143433.18569-3-phi...@redhat.com
Signed-off-by: Stefan Hajnoczi 
---
 Makefile | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 34275f57c9..40e4f7677b 100644
--- a/Makefile
+++ b/Makefile
@@ -1252,7 +1252,11 @@ endif
@$(if $(TARGET_DIRS), \
echo 'Architecture specific targets:'; \
$(foreach t, $(TARGET_DIRS), \
-   $(call print-help-run,$(t)/all,Build for $(t));) \
+   $(call print-help-run,$(t)/all,Build for $(t)); \
+   $(if $(CONFIG_FUZZ), \
+   $(if $(findstring softmmu,$(t)), \
+   $(call print-help-run,$(t)/fuzz,Build fuzzer 
for $(t)); \
+   ))) \
echo '')
@$(if $(TOOLS), \
echo 'Tools targets:'; \
-- 
2.25.3



[PULL 1/8] tests/fuzz/Makefile: Do not link code using unavailable devices

2020-05-19 Thread Stefan Hajnoczi
From: Philippe Mathieu-Daudé 

Some devices availability depends on CONFIG options.
Use these options to only link tests when requested device
is available.

Signed-off-by: Philippe Mathieu-Daudé 
Message-id: 20200514143433.18569-2-phi...@redhat.com
Signed-off-by: Stefan Hajnoczi 
---
 tests/qtest/fuzz/Makefile.include | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/qtest/fuzz/Makefile.include 
b/tests/qtest/fuzz/Makefile.include
index cde3e9636c..f259d866c9 100644
--- a/tests/qtest/fuzz/Makefile.include
+++ b/tests/qtest/fuzz/Makefile.include
@@ -7,9 +7,9 @@ fuzz-obj-y += tests/qtest/fuzz/fork_fuzz.o
 fuzz-obj-y += tests/qtest/fuzz/qos_fuzz.o
 
 # Targets
-fuzz-obj-y += tests/qtest/fuzz/i440fx_fuzz.o
-fuzz-obj-y += tests/qtest/fuzz/virtio_net_fuzz.o
-fuzz-obj-y += tests/qtest/fuzz/virtio_scsi_fuzz.o
+fuzz-obj-$(CONFIG_PCI_I440FX) += tests/qtest/fuzz/i440fx_fuzz.o
+fuzz-obj-$(CONFIG_VIRTIO_NET) += tests/qtest/fuzz/virtio_net_fuzz.o
+fuzz-obj-$(CONFIG_SCSI) += tests/qtest/fuzz/virtio_scsi_fuzz.o
 
 FUZZ_CFLAGS += -I$(SRC_PATH)/tests -I$(SRC_PATH)/tests/qtest
 
-- 
2.25.3



[PULL 3/8] tests/fuzz: Add missing space in test description

2020-05-19 Thread Stefan Hajnoczi
From: Philippe Mathieu-Daudé 

Signed-off-by: Philippe Mathieu-Daudé 
Message-id: 20200514143433.18569-4-phi...@redhat.com
Signed-off-by: Stefan Hajnoczi 
---
 tests/qtest/fuzz/i440fx_fuzz.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/qtest/fuzz/i440fx_fuzz.c b/tests/qtest/fuzz/i440fx_fuzz.c
index ab5f112584..96fed9ff12 100644
--- a/tests/qtest/fuzz/i440fx_fuzz.c
+++ b/tests/qtest/fuzz/i440fx_fuzz.c
@@ -159,7 +159,7 @@ static void register_pci_fuzz_targets(void)
 /* Uses simple qtest commands and reboots to reset state */
 fuzz_add_target(&(FuzzTarget){
 .name = "i440fx-qtest-reboot-fuzz",
-.description = "Fuzz the i440fx using raw qtest commands and"
+.description = "Fuzz the i440fx using raw qtest commands and "
"rebooting after each run",
 .get_init_cmdline = i440fx_argv,
 .fuzz = i440fx_fuzz_qtest});
@@ -167,7 +167,7 @@ static void register_pci_fuzz_targets(void)
 /* Uses libqos and forks to prevent state leakage */
 fuzz_add_qos_target(&(FuzzTarget){
 .name = "i440fx-qos-fork-fuzz",
-.description = "Fuzz the i440fx using raw qtest commands and"
+.description = "Fuzz the i440fx using raw qtest commands and "
"rebooting after each run",
 .pre_vm_init = _init,
 .fuzz = i440fx_fuzz_qos_fork,},
@@ -182,7 +182,7 @@ static void register_pci_fuzz_targets(void)
  */
 fuzz_add_qos_target(&(FuzzTarget){
 .name = "i440fx-qos-noreset-fuzz",
-.description = "Fuzz the i440fx using raw qtest commands and"
+.description = "Fuzz the i440fx using raw qtest commands and "
"rebooting after each run",
 .fuzz = i440fx_fuzz_qos,},
 "i440FX-pcihost",
-- 
2.25.3



Re: [virtio-dev] Re: Fwd: Qemu Support for Virtio Video V4L2 driver

2020-05-19 Thread Keiichi Watanabe
Hi Nicolas,

On Fri, May 15, 2020 at 8:38 AM Nicolas Dufresne  wrote:
>
> Le lundi 11 mai 2020 à 20:49 +0900, Keiichi Watanabe a écrit :
> > Hi,
> >
> > Thanks Saket for your feedback. As Dmitry mentioned, we're focusing on
> > video encoding and decoding, not camera. So, my reply was about how to
> > implement paravirtualized video codec devices.
> >
> > On Mon, May 11, 2020 at 8:25 PM Dmitry Sepp 
> > wrote:
> > > Hi Saket,
> > >
> > > On Montag, 11. Mai 2020 13:05:53 CEST Saket Sinha wrote:
> > > > Hi Keiichi,
> > > >
> > > > I do not support the approach of  QEMU implementation forwarding
> > > > requests to the host's vicodec module since  this can limit the scope
> > > > of the virtio-video device only for testing,
> > >
> > > That was my understanding as well.
> >
> > Not really, because the API which the vicodec provides is the V4L2 stateful
> > decoder interface [1], which is also used by other video drivers on
> > Linux.
> > The difference between vicodec and actual device drivers is that
> > vicodec performs decoding in the kernel space without using special
> > video devices. In other words, vicodec is a software decoder in kernel
> > space which provides the same interface with actual video drivers.
> > Thus, if the QEMU implementation can forward virtio-video requests to
> > vicodec, it can forward them to the actual V4L2 video decoder devices
> > as well and VM gets access to a paravirtualized video device.
> >
> > The reason why we discussed vicodec in the previous thread was it'll
> > allow us to test the virtio-video driver without hardware requirement.
> >
> > [1] https://www.kernel.org/doc/html/latest/media/uapi/v4l/dev-decoder.html
> >
> > > > which instead can be used with multiple use cases such as -
> > > >
> > > > 1. VM gets access to paravirtualized  camera devices which shares the
> > > > video frames input through actual HW camera attached to Host.
> > >
> > > This use-case is out of the scope of virtio-video. Initially I had a plan 
> > > to
> > > support capture-only streams like camera as well, but later the decision 
> > > was
> > > made upstream that camera should be implemented as separate device type. 
> > > We
> > > still plan to implement a simple frame capture capability as a downstream
> > > patch though.
> > >
> > > > 2. If Host has multiple video devices (especially in ARM SOCs over
> > > > MIPI interfaces or USB), different VM can be started or hotplugged
> > > > with selective video streams from actual HW video devices.
> > >
> > > We do support this in our device implementation. But spec in general has 
> > > no
> > > requirements or instructions regarding this. And it is in fact flexible
> > > enough
> > > to provide abstraction on top of several HW devices.
> > >
> > > > Also instead of using libraries like Gstreamer in Host userspace, they
> > > > can also be used inside the VM userspace after getting access to
> > > > paravirtualized HW camera devices .
> >
> > Regarding Gstreamer, I intended this video decoding API [2]. If QEMU
> > can translate virtio-video requests to this API, we can easily support
> > multiple platforms.
> > I'm not sure how feasible it is though, as I have no experience of
> > using this API by myself...
>
> Not sure which API you aim at exactly, but what one needs to remember is that
> mapping a virtio-video CODEC on top of VAAPI, V4L2 Stateless, NVDEC or another
> type of "stateless" CODEC is not trivial and can't be done without userspace.
> Notably because we don't want to do bitstream parsing in the kernel on the
> main CPU, as security would otherwise be very hard to guarantee. The other
> drivers using the same API as virtio-video do bitstream parsing on a dedicated
> co-processor (through firmware blobs though).
>
> Having bridges between virtio-video, qemu and some abstraction library like
> FFMPEG or GStreamer is certainly the best solution if you want to virtualize
> any type of HW accelerated decoder or if you need to virtualize something
> proprietary (like NVDEC). Please shout if you need help.
>

Yeah, I meant we should map virtio-video commands to a set of
abstracted userspace APIs to avoid having much platform-dependent code
in QEMU.
This is the same as what we implemented in crosvm, a VMM on
ChromiumOS. Crosvm's video device translates virtio-video commands
into our own video decoding APIs [1, 2], which support VAAPI, V4L2
stateful and V4L2 stateless. Unfortunately, since our library is
highly dependent on Chrome, we cannot reuse it for QEMU.

So, I agree that using FFMPEG or GStreamer is a good idea. Probably,
the APIs in my previous link weren't meant for this purpose.
Nicolas, do you know any good references for FFMPEG or GStreamer's
abstracted video decoding APIs? Then I may be able to think about how
the virtio-video protocol can be mapped to them.

[1] libvda's C interface:
https://chromium.googlesource.com/chromiumos/platform2/+/refs/heads/master/arc/vm/libvda/libvda_decode.h
[2] libvda's Rust interface:
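
For illustration, the kind of QEMU-side abstraction being discussed could look
roughly like the sketch below: the virtio-video command handlers would only
talk to a small backend vtable, and the GStreamer/FFmpeg/libvda specifics would
live behind it. All names here are hypothetical; this is not existing QEMU or
virtio-video code.

    /* Hypothetical decoder backend abstraction for a virtio-video device. */
    #include <stdint.h>
    #include <stddef.h>

    typedef struct VideoDecBackend VideoDecBackend;

    typedef struct VideoDecOps {
        int  (*create_stream)(VideoDecBackend *be, uint32_t stream_id,
                              uint32_t coded_fourcc);
        int  (*queue_bitstream)(VideoDecBackend *be, uint32_t stream_id,
                                const void *data, size_t len);
        int  (*dequeue_frame)(VideoDecBackend *be, uint32_t stream_id,
                              void *frame, size_t *len);
        void (*destroy_stream)(VideoDecBackend *be, uint32_t stream_id);
    } VideoDecOps;

    struct VideoDecBackend {
        const VideoDecOps *ops;   /* one table per backend: gst, ffmpeg, vda */
        void *opaque;             /* backend-private state */
    };

    /* A virtio-video RESOURCE_QUEUE handler would then just dispatch: */
    static int handle_resource_queue(VideoDecBackend *be, uint32_t stream_id,
                                     const void *data, size_t len)
    {
        return be->ops->queue_bitstream(be, stream_id, data, len);
    }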

[PATCH] hw/arm/virt: Fix PL061 node name and properties

2020-05-19 Thread Geert Uytterhoeven
Make the created node comply with the PL061 Device Tree bindings:
  - Use generic node name "gpio" instead of "pl061",
  - Add missing "#interrupt-cells" and "interrupt-controller"
properties.

Signed-off-by: Geert Uytterhoeven 
---
Split off from "[PATCH QEMU v2 2/5] ARM: PL061: Extract pl061_create_fdt()"
(https://lore.kernel.org/r/20200423090118.11199-3-geert+rene...@glider.be).
---
 hw/arm/virt.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 7dc96abf72cf2b9a..99593d7bce4d85cb 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -818,13 +818,15 @@ static void create_gpio(const VirtMachineState *vms)
  qdev_get_gpio_in(vms->gic, irq));
 
 uint32_t phandle = qemu_fdt_alloc_phandle(vms->fdt);
-nodename = g_strdup_printf("/pl061@%" PRIx64, base);
+nodename = g_strdup_printf("/gpio@%" PRIx64, base);
 qemu_fdt_add_subnode(vms->fdt, nodename);
 qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg",
  2, base, 2, size);
 qemu_fdt_setprop(vms->fdt, nodename, "compatible", compat, sizeof(compat));
 qemu_fdt_setprop_cell(vms->fdt, nodename, "#gpio-cells", 2);
 qemu_fdt_setprop(vms->fdt, nodename, "gpio-controller", NULL, 0);
+qemu_fdt_setprop_cell(vms->fdt, nodename, "#interrupt-cells", 2);
+qemu_fdt_setprop(vms->fdt, nodename, "interrupt-controller", NULL, 0);
 qemu_fdt_setprop_cells(vms->fdt, nodename, "interrupts",
GIC_FDT_IRQ_TYPE_SPI, irq,
GIC_FDT_IRQ_FLAGS_LEVEL_HI);
-- 
2.17.1




Re: [PATCH v2 5/5] vhost: add device started check in migration set log

2020-05-19 Thread Dima Stepanov
On Wed, May 13, 2020 at 01:56:18PM +0800, Jason Wang wrote:
> 
> On 2020/5/13 下午12:15, Michael S. Tsirkin wrote:
> >On Tue, May 12, 2020 at 12:35:30PM +0300, Dima Stepanov wrote:
> >>On Tue, May 12, 2020 at 11:32:50AM +0800, Jason Wang wrote:
> >>>On 2020/5/11 下午5:25, Dima Stepanov wrote:
> On Mon, May 11, 2020 at 11:15:53AM +0800, Jason Wang wrote:
> >On 2020/4/30 下午9:36, Dima Stepanov wrote:
> >>If vhost-user daemon is used as a backend for the vhost device, then we
> >>should consider a possibility of disconnect at any moment. If such
> >>disconnect happened in the vhost_migration_log() routine the vhost
> >>device structure will be clean up.
> >>At the start of the vhost_migration_log() function there is a check:
> >>   if (!dev->started) {
> >>   dev->log_enabled = enable;
> >>   return 0;
> >>   }
> >>To be consistent with this check add the same check after calling the
> >>vhost_dev_set_log() routine. This in general help not to break a
> >>migration due the assert() message. But it looks like that this code
> >>should be revised to handle these errors more carefully.
> >>
> >>In case of vhost-user device backend the fail paths should consider the
> >>state of the device. In this case we should skip some function calls
> >>during rollback on the error paths, so not to get the NULL dereference
> >>errors.
> >>
> >>Signed-off-by: Dima Stepanov
> >>---
> >>  hw/virtio/vhost.c | 39 +++
> >>  1 file changed, 35 insertions(+), 4 deletions(-)
> >>
> >>diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> >>index 3ee50c4..d5ab96d 100644
> >>--- a/hw/virtio/vhost.c
> >>+++ b/hw/virtio/vhost.c
> >>@@ -787,6 +787,17 @@ static int vhost_dev_set_features(struct vhost_dev 
> >>*dev,
> >>  static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
> >>  {
> >>  int r, i, idx;
> >>+
> >>+if (!dev->started) {
> >>+/*
> >>+ * If vhost-user daemon is used as a backend for the
> >>+ * device and the connection is broken, then the vhost_dev
> >>+ * structure will be reset all its values to 0.
> >>+ * Add additional check for the device state.
> >>+ */
> >>+return -1;
> >>+}
> >>+
> >>  r = vhost_dev_set_features(dev, enable_log);
> >>  if (r < 0) {
> >>  goto err_features;
> >>@@ -801,12 +812,19 @@ static int vhost_dev_set_log(struct vhost_dev 
> >>*dev, bool enable_log)
> >>  }
> >>  return 0;
> >>  err_vq:
> >>-for (; i >= 0; --i) {
> >>+/*
> >>+ * Disconnect with the vhost-user daemon can lead to the
> >>+ * vhost_dev_cleanup() call which will clean up vhost_dev
> >>+ * structure.
> >>+ */
> >>+for (; dev->started && (i >= 0); --i) {
> >>  idx = dev->vhost_ops->vhost_get_vq_index(
> >Why need the check of dev->started here, can started be modified outside
> >mainloop? If yes, I don't get the check of !dev->started in the 
> >beginning of
> >this function.
> >
> No, dev->started can't change outside the mainloop. The main problem is
> only for the vhost_user_blk daemon. Consider the case when we
> successfully pass the dev->started check at the beginning of the
> function, but after it we hit the disconnect on the next call on the
> second or third iteration:
>   r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, enable_log);
> The unix socket backend device will call the disconnect routine for this
> device and reset the structure. So the structure will be reset (and
> dev->started set to false) inside this set_addr() call.
> >>>I still don't get here. I think the disconnect can not happen in the middle
> >>>of vhost_dev_set_log() since both of them were running in mainloop. And 
> >>>even
> >>>if it can, we probably need other synchronization mechanism other than
> >>>simple check here.
> >>The disconnect doesn't happen in a separate thread, it happens in this
> >>routine, inside vhost_dev_set_log, when for instance the vhost_user_write()
> >>call fails:
> >>   vhost_user_set_log_base()
> >> vhost_user_write()
> >>   vhost_user_blk_disconnect()
> >> vhost_dev_cleanup()
> >>   vhost_user_backend_cleanup()
> >>So the point is that if we somehow got a disconnect with the
> >>vhost-user-blk daemon before the vhost_user_write() call then it will
> >>continue clean up by running vhost_user_blk_disconnect() function. I
> >>wrote a more detailed backtrace stack in the separate thread, which is
> >>pretty similar to what we have here:
> >>   Re: [PATCH v2 4/5] vhost: check vring address before calling unmap
> >>The places are different but the problem is pretty similar.
> >>
> >>So if vhost-user commands handshake then 

Re: [PATCH 07/10] softfloat: Inline float64 compare specializations

2020-05-19 Thread Alex Bennée


Richard Henderson  writes:

> Replace the float64 compare specializations with inline functions
> that call the standard float64_compare{,_quiet} functions.
> Use bool as the return type.
>
> Signed-off-by: Richard Henderson 
> ---
>  include/fpu/softfloat.h   |  49 ++--
>  fpu/softfloat.c   | 220 --
>  target/s390x/vec_fpu_helper.c |   2 +-
>  3 files changed, 42 insertions(+), 229 deletions(-)



Reviewed-by: Alex Bennée 


>
> diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
> index 4d1af6ab45..281f0fd971 100644
> --- a/include/fpu/softfloat.h
> +++ b/include/fpu/softfloat.h
> @@ -532,14 +532,6 @@ float64 float64_rem(float64, float64, float_status 
> *status);
>  float64 float64_muladd(float64, float64, float64, int, float_status *status);
>  float64 float64_sqrt(float64, float_status *status);
>  float64 float64_log2(float64, float_status *status);
> -int float64_eq(float64, float64, float_status *status);
> -int float64_le(float64, float64, float_status *status);
> -int float64_lt(float64, float64, float_status *status);
> -int float64_unordered(float64, float64, float_status *status);
> -int float64_eq_quiet(float64, float64, float_status *status);
> -int float64_le_quiet(float64, float64, float_status *status);
> -int float64_lt_quiet(float64, float64, float_status *status);
> -int float64_unordered_quiet(float64, float64, float_status *status);
>  FloatRelation float64_compare(float64, float64, float_status *status);
>  FloatRelation float64_compare_quiet(float64, float64, float_status *status);
>  float64 float64_min(float64, float64, float_status *status);
> @@ -615,6 +607,47 @@ static inline float64 float64_set_sign(float64 a, int 
> sign)
>  | ((int64_t)sign << 63));
>  }
>  
> +static inline bool float64_eq(float64 a, float64 b, float_status *s)
> +{
> +return float64_compare(a, b, s) == float_relation_equal;
> +}
> +
> +static inline bool float64_le(float64 a, float64 b, float_status *s)
> +{
> +return float64_compare(a, b, s) <= float_relation_equal;
> +}
> +
> +static inline bool float64_lt(float64 a, float64 b, float_status *s)
> +{
> +return float64_compare(a, b, s) < float_relation_equal;
> +}
> +
> +static inline bool float64_unordered(float64 a, float64 b, float_status *s)
> +{
> +return float64_compare(a, b, s) == float_relation_unordered;
> +}
> +
> +static inline bool float64_eq_quiet(float64 a, float64 b, float_status *s)
> +{
> +return float64_compare_quiet(a, b, s) == float_relation_equal;
> +}
> +
> +static inline bool float64_le_quiet(float64 a, float64 b, float_status *s)
> +{
> +return float64_compare_quiet(a, b, s) <= float_relation_equal;
> +}
> +
> +static inline bool float64_lt_quiet(float64 a, float64 b, float_status *s)
> +{
> +return float64_compare_quiet(a, b, s) < float_relation_equal;
> +}
> +
> +static inline bool float64_unordered_quiet(float64 a, float64 b,
> +   float_status *s)
> +{
> +return float64_compare_quiet(a, b, s) == float_relation_unordered;
> +}
> +
>  #define float64_zero make_float64(0)
>  #define float64_half make_float64(0x3fe0LL)
>  #define float64_one make_float64(0x3ff0LL)
> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> index f6bfc40c97..5d7fc2c17a 100644
> --- a/fpu/softfloat.c
> +++ b/fpu/softfloat.c
> @@ -4941,226 +4941,6 @@ float64 float64_log2(float64 a, float_status *status)
>  return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
>  }
>  
> -/*
> -| Returns 1 if the double-precision floating-point value `a' is equal to the
> -| corresponding value `b', and 0 otherwise.  The invalid exception is raised
> -| if either operand is a NaN.  Otherwise, the comparison is performed
> -| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
> -**/
> -
> -int float64_eq(float64 a, float64 b, float_status *status)
> -{
> -uint64_t av, bv;
> -a = float64_squash_input_denormal(a, status);
> -b = float64_squash_input_denormal(b, status);
> -
> -if (( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
> - || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
> -   ) {
> -float_raise(float_flag_invalid, status);
> -return 0;
> -}
> -av = float64_val(a);
> -bv = float64_val(b);
> -return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
> -
> -}
> -
> -/*
> -| Returns 1 if the double-precision floating-point value `a' is less than or
> -| equal to the corresponding value `b', and 0 otherwise.  The invalid
> -| exception is raised if either operand is a NaN.  The comparison is 
> performed
> -| according 

Re: [PATCH 0/9] ppc/pnv: Introduce the XIVE2 and PHB5 controllers for the POWER10 chip

2020-05-19 Thread Greg Kurz
On Wed, 13 May 2020 17:11:00 +0200
Cédric Le Goater  wrote:

> Hello,
> 
> This series completes the PowerNV POWER10 machine with the XIVE2
> interrupt controller and the PHB5 PCI host bridge controller.
> 
> The XIVE2 interrupt controller of the POWER10 processor follows the
> same logic than on POWER9 but the HW interface has been largely
> reviewed. To avoid adding too much complexity in the P9 XIVE models,
> a new XIVE2 core framework is introduced.
> 
> The PHB5 and PHB4 controllers are very similar. Not much changes there
> apart from the mapping addresses and ids. There is more to come on
> PHB5 regarding interrupt offload on the interrupt controller, but this
> is for another round.
> 
> This series also adds basic OCC and CPU Quad models which are not very
> much exercised by the firmware.
> 
> Thanks,
> 
> C.
> 

This series is huge and it requires great knowledge of the HW, which I don't
have (who does? ;-), to do a real review. I'll stick to finding obvious errors
and nits...

> Cédric Le Goater (9):
>   ppc/xive: Export PQ get/set routines
>   ppc/xive: Export xive_presenter_notify()
>   ppc/xive2: Introduce a XIVE2 core framework
>   ppc/xive2: Introduce a presenter matching routine
>   ppc/pnv: Add a XIVE2 controller to the POWER10 chip.
>   ppc/pnv: Add a OCC model for POWER10
>   ppc/pnv: Add POWER10 quads
>   ppc/pnv: Add model for POWER9 PHB5 PCIe Host bridge
>   ppc/psi: Add support for StoreEOI and 64k ESB pages (POWER10)
> 
>  hw/intc/pnv_xive2_regs.h   |  428 +++
>  include/hw/pci-host/pnv_phb4.h |   11 +
>  include/hw/ppc/pnv.h   |   30 +
>  include/hw/ppc/pnv_occ.h   |2 +
>  include/hw/ppc/pnv_xive.h  |   71 ++
>  include/hw/ppc/pnv_xscom.h |   12 +
>  include/hw/ppc/xive.h  |8 +
>  include/hw/ppc/xive2.h |   93 ++
>  include/hw/ppc/xive2_regs.h|  198 
>  hw/intc/pnv_xive2.c| 2026 
>  hw/intc/spapr_xive_kvm.c   |8 +-
>  hw/intc/xive.c |   14 +-
>  hw/intc/xive2.c|  756 
>  hw/pci-host/pnv_phb4_pec.c |   44 +
>  hw/ppc/pnv.c   |  243 +++-
>  hw/ppc/pnv_occ.c   |   17 +
>  hw/ppc/pnv_psi.c   |   32 +-
>  hw/intc/Makefile.objs  |4 +-
>  18 files changed, 3974 insertions(+), 23 deletions(-)
>  create mode 100644 hw/intc/pnv_xive2_regs.h
>  create mode 100644 include/hw/ppc/xive2.h
>  create mode 100644 include/hw/ppc/xive2_regs.h
>  create mode 100644 hw/intc/pnv_xive2.c
>  create mode 100644 hw/intc/xive2.c
> 




Re: [PATCH v2 5/9] block/io: expand in_flight inc/dec section: simple cases

2020-05-19 Thread Kevin Wolf
Am 06.05.2020 um 09:02 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 27.04.2020 17:39, Vladimir Sementsov-Ogievskiy wrote:
> > It's safer to expand in_flight request to start before enter to
> > coroutine in synchronous wrappers, due to the following (theoretical)
> > problem:
> > 
> > Consider write.
> > It's possible, that qemu_coroutine_enter only schedules execution,
> > assume such case.
> > 
> > Then we may possibly have the following:
> > 
> > 1. Somehow check that we are not in drained section in outer code.
> > 
> > 2. Call bdrv_pwritev(), assuming that it will increase in_flight, which
> > will protect us from starting drained section.
> > 
> > 3. It calls bdrv_prwv_co() -> bdrv_coroutine_enter() (not yet increased
> > in_flight).
> > 
> > 4. Assume coroutine not yet actually entered, only scheduled, and we go
> > to some code, which starts drained section (as in_flight is zero).
> > 
> > 5. Scheduled coroutine starts, and blindly increases in_flight, and we
> > are in drained section with in_flight request.
> > 
> > Signed-off-by: Vladimir Sementsov-Ogievskiy 
> 
> Very interesting: this patch breaks test-replication. It hangs:
> 
> (gdb) thr a a bt
> 
> Thread 2 (Thread 0x7eff256cd700 (LWP 2843)):
> #0  0x7eff2f5fd1fd in syscall () from /lib64/libc.so.6
> #1  0x55af9a9a4f11 in qemu_futex_wait (f=0x55af9aa6f758 
> , val=4294967295) at 
> /work/src/qemu/up-expand-bdrv-in_flight-bounds/include/qemu/futex.h:29
> #2  0x55af9a9a50d5 in qemu_event_wait (ev=0x55af9aa6f758 
> ) at util/qemu-thread-posix.c:459
> #3  0x55af9a9bd20d in call_rcu_thread (opaque=0x0) at util/rcu.c:260
> #4  0x55af9a9a5288 in qemu_thread_start (args=0x55af9c4f1b80) at 
> util/qemu-thread-posix.c:519
> #5  0x7eff2f6d44c0 in start_thread () from /lib64/libpthread.so.0
> #6  0x7eff2f602553 in clone () from /lib64/libc.so.6
> 
> Thread 1 (Thread 0x7eff25820a80 (LWP 2842)):
> #0  0x7eff2f5f7bd6 in ppoll () from /lib64/libc.so.6
> #1  0x55af9a99e405 in qemu_poll_ns (fds=0x55af9c52a830, nfds=1, 
> timeout=-1) at util/qemu-timer.c:335
> #2  0x55af9a9a1cab in fdmon_poll_wait (ctx=0x55af9c526890, 
> ready_list=0x7ffc73e8c5d0, timeout=-1) at util/fdmon-poll.c:79
> #3  0x55af9a9a160c in aio_poll (ctx=0x55af9c526890, blocking=true) at 
> util/aio-posix.c:600
> #4  0x55af9a8f0bb0 in bdrv_do_drained_begin (bs=0x55af9c52a8d0, 
> recursive=false, parent=0x0, ignore_bds_parents=false, poll=true) at 
> block/io.c:429
> #5  0x55af9a8f0c95 in bdrv_drained_begin (bs=0x55af9c52a8d0) at 
> block/io.c:435
> #6  0x55af9a8dc6a8 in blk_drain (blk=0x55af9c542c10) at 
> block/block-backend.c:1681
> #7  0x55af9a8da0b6 in blk_unref (blk=0x55af9c542c10) at 
> block/block-backend.c:473
> #8  0x55af9a8eb5e7 in mirror_exit_common (job=0x55af9c6c45c0) at 
> block/mirror.c:667
> #9  0x55af9a8eb9c1 in mirror_prepare (job=0x55af9c6c45c0) at 
> block/mirror.c:765
> #10 0x55af9a87cd65 in job_prepare (job=0x55af9c6c45c0) at job.c:781
> #11 0x55af9a87b62a in job_txn_apply (job=0x55af9c6c45c0, 
> fn=0x55af9a87cd28 ) at job.c:158
> #12 0x55af9a87cdee in job_do_finalize (job=0x55af9c6c45c0) at job.c:798
> #13 0x55af9a87cfb5 in job_completed_txn_success (job=0x55af9c6c45c0) at 
> job.c:852
> #14 0x55af9a87d055 in job_completed (job=0x55af9c6c45c0) at job.c:865
> #15 0x55af9a87d0a8 in job_exit (opaque=0x55af9c6c45c0) at job.c:885
> #16 0x55af9a99b981 in aio_bh_call (bh=0x55af9c547440) at util/async.c:136
> #17 0x55af9a99ba8b in aio_bh_poll (ctx=0x55af9c526890) at util/async.c:164
> #18 0x55af9a9a17ff in aio_poll (ctx=0x55af9c526890, blocking=true) at 
> util/aio-posix.c:650
> #19 0x55af9a8f7011 in bdrv_flush (bs=0x55af9c53b900) at block/io.c:3019
> #20 0x55af9a874351 in bdrv_close (bs=0x55af9c53b900) at block.c:4252
> #21 0x55af9a874ca3 in bdrv_delete (bs=0x55af9c53b900) at block.c:4498
> #22 0x55af9a877862 in bdrv_unref (bs=0x55af9c53b900) at block.c:5866
> #23 0x55af9a870837 in bdrv_root_unref_child (child=0x55af9c6c4430) at 
> block.c:2684
> #24 0x55af9a8da9a2 in blk_remove_bs (blk=0x55af9c547bd0) at 
> block/block-backend.c:803
> #25 0x55af9a8d9e54 in blk_delete (blk=0x55af9c547bd0) at 
> block/block-backend.c:422
> #26 0x55af9a8da0f8 in blk_unref (blk=0x55af9c547bd0) at 
> block/block-backend.c:477
> #27 0x55af9a86a6f1 in teardown_secondary () at 
> tests/test-replication.c:392
> #28 0x55af9a86aac1 in test_secondary_stop () at 
> tests/test-replication.c:490
> #29 0x7eff2fd7df7e in g_test_run_suite_internal () from 
> /lib64/libglib-2.0.so.0
> #30 0x7eff2fd7dd24 in g_test_run_suite_internal () from 
> /lib64/libglib-2.0.so.0
> #31 0x7eff2fd7dd24 in g_test_run_suite_internal () from 
> /lib64/libglib-2.0.so.0
> #32 0x7eff2fd7e46a in g_test_run_suite () from /lib64/libglib-2.0.so.0
> #33 0x7eff2fd7e485 in g_test_run () from /lib64/libglib-2.0.so.0
> #34 0x55af9a86b19c in main (argc=1, 

Re: [PATCH v3 4/7] migration/block-dirty-bitmap: fix bitmaps pre-blockdev migration during mirror job

2020-05-19 Thread Vladimir Sementsov-Ogievskiy

18.05.2020 23:36, Eric Blake wrote:

On 5/15/20 7:40 AM, Vladimir Sementsov-Ogievskiy wrote:

Important thing for bitmap migration is to select destination block
node to obtain the migrated bitmap.

Prepatch, on source we use bdrv_get_device_or_node_name() to identify
the node, and on target we do bdrv_lookup_bs.
bdrv_get_device_or_node_name() returns blk name only for direct
children of blk. So, bitmaps of direct children of blks are migrated by
blk name and others - by node name.

Old libvirt is unprepared to bitmap migration by node-name,
node-names are mostly auto-generated. So actually only migration by blk
name works for it.

Newer libvirt will use new interface (which will be added soon) to
specify node-mapping for bitmaps migration explicitly. Still, let's
improve the current behavior a bit.

Now, consider classic libvirt migrations assisted by mirror block job:
mirror block job inserts filter, so our source is not a direct child of
blk, and bitmaps are migrated by node-names. And this just don't work


either "won't" or "doesn't"


with auto-generated node names


trailing '.'



Let's fix it by allowing use blk-name even if some implicit filters are
inserted.


s/allowing use/using/



Note2: we, of course, can't skip filters and use blk name to migrate
bitmaps in filtered node by blk name for this blk if these filters have
named bitmaps which should be migrated.

Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1652424
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  migration/block-dirty-bitmap.c | 39 +-
  1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index 7e93718086..5d3a7d2b07 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -319,14 +319,48 @@ static int init_dirty_bitmap_migration(void)
  {
  BlockDriverState *bs;
  DirtyBitmapMigBitmapState *dbms;
+    GHashTable *handled_by_blk = g_hash_table_new(NULL, NULL);
+    BlockBackend *blk;
  dirty_bitmap_mig_state.bulk_completed = false;
  dirty_bitmap_mig_state.prev_bs = NULL;
  dirty_bitmap_mig_state.prev_bitmap = NULL;
  dirty_bitmap_mig_state.no_bitmaps = false;
+    /*
+ * Use blockdevice name for direct (or filtered) children of named block
+ * backends.
+ */
+    for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
+    const char *name = blk_name(blk);
+
+    if (!name || strcmp(name, "") == 0) {
+    continue;
+    }
+
+    bs = blk_bs(blk);
+
+    /* Skip filters without bitmaos */
+    while (bs && bs->drv && bs->drv->is_filter &&
+   !bdrv_has_named_bitmaps(bs))
+    {
+    bs = bs->backing->bs ?: bs->file->bs;


Is this correct, or should it be:

bs = bs->backing ? bs->backing->bs : bs->file->bs;


Hmm, yes, otherwise it would crash on a file-based filter :)



Otherwise looks reasonable, but I'm hesitant to include it in today's bitmap 
pull request in order to give it more review/testing time.  It should be ready 
for a pull request next week, though.




--
Best regards,
Vladimir



Re: [PATCH v2 6/9] block/io: expand in_flight inc/dec section: block-status

2020-05-19 Thread Kevin Wolf
Am 02.05.2020 um 00:00 hat Eric Blake geschrieben:
> On 4/27/20 9:39 AM, Vladimir Sementsov-Ogievskiy wrote:
> > It's safer to expand in_flight request to start before enter to
> > coroutine in synchronous wrappers and end after BDRV_POLL_WHILE loop.
> > Note that qemu_coroutine_enter may only schedule the coroutine in some
> > circumstances.
> 
> Wording suggestion:
> 
> It's safer to expand the region protected by an in_flight request to begin
> in the synchronous wrapper and end after the BDRV_POLL_WHILE loop.  Leaving
> the in_flight request in the coroutine itself risks a race where calling
> qemu_coroutine_enter() may have only scheduled, rather than started, the
> coroutine, allowing some other thread a chance to not realize an operation
> is in flight.
> 
> > 
> > block-status requests are complex, they involve querying different
> > block driver states across backing chain. Let's expand only in_flight
> > section for the top bs, keeping other sections as is.
> 
> block-status requests are complex, involving a query of different block
> driver states across the backing chain.  Let's expand only the in_flight
> section for the top bs, and keep the other sections as-is.
> 
> I'd welcome Kevin's review on my next comment, but if I'm correct, I think
> we can further add the following justification to the commit message:
> 
> Gathering block status only requires reads from the block device, and
> backing devices are typically read-only, so losing any in_flight race on a
> backing device is less likely to cause problems with concurrent
> modifications on the overall backing chain.

Actually, my question is what we gain by increasing in_flight only for
the top level. It feels wrong to me, though maybe it doesn't actually
lead to bugs because in practice, we completely drain the parents
instead of just draining requests going to one specific child.

But as this patch shows, not increasing in_flight in some cases is a lot
more work than doing it, and it's harder to understand why it's correct.
So why not simply increase it unconditionally?

This is how other requests work as well. If you make a read request to a
qcow2 image, you'll get in_flight increased for both the qcow2 node and
the file-posix node.

Kevin
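
For reference, the bracketing pattern Kevin describes is roughly the
following. This is a simplified sketch, not the actual block/io.c code, but
bdrv_inc_in_flight(), bdrv_dec_in_flight(), bdrv_coroutine_enter() and
BDRV_POLL_WHILE() are the real primitives involved:

    /* Sketch: raise in_flight before the coroutine is created and drop it
     * only after the poll loop has seen the request finish, so a drained
     * section cannot begin in between. */
    typedef struct SyncRwCo {
        BdrvChild *child;
        int64_t offset;
        QEMUIOVector *qiov;
        int ret;
        bool done;
    } SyncRwCo;

    static int bdrv_rw_sync_sketch(BdrvChild *child, int64_t offset,
                                   QEMUIOVector *qiov, CoroutineEntry *entry)
    {
        BlockDriverState *bs = child->bs;
        SyncRwCo rwco = { .child = child, .offset = offset, .qiov = qiov };
        Coroutine *co;

        bdrv_inc_in_flight(bs);                 /* before coroutine entry */
        co = qemu_coroutine_create(entry, &rwco);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, !rwco.done);        /* in_flight still held here */
        bdrv_dec_in_flight(bs);                 /* dropped only once done */

        return rwco.ret;
    }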




Re: [PATCH v2 5/9] block/io: expand in_flight inc/dec section: simple cases

2020-05-19 Thread Kevin Wolf
Am 27.04.2020 um 16:39 hat Vladimir Sementsov-Ogievskiy geschrieben:
> It's safer to expand in_flight request to start before enter to
> coroutine in synchronous wrappers, due to the following (theoretical)
> problem:
> 
> Consider write.
> It's possible, that qemu_coroutine_enter only schedules execution,
> assume such case.
> 
> Then we may possibly have the following:
> 
> 1. Somehow check that we are not in drained section in outer code.
> 
> 2. Call bdrv_pwritev(), assuming that it will increase in_flight, which
> will protect us from starting drained section.
> 
> 3. It calls bdrv_prwv_co() -> bdrv_coroutine_enter() (not yet increased
> in_flight).
> 
> 4. Assume coroutine not yet actually entered, only scheduled, and we go
> to some code, which starts drained section (as in_flight is zero).
> 
> 5. Scheduled coroutine starts, and blindly increases in_flight, and we
> are in drained section with in_flight request.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 

> diff --git a/block/io.c b/block/io.c
> index 061f3f2590..a91d8c1e21 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -1511,7 +1511,8 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child,
>  return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
>  }
>  
> -int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
> +/* To be called between exactly one pair of bdrv_inc/dec_in_flight() */

You have lots of comments like this one. Isn't this condition too
strict, though?

In the BlockBackend layer, it needs to be true because
blk_wait_while_drained() decreases in_flight only once (which is an ugly
hack, honestly, but it works...). It's comparable to how
AIO_WAIT_WHILE() relies on having locked the context exactly once even
though it is a recursive lock, because it wants to drop the lock
temporarily.

I don't think the same reasoning applies to BDS in_flight, does it?

We can potentially simplify the code if we don't have to fulfill the
condition.

Kevin




Re: [PATCH v2 0/9] block/io: safer inc/dec in_flight sections

2020-05-19 Thread Vladimir Sementsov-Ogievskiy

27.04.2020 17:38, Vladimir Sementsov-Ogievskiy wrote:

Hi all!

This is inspired by Kevin's
"block: Fix blk->in_flight during blk_wait_while_drained()" series.

So, like it's now done for block-backends, let's expand
in_flight-protected sections for bdrv_ interfaces, including
coroutine_enter and BDRV_POLL_WHILE loop into these sections.



OK, let's postpone this thing.

1. The idea to move inc/dec out of the coroutine seems wrong: it leads to a
deadlock, as shown in the backtrace in my answer to 5/9.

2. The idea to keep a request inside only one pair of inc/dec is probably an
extra restriction in the bdrv layer (I just blindly followed how it was done in
the blk layer by Kevin).

3. We still may have a theoretical race between request start and drained
section start, but it needs another audit and a smarter solution.

So, it seems that we should not apply this series as it is; sorry for the
noise. I think I'll resend my 64bit-block-layer series based on master instead
of this one.

--
Best regards,
Vladimir



Re: [PATCH] xen: fix build without pci passthrough

2020-05-19 Thread Roger Pau Monné
On Mon, May 11, 2020 at 02:40:43PM +0100, Anthony PERARD wrote:
> On Mon, May 04, 2020 at 12:14:43PM +0200, Roger Pau Monne wrote:
> > diff --git a/hw/xen/xen_pt.h b/hw/xen/xen_pt.h
> > index 179775db7b..660dd8a008 100644
> > --- a/hw/xen/xen_pt.h
> > +++ b/hw/xen/xen_pt.h
> > @@ -1,6 +1,7 @@
> >  #ifndef XEN_PT_H
> >  #define XEN_PT_H
> >  
> > +#include "qemu/osdep.h"
> 
> Why do you need osdep?

For CONFIG_XEN_PCI_PASSTHROUGH IIRC.

> 
> >  #include "hw/xen/xen_common.h"
> >  #include "hw/pci/pci.h"
> >  #include "xen-host-pci-device.h"
> > @@ -322,7 +323,13 @@ extern void *pci_assign_dev_load_option_rom(PCIDevice 
> > *dev,
> >  unsigned int domain,
> >  unsigned int bus, unsigned int 
> > slot,
> >  unsigned int function);
> > +
> > +#ifdef CONFIG_XEN_PCI_PASSTHROUGH
> >  extern bool has_igd_gfx_passthru;
> > +#else
> > +# define has_igd_gfx_passthru false
> > +#endif
> 
> I don't quite like the use of a define here. Could you introduce a
> function that returns a bool instead? Defining that function in
> hw/xen/xen.h like xen_enabled() would be fine, I think.

But has_igd_gfx_passthru is defined in xen_pt.c, which is only compiled
if CONFIG_XEN_PCI_PASSTHROUGH is defined, yet the variable is set from
xen-common.c. I think the former is fine, and any attempt to set
has_igd_gfx_passthru without CONFIG_XEN_PCI_PASSTHROUGH will result in
a compile error, which is easier to catch?

Thanks, Roger.
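
For comparison, the accessor-function variant Anthony suggests could look like
the sketch below (hypothetical names, placed wherever both the Xen common code
and xen_pt.c can see it). The inline stubs keep non-passthrough builds
compiling, but, as Roger points out, a bad caller then fails at runtime rather
than at compile time:

    /* Hypothetical accessors replacing the bare has_igd_gfx_passthru flag. */
    #ifdef CONFIG_XEN_PCI_PASSTHROUGH
    bool xen_igd_gfx_pt_enabled(void);
    void xen_igd_gfx_pt_set(bool value, Error **errp);
    #else
    static inline bool xen_igd_gfx_pt_enabled(void)
    {
        return false;
    }
    static inline void xen_igd_gfx_pt_set(bool value, Error **errp)
    {
        if (value) {
            error_setg(errp, "Xen PCI passthrough support not built in");
        }
    }
    #endif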



Re: [RFC PATCH 0/8] RISCV risu porting

2020-05-19 Thread LIU Zhiwei



On 2020/5/12 0:30, Richard Henderson wrote:

On 4/30/20 12:21 AM, LIU Zhiwei wrote:

It's somewhat difficult when I try to support RV32, because it's very
similar to RV64, so I can't make two .risu files like arm.risu and
aarch64.risu.

You could add a command-line parameter, like --be or --sve, for this.
Yes. I should add a "--xlen" parameter to specify the general register 
length in risugen_riscv.pm.


Besides, I should modify current riscv64.risu.

For instructions in RV32 and RV64:

LB RV32_64 imm:12 rs1:5 000 rd:5 011

For RV64 only instructions:

LD RV64 imm:12 rs1:5 011 rd:5 011

So I can generate the RV32 instructions through --pattern '.*RV32.*', and
the RV64 instructions through --pattern '.*RV64.*'.


Best Regards,
Zhiwei


r~




[Bug 1856335] Re: Cache Layout wrong on many Zen Arch CPUs

2020-05-19 Thread Jan Klos


adds "host-cache-info=on,l3-cache=off"

to the qemu -cpu args

I believe l3-cache=off is useless with host-cache-info=on

So  should do what you want.
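
For reference, the full command-line form of that suggestion would be something
like the following (illustrative; pick -smp values matching the real topology):

    qemu-system-x86_64 -accel kvm \
        -cpu host,topoext=on,host-cache-info=on,l3-cache=off \
        -smp 8,sockets=1,cores=8,threads=1 ...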

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1856335

Title:
  Cache Layout wrong on many Zen Arch CPUs

Status in QEMU:
  New

Bug description:
  AMD CPUs have L3 cache per 2, 3 or 4 cores. Currently, TOPOEXT seems
  to always map the cache as if it was a 4-core-per-CCX CPU, which is
  incorrect, and costs upwards of 30% performance (more realistically 10%)
  in L3 cache layout aware applications.

  Example on a 4-CCX CPU (1950X /w 8 Cores and no SMT):

    
  (libvirt <cpu> XML example: model EPYC-IBPB, vendor AMD; markup lost in the archive)

  In windows, coreinfo reports correctly:

    Unified Cache 1, Level 3,8 MB, Assoc  16, LineSize  64
    Unified Cache 6, Level 3,8 MB, Assoc  16, LineSize  64

  On a 3-CCX CPU (3960X /w 6 cores and no SMT):

   
  (libvirt <cpu> XML example: model EPYC-IBPB, vendor AMD; markup lost in the archive)

  in windows, coreinfo reports incorrectly:

  --  Unified Cache  1, Level 3,8 MB, Assoc  16, LineSize  64
  **  Unified Cache  6, Level 3,8 MB, Assoc  16, LineSize  64

  Validated against 3.0, 3.1, 4.1 and 4.2 versions of qemu-kvm.

  With newer Qemu there is a fix (that does behave correctly) in using the dies 
parameter:
   

  The problem is that the dies are exposed differently than how AMD does
  it natively: they are exposed to Windows as sockets, which means that
  if you are not a business user, you can't ever have a machine with
  more than two CCX (6 cores), as consumer versions of Windows only
  support two sockets. (Should this be reported as a separate bug?)

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1856335/+subscriptions



Re: [PATCH v2 5/5] vhost: add device started check in migration set log

2020-05-19 Thread Michael S. Tsirkin
On Fri, May 15, 2020 at 07:54:57PM +0300, Dima Stepanov wrote:
> On Thu, May 14, 2020 at 03:34:24PM +0800, Jason Wang wrote:
> > 
> > On 2020/5/13 下午5:47, Dima Stepanov wrote:
> > >>> case CHR_EVENT_CLOSED:
> > >>> /* a close event may happen during a read/write, but vhost
> > >>>  * code assumes the vhost_dev remains setup, so delay the
> > >>>  * stop & clear to idle.
> > >>>  * FIXME: better handle failure in vhost code, remove bh
> > >>>  */
> > >>> if (s->watch) {
> > >>> AioContext *ctx = qemu_get_current_aio_context();
> > >>>
> > >>> g_source_remove(s->watch);
> > >>> s->watch = 0;
> > >>> qemu_chr_fe_set_handlers(>chr, NULL, NULL, NULL, NULL,
> > >>>  NULL, NULL, false);
> > >>>
> > >>> aio_bh_schedule_oneshot(ctx, chr_closed_bh, opaque);
> > >>> }
> > >>> break;
> > >>>
> > >>>I think it's time we dropped the FIXME and moved the handling to common
> > >>>code. Jason? Marc-André?
> > >>I agree. Just to confirm, do you prefer bh or doing changes like what is
> > >>done in this series? It looks to me bh can have more easier codes.
> > >Could it be a good idea just to do the disconnect in the char device but
> > >postpone the clean up in the vhost-user-blk (or any other vhost-user
> > >device) itself? So we are moving the postpone logic and decision from
> > >the char device to the vhost-user device. One of the ideas I have is as
> > >follows:
> > >   - Put ourselves in the INITIALIZATION state
> > >   - Start these vhost-user "handshake" commands
> > >   - If we got a disconnect error, perform the disconnect, but don't clean
> > > up the device (it will be cleaned up on the roll back). It can be done
> > > by checking the state in the vhost_user_..._disconnect routine or
> > > something like it
> > 
> > 
> > Any issue you saw just using the aio bh as Michael posted above.
> > 
> > Then we don't need to deal with the silent vhost_dev_stop() and we will have
> > codes that is much more easier to understand.
> I've implemented this solution inside
> hw/block/vhost-user-blk.c:vhost_user_blk_event() in the similar way by
> using the s->connected field. Looks good and more correct fix ). I have
> two questions here before i'll rework the fixes:
> 1. Is it okay to make the similar fix inside vhost_user_blk_event() or
> we are looking for more generic vhost-user solution? What do you think?

Either works I think.

> 2. For migration we need additional information that for the
> vhost-user device it isn't an error, because I'm triggering the
> following assert error:
>   Core was generated by `x86_64-softmmu/qemu-system-x86_64 -nodefaults 
> -no-user-config -M q35,sata=false'.
>   Program terminated with signal SIGABRT, Aborted.
>   #0  0x7fb56e729428 in raise () from /lib/x86_64-linux-gnu/libc.so.6
>   [Current thread is 1 (Thread 0x7fb486ef5700 (LWP 527734))]
> 
>   (gdb) bt
>   #0  0x7fb56e729428 in raise () from /lib/x86_64-linux-gnu/libc.so.6
>   #1  0x7fb56e72b02a in abort () from /lib/x86_64-linux-gnu/libc.so.6
>   #2  0x5648ea376ee6 in vhost_log_global_start
>   (listener=0x5648ece4eb08) at ./hw/virtio/vhost.c:857
>   #3  0x5648ea2dde7e in memory_global_dirty_log_start ()
>   at ./memory.c:2611
>   #4  0x5648ea2e68e7 in ram_init_bitmaps (rs=0x7fb4740008c0)
>   at ./migration/ram.c:2305
>   #5  0x5648ea2e698b in ram_init_all (rsp=0x5648eb1f0f20 )
>   at ./migration/ram.c:2323
>   #6  0x5648ea2e6cc5 in ram_save_setup (f=0x5648ec609e00,
>   opaque=0x5648eb1f0f20 )
>   at ./migration/ram.c:2436
>   #7  0x5648ea67b7d3 in qemu_savevm_state_setup (f=0x5648ec609e00) at
>   migration/savevm.c:1176
>   #8  0x5648ea674511 in migration_thread (opaque=0x5648ec031ff0) at
>   migration/migration.c:3416
>   #9  0x5648ea85d65d in qemu_thread_start (args=0x5648ec6057f0) at
>   util/qemu-thread-posix.c:519
>   #10 0x7fb56eac56ba in start_thread () from
>   /lib/x86_64-linux-gnu/libpthread.so.0
>   #11 0x7fb56e7fb41d in clone () from /lib/x86_64-linux-gnu/libc.so.6
>   (gdb) frame 2
>   #2  0x5648ea376ee6 in vhost_log_global_start
>  (listener=0x5648ece4eb08) at ./hw/virtio/vhost.c:857
>   857 abort();
>   (gdb) list
>   852 {
>   853 int r;
>   854
>   855 r = vhost_migration_log(listener, true);
>   856 if (r < 0) {
>   857 abort();
>   858 }
>   859 }
>   860
>   861 static void vhost_log_global_stop(MemoryListener *listener)
> Since the bh postpones the clean up, we can't use the ->started field.
> Do we have any mechanism to get the device type/state in the common
> vhost_migration_log() routine? So that, for example, for a disconnected
> vhost-user device we will be able to return 0. Or should we implement it
> and introduce it in this patch set?
> 
> Thanks, Dima.
> 
> > 
> > Thank
> > 
> > 
> > >   - vhost-user 

Re: [PATCH 10/10] softfloat: Return bool from all classification predicates

2020-05-19 Thread Alex Bennée


Richard Henderson  writes:

> This includes *_is_any_nan, *_is_neg, *_is_inf, etc.
>
> Signed-off-by: Richard Henderson 

Reviewed-by: Alex Bennée 

As an aside, do you think we'll see any benefit from rolling up the
classifications with the decomposed versions? I would hope the compiler
could drop the stuff it doesn't need, but I guess we lose the ability to
inline in the target helpers?
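
As a concrete example of that question, the classification predicates could in
principle be written against the decomposed representation, roughly as below;
but since FloatParts and the unpack helpers are internal to fpu/softfloat.c,
such versions could no longer live as inlines in the public header (a sketch,
not a proposal):

    /* Inside fpu/softfloat.c only: */
    static bool float64_is_any_nan_parts(float64 a)
    {
        FloatParts p = float64_unpack_raw(a);
        return p.exp == 0x7ff && p.frac != 0;
    }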

> ---
>  include/fpu/softfloat.h| 66 +-
>  fpu/softfloat-specialize.inc.c | 16 -
>  2 files changed, 41 insertions(+), 41 deletions(-)
>
> diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
> index 37217d9b9b..16ca697a73 100644
> --- a/include/fpu/softfloat.h
> +++ b/include/fpu/softfloat.h
> @@ -235,31 +235,31 @@ float16 float16_sqrt(float16, float_status *status);
>  FloatRelation float16_compare(float16, float16, float_status *status);
>  FloatRelation float16_compare_quiet(float16, float16, float_status *status);
>  
> -int float16_is_quiet_nan(float16, float_status *status);
> -int float16_is_signaling_nan(float16, float_status *status);
> +bool float16_is_quiet_nan(float16, float_status *status);
> +bool float16_is_signaling_nan(float16, float_status *status);
>  float16 float16_silence_nan(float16, float_status *status);
>  
> -static inline int float16_is_any_nan(float16 a)
> +static inline bool float16_is_any_nan(float16 a)
>  {
>  return ((float16_val(a) & ~0x8000) > 0x7c00);
>  }
>  
> -static inline int float16_is_neg(float16 a)
> +static inline bool float16_is_neg(float16 a)
>  {
>  return float16_val(a) >> 15;
>  }
>  
> -static inline int float16_is_infinity(float16 a)
> +static inline bool float16_is_infinity(float16 a)
>  {
>  return (float16_val(a) & 0x7fff) == 0x7c00;
>  }
>  
> -static inline int float16_is_zero(float16 a)
> +static inline bool float16_is_zero(float16 a)
>  {
>  return (float16_val(a) & 0x7fff) == 0;
>  }
>  
> -static inline int float16_is_zero_or_denormal(float16 a)
> +static inline bool float16_is_zero_or_denormal(float16 a)
>  {
>  return (float16_val(a) & 0x7c00) == 0;
>  }
> @@ -351,8 +351,8 @@ float32 float32_minnum(float32, float32, float_status 
> *status);
>  float32 float32_maxnum(float32, float32, float_status *status);
>  float32 float32_minnummag(float32, float32, float_status *status);
>  float32 float32_maxnummag(float32, float32, float_status *status);
> -int float32_is_quiet_nan(float32, float_status *status);
> -int float32_is_signaling_nan(float32, float_status *status);
> +bool float32_is_quiet_nan(float32, float_status *status);
> +bool float32_is_signaling_nan(float32, float_status *status);
>  float32 float32_silence_nan(float32, float_status *status);
>  float32 float32_scalbn(float32, int, float_status *status);
>  
> @@ -372,27 +372,27 @@ static inline float32 float32_chs(float32 a)
>  return make_float32(float32_val(a) ^ 0x8000);
>  }
>  
> -static inline int float32_is_infinity(float32 a)
> +static inline bool float32_is_infinity(float32 a)
>  {
>  return (float32_val(a) & 0x7fff) == 0x7f80;
>  }
>  
> -static inline int float32_is_neg(float32 a)
> +static inline bool float32_is_neg(float32 a)
>  {
>  return float32_val(a) >> 31;
>  }
>  
> -static inline int float32_is_zero(float32 a)
> +static inline bool float32_is_zero(float32 a)
>  {
>  return (float32_val(a) & 0x7fff) == 0;
>  }
>  
> -static inline int float32_is_any_nan(float32 a)
> +static inline bool float32_is_any_nan(float32 a)
>  {
>  return ((float32_val(a) & ~(1 << 31)) > 0x7f80UL);
>  }
>  
> -static inline int float32_is_zero_or_denormal(float32 a)
> +static inline bool float32_is_zero_or_denormal(float32 a)
>  {
>  return (float32_val(a) & 0x7f80) == 0;
>  }
> @@ -540,8 +540,8 @@ float64 float64_minnum(float64, float64, float_status 
> *status);
>  float64 float64_maxnum(float64, float64, float_status *status);
>  float64 float64_minnummag(float64, float64, float_status *status);
>  float64 float64_maxnummag(float64, float64, float_status *status);
> -int float64_is_quiet_nan(float64 a, float_status *status);
> -int float64_is_signaling_nan(float64, float_status *status);
> +bool float64_is_quiet_nan(float64 a, float_status *status);
> +bool float64_is_signaling_nan(float64, float_status *status);
>  float64 float64_silence_nan(float64, float_status *status);
>  float64 float64_scalbn(float64, int, float_status *status);
>  
> @@ -561,27 +561,27 @@ static inline float64 float64_chs(float64 a)
>  return make_float64(float64_val(a) ^ 0x8000LL);
>  }
>  
> -static inline int float64_is_infinity(float64 a)
> +static inline bool float64_is_infinity(float64 a)
>  {
>  return (float64_val(a) & 0x7fffLL ) == 0x7ff0LL;
>  }
>  
> -static inline int float64_is_neg(float64 a)
> +static inline bool float64_is_neg(float64 a)
>  {
>  return float64_val(a) >> 63;
>  }
>  
> -static inline int float64_is_zero(float64 a)
> +static 

Re: [PATCH] replay: synchronize on every virtual timer callback

2020-05-19 Thread Pavel Dovgalyuk



On 19.05.2020 11:11, Alex Bennée wrote:

Pavel Dovgalyuk  writes:


On 18.05.2020 18:56, Alex Bennée wrote:

Philippe Mathieu-Daudé  writes:


+ Alex

On 5/6/20 10:17 AM, Pavel Dovgalyuk wrote:

Sometimes virtual timer callbacks depend on order
of virtual timer processing and warping of virtual clock.
Therefore every callback should be logged to make replay deterministic.
This patch creates a checkpoint before every virtual timer callback.
With these checkpoints virtual timers processing and clock warping
events order is completely deterministic.
Signed-off-by: Pavel Dovgalyuk 
---
util/qemu-timer.c |5 +
1 file changed, 5 insertions(+)
diff --git a/util/qemu-timer.c b/util/qemu-timer.c
index d548d3c1ad..47833f338f 100644
--- a/util/qemu-timer.c
+++ b/util/qemu-timer.c
@@ -588,6 +588,11 @@ bool timerlist_run_timers(QEMUTimerList *timer_list)
qemu_mutex_lock(&timer_list->active_timers_lock);
  progress = true;
+/*
+ * Callback may insert new checkpoints, therefore add new checkpoint
+ * for the virtual timers.
+ */
+need_replay_checkpoint = timer_list->clock->type == QEMU_CLOCK_VIRTUAL;
}
qemu_mutex_unlock(&timer_list->active_timers_lock);

So the problem I have with this, as with all the record/replay stuff I
need to review, is that it's very hard to see things in action. I added a
*very* basic record/replay test to the aarch64 softmmu tests but they
won't exercise any of this code because no timers get fired. I'm
assuming the sort of test that is really needed is something that not
only causes QEMU_CLOCK_VIRTUAL timers to fire and trigger logged HW
events but also ensures that things don't get confused in the process.

I encounter most of the bugs in different OS boot scenarios.

We also have internal tests that include some computational, disk, and
network interaction tasks.

Is it possible to add a test like booting a "real" OS and replaying
it?

Yes - for these bigger more complex setups we should use the acceptance
tests that run under Avocado. See "make check-acceptance".
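
For a manual end-to-end check, the record/replay invocations from the replay
documentation can be used to boot and then replay a real guest, roughly as
follows (image, machine and device options are illustrative):

    # record
    qemu-system-x86_64 -icount shift=auto,rr=record,rrfile=replay.bin \
        -drive file=disk.qcow2,if=none,snapshot,id=img-direct \
        -drive driver=blkreplay,if=none,image=img-direct,id=img-blkreplay \
        -device ide-hd,drive=img-blkreplay -net none

    # replay the same run deterministically
    qemu-system-x86_64 -icount shift=auto,rr=replay,rrfile=replay.bin \
        -drive file=disk.qcow2,if=none,snapshot,id=img-direct \
        -drive driver=blkreplay,if=none,image=img-direct,id=img-blkreplay \
        -device ide-hd,drive=img-blkreplay -net none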


I've installed avocado and avocado-framework, but got the following error:

venv/bin/python: No module named avocado




If I read up the file I just get more questions than answers. For
example why do we release the qemu_timers lock before processing the
replay event? Is it that the replay event could cause another timer to

We release the lock, because accessing the replay module may process
some events and add more timers.

OK. I guess the adding of the timer is a side effect of processing the
event rather than something that gets added directly?


Right.


Pavel Dovgalyuk



Re: [PATCH v2 5/9] block/io: expand in_flight inc/dec section: simple cases

2020-05-19 Thread Vladimir Sementsov-Ogievskiy

19.05.2020 13:52, Kevin Wolf wrote:

Am 06.05.2020 um 09:02 hat Vladimir Sementsov-Ogievskiy geschrieben:

27.04.2020 17:39, Vladimir Sementsov-Ogievskiy wrote:

It's safer to expand in_flight request to start before enter to
coroutine in synchronous wrappers, due to the following (theoretical)
problem:

Consider write.
It's possible, that qemu_coroutine_enter only schedules execution,
assume such case.

Then we may possibly have the following:

1. Somehow check that we are not in drained section in outer code.

2. Call bdrv_pwritev(), assuming that it will increase in_flight, which
will protect us from starting drained section.

3. It calls bdrv_prwv_co() -> bdrv_coroutine_enter() (not yet increased
in_flight).

4. Assume coroutine not yet actually entered, only scheduled, and we go
to some code, which starts drained section (as in_flight is zero).

5. Scheduled coroutine starts, and blindly increases in_flight, and we
are in drained section with in_flight request.

Signed-off-by: Vladimir Sementsov-Ogievskiy 


Very interesting: this patch breaks test-replication. It hangs:

(gdb) thr a a bt

Thread 2 (Thread 0x7eff256cd700 (LWP 2843)):
#0  0x7eff2f5fd1fd in syscall () from /lib64/libc.so.6
#1  0x55af9a9a4f11 in qemu_futex_wait (f=0x55af9aa6f758 
, val=4294967295) at 
/work/src/qemu/up-expand-bdrv-in_flight-bounds/include/qemu/futex.h:29
#2  0x55af9a9a50d5 in qemu_event_wait (ev=0x55af9aa6f758 
) at util/qemu-thread-posix.c:459
#3  0x55af9a9bd20d in call_rcu_thread (opaque=0x0) at util/rcu.c:260
#4  0x55af9a9a5288 in qemu_thread_start (args=0x55af9c4f1b80) at 
util/qemu-thread-posix.c:519
#5  0x7eff2f6d44c0 in start_thread () from /lib64/libpthread.so.0
#6  0x7eff2f602553 in clone () from /lib64/libc.so.6

Thread 1 (Thread 0x7eff25820a80 (LWP 2842)):
#0  0x7eff2f5f7bd6 in ppoll () from /lib64/libc.so.6
#1  0x55af9a99e405 in qemu_poll_ns (fds=0x55af9c52a830, nfds=1, timeout=-1) 
at util/qemu-timer.c:335
#2  0x55af9a9a1cab in fdmon_poll_wait (ctx=0x55af9c526890, 
ready_list=0x7ffc73e8c5d0, timeout=-1) at util/fdmon-poll.c:79
#3  0x55af9a9a160c in aio_poll (ctx=0x55af9c526890, blocking=true) at 
util/aio-posix.c:600
#4  0x55af9a8f0bb0 in bdrv_do_drained_begin (bs=0x55af9c52a8d0, 
recursive=false, parent=0x0, ignore_bds_parents=false, poll=true) at 
block/io.c:429
#5  0x55af9a8f0c95 in bdrv_drained_begin (bs=0x55af9c52a8d0) at 
block/io.c:435
#6  0x55af9a8dc6a8 in blk_drain (blk=0x55af9c542c10) at 
block/block-backend.c:1681
#7  0x55af9a8da0b6 in blk_unref (blk=0x55af9c542c10) at 
block/block-backend.c:473
#8  0x55af9a8eb5e7 in mirror_exit_common (job=0x55af9c6c45c0) at 
block/mirror.c:667
#9  0x55af9a8eb9c1 in mirror_prepare (job=0x55af9c6c45c0) at 
block/mirror.c:765
#10 0x55af9a87cd65 in job_prepare (job=0x55af9c6c45c0) at job.c:781
#11 0x55af9a87b62a in job_txn_apply (job=0x55af9c6c45c0, fn=0x55af9a87cd28 
) at job.c:158
#12 0x55af9a87cdee in job_do_finalize (job=0x55af9c6c45c0) at job.c:798
#13 0x55af9a87cfb5 in job_completed_txn_success (job=0x55af9c6c45c0) at 
job.c:852
#14 0x55af9a87d055 in job_completed (job=0x55af9c6c45c0) at job.c:865
#15 0x55af9a87d0a8 in job_exit (opaque=0x55af9c6c45c0) at job.c:885
#16 0x55af9a99b981 in aio_bh_call (bh=0x55af9c547440) at util/async.c:136
#17 0x55af9a99ba8b in aio_bh_poll (ctx=0x55af9c526890) at util/async.c:164
#18 0x55af9a9a17ff in aio_poll (ctx=0x55af9c526890, blocking=true) at 
util/aio-posix.c:650
#19 0x55af9a8f7011 in bdrv_flush (bs=0x55af9c53b900) at block/io.c:3019
#20 0x55af9a874351 in bdrv_close (bs=0x55af9c53b900) at block.c:4252
#21 0x55af9a874ca3 in bdrv_delete (bs=0x55af9c53b900) at block.c:4498
#22 0x55af9a877862 in bdrv_unref (bs=0x55af9c53b900) at block.c:5866
#23 0x55af9a870837 in bdrv_root_unref_child (child=0x55af9c6c4430) at 
block.c:2684
#24 0x55af9a8da9a2 in blk_remove_bs (blk=0x55af9c547bd0) at 
block/block-backend.c:803
#25 0x55af9a8d9e54 in blk_delete (blk=0x55af9c547bd0) at 
block/block-backend.c:422
#26 0x55af9a8da0f8 in blk_unref (blk=0x55af9c547bd0) at 
block/block-backend.c:477
#27 0x55af9a86a6f1 in teardown_secondary () at tests/test-replication.c:392
#28 0x55af9a86aac1 in test_secondary_stop () at tests/test-replication.c:490
#29 0x7eff2fd7df7e in g_test_run_suite_internal () from 
/lib64/libglib-2.0.so.0
#30 0x7eff2fd7dd24 in g_test_run_suite_internal () from 
/lib64/libglib-2.0.so.0
#31 0x7eff2fd7dd24 in g_test_run_suite_internal () from 
/lib64/libglib-2.0.so.0
#32 0x7eff2fd7e46a in g_test_run_suite () from /lib64/libglib-2.0.so.0
#33 0x7eff2fd7e485 in g_test_run () from /lib64/libglib-2.0.so.0
#34 0x55af9a86b19c in main (argc=1, argv=0x7ffc73e8d088) at 
tests/test-replication.c:645


(gdb) p ((BlockBackend *)0x55af9c547bd0)->in_flight
$5 = 0
(gdb) p ((BlockBackend *)0x55af9c542c10)->in_flight
$6 = 0
(gdb) p ((BlockDriverState *)0x55af9c53b900)->in_flight
$7 = 

Re: [PATCH v2 5/9] block/io: expand in_flight inc/dec section: simple cases

2020-05-19 Thread Kevin Wolf
Am 19.05.2020 um 13:06 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 19.05.2020 13:52, Kevin Wolf wrote:
> > Am 06.05.2020 um 09:02 hat Vladimir Sementsov-Ogievskiy geschrieben:
> > > 27.04.2020 17:39, Vladimir Sementsov-Ogievskiy wrote:
> > > > It's safer to expand in_flight request to start before enter to
> > > > coroutine in synchronous wrappers, due to the following (theoretical)
> > > > problem:
> > > > 
> > > > Consider write.
> > > > It's possible, that qemu_coroutine_enter only schedules execution,
> > > > assume such case.
> > > > 
> > > > Then we may possibly have the following:
> > > > 
> > > > 1. Somehow check that we are not in drained section in outer code.
> > > > 
> > > > 2. Call bdrv_pwritev(), assuming that it will increase in_flight, which
> > > > will protect us from starting drained section.
> > > > 
> > > > 3. It calls bdrv_prwv_co() -> bdrv_coroutine_enter() (not yet increased
> > > > in_flight).
> > > > 
> > > > 4. Assume coroutine not yet actually entered, only scheduled, and we go
> > > > to some code, which starts drained section (as in_flight is zero).
> > > > 
> > > > 5. Scheduled coroutine starts, and blindly increases in_flight, and we
> > > > are in drained section with in_flight request.
> > > > 
> > > > Signed-off-by: Vladimir Sementsov-Ogievskiy 
> > > 
> > > Very interesting: this patch breaks test-replication. It hangs:
> > > 
> > > (gdb) thr a a bt
> > > 
> > > Thread 2 (Thread 0x7eff256cd700 (LWP 2843)):
> > > #0  0x7eff2f5fd1fd in syscall () from /lib64/libc.so.6
> > > #1  0x55af9a9a4f11 in qemu_futex_wait (f=0x55af9aa6f758 
> > > , val=4294967295) at 
> > > /work/src/qemu/up-expand-bdrv-in_flight-bounds/include/qemu/futex.h:29
> > > #2  0x55af9a9a50d5 in qemu_event_wait (ev=0x55af9aa6f758 
> > > ) at util/qemu-thread-posix.c:459
> > > #3  0x55af9a9bd20d in call_rcu_thread (opaque=0x0) at util/rcu.c:260
> > > #4  0x55af9a9a5288 in qemu_thread_start (args=0x55af9c4f1b80) at 
> > > util/qemu-thread-posix.c:519
> > > #5  0x7eff2f6d44c0 in start_thread () from /lib64/libpthread.so.0
> > > #6  0x7eff2f602553 in clone () from /lib64/libc.so.6
> > > 
> > > Thread 1 (Thread 0x7eff25820a80 (LWP 2842)):
> > > #0  0x7eff2f5f7bd6 in ppoll () from /lib64/libc.so.6
> > > #1  0x55af9a99e405 in qemu_poll_ns (fds=0x55af9c52a830, nfds=1, 
> > > timeout=-1) at util/qemu-timer.c:335
> > > #2  0x55af9a9a1cab in fdmon_poll_wait (ctx=0x55af9c526890, 
> > > ready_list=0x7ffc73e8c5d0, timeout=-1) at util/fdmon-poll.c:79
> > > #3  0x55af9a9a160c in aio_poll (ctx=0x55af9c526890, blocking=true) at 
> > > util/aio-posix.c:600
> > > #4  0x55af9a8f0bb0 in bdrv_do_drained_begin (bs=0x55af9c52a8d0, 
> > > recursive=false, parent=0x0, ignore_bds_parents=false, poll=true) at 
> > > block/io.c:429
> > > #5  0x55af9a8f0c95 in bdrv_drained_begin (bs=0x55af9c52a8d0) at 
> > > block/io.c:435
> > > #6  0x55af9a8dc6a8 in blk_drain (blk=0x55af9c542c10) at 
> > > block/block-backend.c:1681
> > > #7  0x55af9a8da0b6 in blk_unref (blk=0x55af9c542c10) at 
> > > block/block-backend.c:473
> > > #8  0x55af9a8eb5e7 in mirror_exit_common (job=0x55af9c6c45c0) at 
> > > block/mirror.c:667
> > > #9  0x55af9a8eb9c1 in mirror_prepare (job=0x55af9c6c45c0) at 
> > > block/mirror.c:765
> > > #10 0x55af9a87cd65 in job_prepare (job=0x55af9c6c45c0) at job.c:781
> > > #11 0x55af9a87b62a in job_txn_apply (job=0x55af9c6c45c0, 
> > > fn=0x55af9a87cd28 ) at job.c:158
> > > #12 0x55af9a87cdee in job_do_finalize (job=0x55af9c6c45c0) at 
> > > job.c:798
> > > #13 0x55af9a87cfb5 in job_completed_txn_success (job=0x55af9c6c45c0) 
> > > at job.c:852
> > > #14 0x55af9a87d055 in job_completed (job=0x55af9c6c45c0) at job.c:865
> > > #15 0x55af9a87d0a8 in job_exit (opaque=0x55af9c6c45c0) at job.c:885
> > > #16 0x55af9a99b981 in aio_bh_call (bh=0x55af9c547440) at 
> > > util/async.c:136
> > > #17 0x55af9a99ba8b in aio_bh_poll (ctx=0x55af9c526890) at 
> > > util/async.c:164
> > > #18 0x55af9a9a17ff in aio_poll (ctx=0x55af9c526890, blocking=true) at 
> > > util/aio-posix.c:650
> > > #19 0x55af9a8f7011 in bdrv_flush (bs=0x55af9c53b900) at 
> > > block/io.c:3019
> > > #20 0x55af9a874351 in bdrv_close (bs=0x55af9c53b900) at block.c:4252
> > > #21 0x55af9a874ca3 in bdrv_delete (bs=0x55af9c53b900) at block.c:4498
> > > #22 0x55af9a877862 in bdrv_unref (bs=0x55af9c53b900) at block.c:5866
> > > #23 0x55af9a870837 in bdrv_root_unref_child (child=0x55af9c6c4430) at 
> > > block.c:2684
> > > #24 0x55af9a8da9a2 in blk_remove_bs (blk=0x55af9c547bd0) at 
> > > block/block-backend.c:803
> > > #25 0x55af9a8d9e54 in blk_delete (blk=0x55af9c547bd0) at 
> > > block/block-backend.c:422
> > > #26 0x55af9a8da0f8 in blk_unref (blk=0x55af9c547bd0) at 
> > > block/block-backend.c:477
> > > #27 0x55af9a86a6f1 in teardown_secondary () at 
> > > tests/test-replication.c:392
> > > #28 0x55af9a86aac1 in test_secondary_stop () at 

Re: [PATCH 08/10] softfloat: Inline float128 compare specializations

2020-05-19 Thread Alex Bennée


Richard Henderson  writes:

> Replace the float128 compare specializations with inline functions
> that call the standard float128_compare{,_quiet} functions.
> Use bool as the return type.
>
> Signed-off-by: Richard Henderson 

Reviewed-by: Alex Bennée 

> ---
>  include/fpu/softfloat.h |  49 +++--
>  fpu/softfloat.c | 238 
>  2 files changed, 41 insertions(+), 246 deletions(-)
>
> diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
> index 281f0fd971..cfb3cda46b 100644
> --- a/include/fpu/softfloat.h
> +++ b/include/fpu/softfloat.h
> @@ -901,14 +901,6 @@ float128 float128_mul(float128, float128, float_status 
> *status);
>  float128 float128_div(float128, float128, float_status *status);
>  float128 float128_rem(float128, float128, float_status *status);
>  float128 float128_sqrt(float128, float_status *status);
> -int float128_eq(float128, float128, float_status *status);
> -int float128_le(float128, float128, float_status *status);
> -int float128_lt(float128, float128, float_status *status);
> -int float128_unordered(float128, float128, float_status *status);
> -int float128_eq_quiet(float128, float128, float_status *status);
> -int float128_le_quiet(float128, float128, float_status *status);
> -int float128_lt_quiet(float128, float128, float_status *status);
> -int float128_unordered_quiet(float128, float128, float_status *status);
>  FloatRelation float128_compare(float128, float128, float_status *status);
>  FloatRelation float128_compare_quiet(float128, float128, float_status 
> *status);
>  int float128_is_quiet_nan(float128, float_status *status);
> @@ -964,6 +956,47 @@ static inline int float128_is_any_nan(float128 a)
>  ((a.low != 0) || ((a.high & 0xLL) != 0));
>  }
>  
> +static inline bool float128_eq(float128 a, float128 b, float_status *s)
> +{
> +return float128_compare(a, b, s) == float_relation_equal;
> +}
> +
> +static inline bool float128_le(float128 a, float128 b, float_status *s)
> +{
> +return float128_compare(a, b, s) <= float_relation_equal;
> +}
> +
> +static inline bool float128_lt(float128 a, float128 b, float_status *s)
> +{
> +return float128_compare(a, b, s) < float_relation_equal;
> +}
> +
> +static inline bool float128_unordered(float128 a, float128 b, float_status 
> *s)
> +{
> +return float128_compare(a, b, s) == float_relation_unordered;
> +}
> +
> +static inline bool float128_eq_quiet(float128 a, float128 b, float_status *s)
> +{
> +return float128_compare_quiet(a, b, s) == float_relation_equal;
> +}
> +
> +static inline bool float128_le_quiet(float128 a, float128 b, float_status *s)
> +{
> +return float128_compare_quiet(a, b, s) <= float_relation_equal;
> +}
> +
> +static inline bool float128_lt_quiet(float128 a, float128 b, float_status *s)
> +{
> +return float128_compare_quiet(a, b, s) < float_relation_equal;
> +}
> +
> +static inline bool float128_unordered_quiet(float128 a, float128 b,
> +   float_status *s)
> +{
> +return float128_compare_quiet(a, b, s) == float_relation_unordered;
> +}
> +
>  #define float128_zero make_float128(0, 0)
>  
>  
> /*
> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> index 5d7fc2c17a..4567dda112 100644
> --- a/fpu/softfloat.c
> +++ b/fpu/softfloat.c
> @@ -7218,244 +7218,6 @@ float128 float128_sqrt(float128 a, float_status 
> *status)
>  
>  }
>  
> -/*
> -| Returns 1 if the quadruple-precision floating-point value `a' is equal to
> -| the corresponding value `b', and 0 otherwise.  The invalid exception is
> -| raised if either operand is a NaN.  Otherwise, the comparison is performed
> -| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
> -**/
> -
> -int float128_eq(float128 a, float128 b, float_status *status)
> -{
> -
> -if ((( extractFloat128Exp( a ) == 0x7FFF )
> -  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
> - || (( extractFloat128Exp( b ) == 0x7FFF )
> -  && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
> -   ) {
> -float_raise(float_flag_invalid, status);
> -return 0;
> -}
> -return
> -   ( a.low == b.low )
> -&& (( a.high == b.high )
> - || (( a.low == 0 )
> -  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
> -   );
> -
> -}
> -
> -/*
> -| Returns 1 if the quadruple-precision floating-point value `a' is less than
> -| or equal to the corresponding value `b', and 0 otherwise.  The invalid
> -| exception is raised if either operand is a NaN.  The comparison is 
> performed
> 
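
As a side note on why the relational mapping above works: FloatRelation orders
less (-1) < equal (0) < greater (1) and keeps unordered at 2, so a comparison
such as "<= float_relation_equal" is true only for less/equal and never for an
unordered result. A minimal illustrative helper (not part of the patch, which
uses the terser form shown above):

static bool demo_float128_le(float128 a, float128 b, float_status *s)
{
    /* float128_compare() raises the invalid exception for any NaN operand */
    FloatRelation r = float128_compare(a, b, s);

    return r == float_relation_less || r == float_relation_equal;
    /* equivalent to the patch's: return r <= float_relation_equal; */
}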

Re: [PATCH 09/10] softfloat: Inline floatx80 compare specializations

2020-05-19 Thread Alex Bennée


Richard Henderson  writes:

> Replace the floatx80 compare specializations with inline functions
> that call the standard floatx80_compare{,_quiet} functions.
> Use bool as the return type.
>
> Signed-off-by: Richard Henderson 

Reviewed-by: Alex Bennée 

> ---
>  include/fpu/softfloat.h |  49 ++--
>  fpu/softfloat.c | 257 
>  2 files changed, 41 insertions(+), 265 deletions(-)
>
> diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
> index cfb3cda46b..37217d9b9b 100644
> --- a/include/fpu/softfloat.h
> +++ b/include/fpu/softfloat.h
> @@ -689,14 +689,6 @@ floatx80 floatx80_mul(floatx80, floatx80, float_status 
> *status);
>  floatx80 floatx80_div(floatx80, floatx80, float_status *status);
>  floatx80 floatx80_rem(floatx80, floatx80, float_status *status);
>  floatx80 floatx80_sqrt(floatx80, float_status *status);
> -int floatx80_eq(floatx80, floatx80, float_status *status);
> -int floatx80_le(floatx80, floatx80, float_status *status);
> -int floatx80_lt(floatx80, floatx80, float_status *status);
> -int floatx80_unordered(floatx80, floatx80, float_status *status);
> -int floatx80_eq_quiet(floatx80, floatx80, float_status *status);
> -int floatx80_le_quiet(floatx80, floatx80, float_status *status);
> -int floatx80_lt_quiet(floatx80, floatx80, float_status *status);
> -int floatx80_unordered_quiet(floatx80, floatx80, float_status *status);
>  FloatRelation floatx80_compare(floatx80, floatx80, float_status *status);
>  FloatRelation floatx80_compare_quiet(floatx80, floatx80, float_status 
> *status);
>  int floatx80_is_quiet_nan(floatx80, float_status *status);
> @@ -746,6 +738,47 @@ static inline int floatx80_is_any_nan(floatx80 a)
>  return ((a.high & 0x7fff) == 0x7fff) && (a.low<<1);
>  }
>  
> +static inline bool floatx80_eq(floatx80 a, floatx80 b, float_status *s)
> +{
> +return floatx80_compare(a, b, s) == float_relation_equal;
> +}
> +
> +static inline bool floatx80_le(floatx80 a, floatx80 b, float_status *s)
> +{
> +return floatx80_compare(a, b, s) <= float_relation_equal;
> +}
> +
> +static inline bool floatx80_lt(floatx80 a, floatx80 b, float_status *s)
> +{
> +return floatx80_compare(a, b, s) < float_relation_equal;
> +}
> +
> +static inline bool floatx80_unordered(floatx80 a, floatx80 b, float_status 
> *s)
> +{
> +return floatx80_compare(a, b, s) == float_relation_unordered;
> +}
> +
> +static inline bool floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *s)
> +{
> +return floatx80_compare_quiet(a, b, s) == float_relation_equal;
> +}
> +
> +static inline bool floatx80_le_quiet(floatx80 a, floatx80 b, float_status *s)
> +{
> +return floatx80_compare_quiet(a, b, s) <= float_relation_equal;
> +}
> +
> +static inline bool floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *s)
> +{
> +return floatx80_compare_quiet(a, b, s) < float_relation_equal;
> +}
> +
> +static inline bool floatx80_unordered_quiet(floatx80 a, floatx80 b,
> +   float_status *s)
> +{
> +return floatx80_compare_quiet(a, b, s) == float_relation_unordered;
> +}
> +
>  
> /*
>  | Return whether the given value is an invalid floatx80 encoding.
>  | Invalid floatx80 encodings arise when the integer bit is not set, but
> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> index 4567dda112..6c8f2d597a 100644
> --- a/fpu/softfloat.c
> +++ b/fpu/softfloat.c
> @@ -5849,263 +5849,6 @@ floatx80 floatx80_sqrt(floatx80 a, float_status 
> *status)
>  0, zExp, zSig0, zSig1, status);
>  }
>  
> -/*
> -| Returns 1 if the extended double-precision floating-point value `a' is 
> equal
> -| to the corresponding value `b', and 0 otherwise.  The invalid exception is
> -| raised if either operand is a NaN.  Otherwise, the comparison is performed
> -| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
> -**/
> -
> -int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
> -{
> -
> -if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
> -|| (extractFloatx80Exp(a) == 0x7FFF
> -&& (uint64_t) (extractFloatx80Frac(a) << 1))
> -|| (extractFloatx80Exp(b) == 0x7FFF
> -&& (uint64_t) (extractFloatx80Frac(b) << 1))
> -   ) {
> -float_raise(float_flag_invalid, status);
> -return 0;
> -}
> -return
> -   ( a.low == b.low )
> -&& (( a.high == b.high )
> - || (( a.low == 0 )
> -  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
> -   );
> -
> -}
> -
> -/*
> -| Returns 1 if the extended double-precision floating-point value `a' 

Re: [PATCH 5/9] ppc/pnv: Add a XIVE2 controller to the POWER10 chip.

2020-05-19 Thread Greg Kurz
On Wed, 13 May 2020 17:11:05 +0200
Cédric Le Goater  wrote:

> The XIVE2 interrupt controller of the POWER10 processor follows the
> same logic as on POWER9, but the HW interface has been largely
> reworked.  It has a new register interface, different BARs, extra
> VSDs, new layout for the XIVE2 structures, and a set of new features
> which are described below.
> 
> This is a model of the POWER10 XIVE2 interrupt controller for the
> PowerNV machine. It focuses primarily on the needs of the skiboot
> firmware but some initial hypervisor support is implemented for KVM
> use (escalation).
> 
> Support for new features will be implemented in time and will require
> new support from the OS.
> 
> * XIVE2 BARS
> 
> The interrupt controller BARs have a different layout outlined below.
> Each sub-engine now has its own range, and the indirect TIMA access was
> replaced with a set of pages, one per CPU, under the IC BAR:
> 
>   - IC BAR (Interrupt Controller)
> . 4 pages, one per sub-engine
> . 128 indirect TIMA pages
>   - TM BAR (Thread Interrupt Management Area)
> . 4 pages
>   - ESB BAR (ESB pages for IPIs)
> . up to 1TB
>   - END BAR (ESB pages for ENDs)
> . up to 2TB
>   - NVC BAR (Notification Virtual Crowd)
> . up to 128
>   - NVPG BAR (Notification Virtual Process and Group)
> . up to 1TB
>   - Direct mapped Thread Context Area (reads & writes)
> 
> OPAL does not use the grouping and crowd capability.
> 
> * Virtual Structure Tables
> 
> XIVE2 adds new table types and also changes the field layout of the END
> and NVP Virtualization Structure Descriptors.
> 
>   - EAS
>   - END new layout
>   - NVT was split into:
> . NVP (Processor), 32B
> . NVG (Group), 32B
> . NVC (Crowd == P9 block group) 32B
>   - IC for remote configuration
>   - SYNC for cache injection
>   - ERQ for event input queue
> 
> The setup is slightly different on XIVE2 because the indexing has changed
> for some of the tables, block ID or the chip topology ID can be used.
> 
> * XIVE2 features
> 
> SCOM and MMIO registers have a new layout and XIVE2 adds a new global
> capability and configuration registers.
> 
> The low-level hardware offers a set of new features, among which:
> 
>   - a configurable number of priorities : 1 - 8
>   - StoreEOI with load-after-store ordering is activated by default
>   - Gen2 TIMA layout
>   - A P9-compat mode, or Gen1, TIMA toggle bit for SW compatibility
>   - increase to 24bit for VP number
> 
> Other features will have some impact on the Hypervisor and guest OS
> when activated, but this is not required for initial support of the
> controller.
> 
> Signed-off-by: Cédric Le Goater 
> ---

Apart from plugging the XIVE2 on the main system bus, like Markus's
fix for XIVE, I don't see any obvious things to change.

>  hw/intc/pnv_xive2_regs.h   |  428 
>  include/hw/ppc/pnv.h   |   22 +
>  include/hw/ppc/pnv_xive.h  |   71 ++
>  include/hw/ppc/pnv_xscom.h |3 +
>  hw/intc/pnv_xive2.c| 2026 
>  hw/ppc/pnv.c   |   89 +-
>  hw/intc/Makefile.objs  |2 +-
>  7 files changed, 2637 insertions(+), 4 deletions(-)
>  create mode 100644 hw/intc/pnv_xive2_regs.h
>  create mode 100644 hw/intc/pnv_xive2.c
> 
> diff --git a/hw/intc/pnv_xive2_regs.h b/hw/intc/pnv_xive2_regs.h
> new file mode 100644
> index ..6ad0eee824b1
> --- /dev/null
> +++ b/hw/intc/pnv_xive2_regs.h
> @@ -0,0 +1,428 @@
> +/*
> + * QEMU PowerPC XIVE2 interrupt controller model  (POWER10)
> + *
> + * Copyright (c) 2019-2020, IBM Corporation.
> + *
> + * This code is licensed under the GPL version 2 or later. See the
> + * COPYING file in the top-level directory.
> + */
> +
> +#ifndef PPC_PNV_XIVE2_REGS_H
> +#define PPC_PNV_XIVE2_REGS_H
> +
> +/*
> + * CQ Common Queue (PowerBus bridge) Registers
> + */
> +
> +/* XIVE2 Capabilities */
> +#define X_CQ_XIVE_CAP   0x02
> +#define CQ_XIVE_CAP 0x010
> +#defineCQ_XIVE_CAP_VERSION  PPC_BITMASK(0, 3)
> +/* 4:6 reserved */
> +#defineCQ_XIVE_CAP_USER_INT_PRIOPPC_BITMASK(8, 9)
> +#define   CQ_XIVE_CAP_USER_INT_PRIO_1   0
> +#define   CQ_XIVE_CAP_USER_INT_PRIO_1_2 1
> +#define   CQ_XIVE_CAP_USER_INT_PRIO_1_4 2
> +#define   CQ_XIVE_CAP_USER_INT_PRIO_1_8 3
> +#defineCQ_XIVE_CAP_VP_INT_PRIO  PPC_BITMASK(10, 11)
> +#define   CQ_XIVE_CAP_VP_INT_PRIO_1_8   0
> +#define   CQ_XIVE_CAP_VP_INT_PRIO_2_8   1
> +#define   CQ_XIVE_CAP_VP_INT_PRIO_4_8   2
> +#define   CQ_XIVE_CAP_VP_INT_PRIO_8 3
> +#defineCQ_XIVE_CAP_BLOCK_ID_WIDTH   PPC_BITMASK(12, 13)
> +
> +/* XIVE2 Configuration */
> +#define X_CQ_XIVE_CFG   0x03
> +#define CQ_XIVE_CFG 0x018
> +
> +/* 0:7 reserved */
> +#defineCQ_XIVE_CFG_USER_INT_PRIOPPC_BITMASK(8, 9)
> +#define

Re: [PATCH 02/24] display/xlnx_dp: Fix to realize "i2c-ddc" and "aux-to-i2c-bridge"

2020-05-19 Thread Peter Maydell
On Tue, 19 May 2020 at 06:09, Markus Armbruster  wrote:
> I figure the "device becomes real only on realize" thing is actually
> more myth than thing.

It's not a myth, it's an API guarantee thing. If you don't realize
the device you create before you use it then you're in the world of
unspecified behaviour, and anything could happen: maybe it works,
maybe it doesn't, maybe it works today and breaks tomorrow.

thanks
-- PMM
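
A minimal sketch of the create/configure/realize order being described, using
the qdev helpers (the 0x50 address and the bus variable are illustrative, and
the exact helper names depend on the QEMU version):

static void wire_up_ddc(I2CBus *i2c_bus)
{
    DeviceState *dev = qdev_new("i2c-ddc");        /* created, still unrealized */

    qdev_prop_set_uint8(dev, "address", 0x50);     /* configure while unrealized */
    qdev_realize_and_unref(dev, BUS(i2c_bus), &error_fatal);
    /* only from here on may the device be wired up, mapped and used */
}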



Re: [PATCH v2 5/5] vhost: add device started check in migration set log

2020-05-19 Thread Dr. David Alan Gilbert
* Dima Stepanov (dimas...@yandex-team.ru) wrote:
> On Mon, May 18, 2020 at 10:53:59AM +0100, Dr. David Alan Gilbert wrote:
> > * Dima Stepanov (dimas...@yandex-team.ru) wrote:
> > > On Mon, May 18, 2020 at 10:50:39AM +0800, Jason Wang wrote:
> > > > 
> > > > >On 2020/5/16 12:54 AM, Dima Stepanov wrote:
> > > > >On Thu, May 14, 2020 at 03:34:24PM +0800, Jason Wang wrote:
> > > > >>On 2020/5/13 5:47 PM, Dima Stepanov wrote:
> > > > > case CHR_EVENT_CLOSED:
> > > > > /* a close event may happen during a read/write, but vhost
> > > > >  * code assumes the vhost_dev remains setup, so delay the
> > > > >  * stop & clear to idle.
> > > > >  * FIXME: better handle failure in vhost code, remove bh
> > > > >  */
> > > > > if (s->watch) {
> > > > > AioContext *ctx = qemu_get_current_aio_context();
> > > > >
> > > > > g_source_remove(s->watch);
> > > > > s->watch = 0;
> > > > > qemu_chr_fe_set_handlers(&s->chr, NULL, NULL, NULL, 
> > > > > NULL,
> > > > >  NULL, NULL, false);
> > > > >
> > > > > aio_bh_schedule_oneshot(ctx, chr_closed_bh, opaque);
> > > > > }
> > > > > break;
> > > > >
> > > > >I think it's time we dropped the FIXME and moved the handling to 
> > > > >common
> > > > >code. Jason? Marc-André?
> > > > I agree. Just to confirm, do you prefer bh or doing changes like 
> > > > what is
> > > > done in this series? It looks to me bh can have more easier codes.
> > > > >>>Could it be a good idea just to make disconnect in the char device 
> > > > >>>but
> > > > >>>postphone clean up in the vhost-user-blk (or any other vhost-user
> > > > >>>device) itself? So we are moving the postphone logic and decision 
> > > > >>>from
> > > > >>>the char device to vhost-user device. One of the idea i have is as
> > > > >>>follows:
> > > > >>>   - Put ourself in the INITIALIZATION state
> > > > >>>   - Start these vhost-user "handshake" commands
> > > > >>>   - If we got a disconnect error, perform disconnect, but don't 
> > > > >>> clean up
> > > > >>> device (it will be clean up on the roll back). I can be done by
> > > > >>> checking the state in vhost_user_..._disconnect routine or smth 
> > > > >>> like it
> > > > >>
> > > > >>Any issue you saw just using the aio bh as Michael posted above.
> > > > >>
> > > > >>Then we don't need to deal with the silent vhost_dev_stop() and we 
> > > > >>will have
> > > > >>codes that is much more easier to understand.
> > > > >I've implemented this solution inside
> > > > >hw/block/vhost-user-blk.c:vhost_user_blk_event() in the similar way by
> > > > >using the s->connected field. Looks good and more correct fix ). I have
> > > > >two questions here before i'll rework the fixes:
> > > > >1. Is it okay to make the similar fix inside vhost_user_blk_event() or
> > > > >we are looking for more generic vhost-user solution? What do you think?
> > > > 
> > > > 
> > > > I think I agree with Michael, it's better to have a generic vhost-user
> > > > solution. But if it turns out to be not easy, we can start from fixing
> > > > vhost-user-blk.
> > > I also agree, but as i see it right now the connect/disconnect events
> > > are handled inside each vhost-user device implementation file. So it will
> > > need some global refactoring. So i suggest having this fix first and
> > > after it refactoring the code:
> > >  - more devices will be involved
> > >  - i see there is some difference in device handling
> > 
> > I'm following bits of this discussion, some thoughts;
> > if your device doesn't support reconnect, then if, at the start of
> > migration you find that you can't start the log what is the correct
> > behaviour?
> I'm not sure here, but it looks like that in this case the device state
> will be:
>   disconnect -> stopped (will not be changed during migration, because
>   reconnect isn't supported)
> And because of it the device state will not be changed during migration,
> so there is no need for log and migration could be completed
> successfully.
> So as i see it (i could be wrong here) that:
>  - it is okay: if device is not started and we will not change this
>state during migration + log start is failed
>  - it is not okay: if device is started + log start is failed (because
>we can't handle the dirty pages and so on during migration)

Yes, that does make sense to me.

> > You can't carry on with the migration because you'd have an
> > inconsistent migration state; so I guess that's why the abort() is there
> > - but I think I'd generally prefer to fail the migration and hope the
> > vhsot device is still working for anything other than the log.
> > 
> > You're going to have to be pretty careful with the ordering of reconect
> > - reconnecting on the source during a migration sounds pretty hairy, but
> > a 
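
To make the distinction above concrete, here is a minimal sketch with
hypothetical names (not the actual series code): a stopped, non-reconnecting
backend cannot dirty guest memory, so a failed log start only has to fail the
migration when the device is still started.

static int example_vhost_migration_log_start(struct vhost_dev *dev, Error **errp)
{
    if (!dev->started) {
        /* Nothing can write guest memory through this backend right now,
         * so there is nothing to track; let the migration proceed. */
        return 0;
    }
    if (example_set_log(dev, true) < 0) {      /* hypothetical helper */
        /* Device is live but dirty tracking is unavailable: fail the
         * migration instead of calling abort(). */
        error_setg(errp, "vhost: failed to start dirty page logging");
        return -1;
    }
    return 0;
}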

Re: [PATCH] replay: synchronize on every virtual timer callback

2020-05-19 Thread Alex Bennée


Pavel Dovgalyuk  writes:

> On 19.05.2020 11:11, Alex Bennée wrote:
>> Pavel Dovgalyuk  writes:
>>
>>> On 18.05.2020 18:56, Alex Bennée wrote:
 Philippe Mathieu-Daudé  writes:

> + Alex
>
> On 5/6/20 10:17 AM, Pavel Dovgalyuk wrote:
>> Sometimes virtual timer callbacks depend on order
>> of virtual timer processing and warping of virtual clock.
>> Therefore every callback should be logged to make replay deterministic.
>> This patch creates a checkpoint before every virtual timer callback.
>> With these checkpoints virtual timers processing and clock warping
>> events order is completely deterministic.
>> Signed-off-by: Pavel Dovgalyuk 
>> ---
>> util/qemu-timer.c |5 +
>> 1 file changed, 5 insertions(+)
>> diff --git a/util/qemu-timer.c b/util/qemu-timer.c
>> index d548d3c1ad..47833f338f 100644
>> --- a/util/qemu-timer.c
>> +++ b/util/qemu-timer.c
>> @@ -588,6 +588,11 @@ bool timerlist_run_timers(QEMUTimerList *timer_list)
>> qemu_mutex_lock(&timer_list->active_timers_lock);
>>   progress = true;
>> +/*
>> + * Callback may insert new checkpoints, therefore add new 
>> checkpoint
>> + * for the virtual timers.
>> + */
>> +need_replay_checkpoint = timer_list->clock->type == 
>> QEMU_CLOCK_VIRTUAL;
>> }
>> qemu_mutex_unlock(&timer_list->active_timers_lock);
 So the problem I have with this as with all the record/replay stuff I
 need want to review is it's very hard to see things in action. I added a
 *very* basic record/replay test to the aarch64 softmmu tests but they
 won't exercise any of this code because no timers get fired. I'm
 assuming the sort of tests that is really needed is something that not
 only causes QEMU_CLOCK_VIRTUAL timers to fire and trigger logged HW
 events and ensure that things don't get confused in the process.
>>> I encounter most of the bugs in different OS boot scenarios.
>>>
>>> We also have internal tests that include some computational, disk, and
>>> network interaction tasks.
>>>
>>> Is it possible to add a test like booting a "real" OS and replaying
>>> it?
>> Yes - for these bigger more complex setups we should use the acceptance
>> tests that run under Avocado. See "make check-acceptance".
>
> I've installed avocado and avocado-framework, but got the following error:
>
> venv/bin/python: No module named avocado

Hmm make check-acceptance should automatically setup local copies of
avocado using virtualenv. You shouldn't need to install the system
version.

>
>>
 If I read up the file I just get more questions than answers. For
 example why do we release the qemu_timers lock before processing the
 replay event? Is it that the replay event could cause another timer to
>>> We release the lock, because accessing the replay module may process
>>> some events and add more timers.
>> OK. I guess the adding of the timer is a side effect of processing the
>> event rather than something that gets added directly?
>
> Right.
>
>
> Pavel Dovgalyuk


-- 
Alex Bennée



Re: [PATCH] replay: synchronize on every virtual timer callback

2020-05-19 Thread Pavel Dovgalyuk



On 19.05.2020 13:32, Alex Bennée wrote:

Pavel Dovgalyuk  writes:


On 19.05.2020 11:11, Alex Bennée wrote:

Pavel Dovgalyuk  writes:


On 18.05.2020 18:56, Alex Bennée wrote:

Philippe Mathieu-Daudé  writes:


+ Alex

On 5/6/20 10:17 AM, Pavel Dovgalyuk wrote:

Sometimes virtual timer callbacks depend on order
of virtual timer processing and warping of virtual clock.
Therefore every callback should be logged to make replay deterministic.
This patch creates a checkpoint before every virtual timer callback.
With these checkpoints virtual timers processing and clock warping
events order is completely deterministic.
Signed-off-by: Pavel Dovgalyuk 
---
 util/qemu-timer.c |5 +
 1 file changed, 5 insertions(+)
diff --git a/util/qemu-timer.c b/util/qemu-timer.c
index d548d3c1ad..47833f338f 100644
--- a/util/qemu-timer.c
+++ b/util/qemu-timer.c
@@ -588,6 +588,11 @@ bool timerlist_run_timers(QEMUTimerList *timer_list)
 qemu_mutex_lock(&timer_list->active_timers_lock);
   progress = true;
+/*
+ * Callback may insert new checkpoints, therefore add new checkpoint
+ * for the virtual timers.
+ */
+need_replay_checkpoint = timer_list->clock->type == QEMU_CLOCK_VIRTUAL;
 }
 qemu_mutex_unlock(&timer_list->active_timers_lock);

So the problem I have with this as with all the record/replay stuff I
need want to review is it's very hard to see things in action. I added a
*very* basic record/replay test to the aarch64 softmmu tests but they
won't exercise any of this code because no timers get fired. I'm
assuming the sort of tests that is really needed is something that not
only causes QEMU_CLOCK_VIRTUAL timers to fire and trigger logged HW
events and ensure that things don't get confused in the process.

I encounter most of the bugs in different OS boot scenarios.

We also have internal tests that include some computational, disk, and
network interaction tasks.

Is it possible to add a test like booting a "real" OS and replaying
it?

Yes - for these bigger more complex setups we should use the acceptance
tests that run under Avocado. See "make check-acceptance".

I've installed avocado and avocado-framework, but got the following error:

venv/bin/python: No module named avocado

Hmm make check-acceptance should automatically setup local copies of
avocado using virtualenv. You shouldn't need to install the system
version.



What should I try then?





[PATCH] icount: fix shift=auto for record/replay

2020-05-19 Thread Pavel Dovgalyuk
This patch fixes shift=auto when record/replay is enabled.
Now the user does not need to guess the best shift value.

Signed-off-by: Pavel Dovgalyuk 

--

v2:
  moved icount_time_shift to vmstate subsection
---
 cpus.c |   20 +++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/cpus.c b/cpus.c
index 5670c96bcf..7ce0d569b3 100644
--- a/cpus.c
+++ b/cpus.c
@@ -379,7 +379,8 @@ static void icount_adjust(void)
 
 seqlock_write_lock(&timers_state.vm_clock_seqlock,
    &timers_state.vm_clock_lock);
-cur_time = cpu_get_clock_locked();
+cur_time = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
+   cpu_get_clock_locked());
 cur_icount = cpu_get_icount_locked();
 
 delta = cur_icount - cur_time;
@@ -647,6 +648,11 @@ static bool adjust_timers_state_needed(void *opaque)
 return s->icount_rt_timer != NULL;
 }
 
+static bool shift_state_needed(void *opaque)
+{
+return use_icount == 2;
+}
+
 /*
  * Subsection for warp timer migration is optional, because may not be created
  */
@@ -674,6 +680,17 @@ static const VMStateDescription 
icount_vmstate_adjust_timers = {
 }
 };
 
+static const VMStateDescription icount_vmstate_shift = {
+.name = "timer/icount/shift",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = shift_state_needed,
+.fields = (VMStateField[]) {
+VMSTATE_INT16(icount_time_shift, TimersState),
+VMSTATE_END_OF_LIST()
+}
+};
+
 /*
  * This is a subsection for icount migration.
  */
@@ -690,6 +707,7 @@ static const VMStateDescription icount_vmstate_timers = {
 .subsections = (const VMStateDescription*[]) {
 &icount_vmstate_warp_timer,
 &icount_vmstate_adjust_timers,
+&icount_vmstate_shift,
 NULL
 }
 };




Re: [PATCH RFC 01/32] python/qemu: create qemu.lib module

2020-05-19 Thread Vladimir Sementsov-Ogievskiy

19.05.2020 03:27, John Snow wrote:



On 5/18/20 3:33 PM, Vladimir Sementsov-Ogievskiy wrote:

18.05.2020 21:23, John Snow wrote:



On 5/18/20 2:14 PM, Vladimir Sementsov-Ogievskiy wrote:

14.05.2020 08:53, John Snow wrote:

move python/qemu/*.py to python/qemu/lib/*.py.

To create a namespace package, the 'qemu' directory itself shouldn't
have module files in it. Thus, these files will go under a 'lib'
package
directory instead.


Hmm..

On the first glance, it looks better to have

    from qemu import QEMUMachine

than
      from qemu.lib import QEMUMachine

why do we need this extra ".lib" part?

Is it needed only for internal use?

Assume we have installed qemu package. Can we write

    from qemu import QEMUMachine

? Or we still need qemu.lib ?

I don't remember any Python package that made me write "from
package_name.lib import ..."




It's a strategy to create "qemu" as a PEP420 namespace package; i.e.
"qemu" forms a namespace, but you need a name for the actual package
underneath it.

"qemu.lib" is one package, with qmp, qtest, and machine modules. "qemu"
isn't really a package in this system, it's just a namespace.

The idea is that this allows us to create a more modular rollout of
various python scripts and services as desired instead of monolithically
bundling them all inside of a "qemu" package.

It also allows us to fork or split out the sub-packages to separate
repos, if we wish. i.e., let's say we create a "qemu.sdk" subpackage, we
can eventually fork it off into its own repo with its own installer and
so forth. These subpackages can be installed and managed separately.



Okay, I understand.. No real objections than.

Still, maybe, everything should not go into lib, maybe something like

qemu/vm/  - qmp, QEMUMachine, etc
qemu/qtest/  - qtest

would be more user friendly? But I'm not sure. I just thought that "lib"
is too generic.



lib is a very generic name, I agree.

Splitting accel, qmp and QEMUMachine in one package and keeping qtest in
another is fine too. I'm not sure if I like "vm" for the name of that
core package, though.

I want to avoid using "qemu/sdk" because I have some plans for trying to
generate and package a "real" SDK using that namespace.

"devkit"? "testkit"? "core"? Naming things is always the worst part.



I think, "core" sounds good.



--
Best regards,
Vladimir



Re: [PULL 0/6] NBD patches for 2020-05-18

2020-05-19 Thread Peter Maydell
On Mon, 18 May 2020 at 17:38, Eric Blake  wrote:
>
> The following changes since commit debe78ce14bf8f8940c2bdf3ef387505e9e035a9:
>
>   Merge remote-tracking branch 'remotes/rth/tags/pull-fpu-20200515' into 
> staging (2020-05-15 19:51:16 +0100)
>
> are available in the Git repository at:
>
>   https://repo.or.cz/qemu/ericb.git tags/pull-nbd-2020-05-18
>
> for you to fetch changes up to d8154b0945f795177511ea0e2212bd5c749fe84c:
>
>   iotests: Enhance 223 to cover qemu-img map improvements (2020-05-18 
> 11:02:05 -0500)
>
> We've got a couple of first-time contributors included in this one :)
>
> 
> nbd patches for 2020-05-20
>
> - fix stranded fd in 'qemu-nbd -c /dev/nbd0'
> - add 'qemu-img map --start-offset --max-length' options
>
> 


Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/5.1
for any user-visible changes.

-- PMM



Re: [PATCH v2 5/9] block/io: expand in_flight inc/dec section: simple cases

2020-05-19 Thread Vladimir Sementsov-Ogievskiy

19.05.2020 14:16, Kevin Wolf wrote:

On 19.05.2020 at 13:06, Vladimir Sementsov-Ogievskiy wrote:

19.05.2020 13:52, Kevin Wolf wrote:

On 06.05.2020 at 09:02, Vladimir Sementsov-Ogievskiy wrote:

27.04.2020 17:39, Vladimir Sementsov-Ogievskiy wrote:

It's safer to expand the in_flight section so that it starts before entering the
coroutine in synchronous wrappers, due to the following (theoretical)
problem:

Consider write.
It's possible, that qemu_coroutine_enter only schedules execution,
assume such case.

Then we may possibly have the following:

1. Somehow check that we are not in drained section in outer code.

2. Call bdrv_pwritev(), assuming that it will increase in_flight, which
will protect us from starting drained section.

3. It calls bdrv_prwv_co() -> bdrv_coroutine_enter() (not yet increased
in_flight).

4. Assume coroutine not yet actually entered, only scheduled, and we go
to some code, which starts drained section (as in_flight is zero).

5. Scheduled coroutine starts, and blindly increases in_flight, and we
are in drained section with in_flight request.

Signed-off-by: Vladimir Sementsov-Ogievskiy 


Very interesting: this patch breaks test-replication. It hangs:

(gdb) thr a a bt

Thread 2 (Thread 0x7eff256cd700 (LWP 2843)):
#0  0x7eff2f5fd1fd in syscall () from /lib64/libc.so.6
#1  0x55af9a9a4f11 in qemu_futex_wait (f=0x55af9aa6f758 
, val=4294967295) at 
/work/src/qemu/up-expand-bdrv-in_flight-bounds/include/qemu/futex.h:29
#2  0x55af9a9a50d5 in qemu_event_wait (ev=0x55af9aa6f758 
) at util/qemu-thread-posix.c:459
#3  0x55af9a9bd20d in call_rcu_thread (opaque=0x0) at util/rcu.c:260
#4  0x55af9a9a5288 in qemu_thread_start (args=0x55af9c4f1b80) at 
util/qemu-thread-posix.c:519
#5  0x7eff2f6d44c0 in start_thread () from /lib64/libpthread.so.0
#6  0x7eff2f602553 in clone () from /lib64/libc.so.6

Thread 1 (Thread 0x7eff25820a80 (LWP 2842)):
#0  0x7eff2f5f7bd6 in ppoll () from /lib64/libc.so.6
#1  0x55af9a99e405 in qemu_poll_ns (fds=0x55af9c52a830, nfds=1, timeout=-1) 
at util/qemu-timer.c:335
#2  0x55af9a9a1cab in fdmon_poll_wait (ctx=0x55af9c526890, 
ready_list=0x7ffc73e8c5d0, timeout=-1) at util/fdmon-poll.c:79
#3  0x55af9a9a160c in aio_poll (ctx=0x55af9c526890, blocking=true) at 
util/aio-posix.c:600
#4  0x55af9a8f0bb0 in bdrv_do_drained_begin (bs=0x55af9c52a8d0, 
recursive=false, parent=0x0, ignore_bds_parents=false, poll=true) at 
block/io.c:429
#5  0x55af9a8f0c95 in bdrv_drained_begin (bs=0x55af9c52a8d0) at 
block/io.c:435
#6  0x55af9a8dc6a8 in blk_drain (blk=0x55af9c542c10) at 
block/block-backend.c:1681
#7  0x55af9a8da0b6 in blk_unref (blk=0x55af9c542c10) at 
block/block-backend.c:473
#8  0x55af9a8eb5e7 in mirror_exit_common (job=0x55af9c6c45c0) at 
block/mirror.c:667
#9  0x55af9a8eb9c1 in mirror_prepare (job=0x55af9c6c45c0) at 
block/mirror.c:765
#10 0x55af9a87cd65 in job_prepare (job=0x55af9c6c45c0) at job.c:781
#11 0x55af9a87b62a in job_txn_apply (job=0x55af9c6c45c0, fn=0x55af9a87cd28 
) at job.c:158
#12 0x55af9a87cdee in job_do_finalize (job=0x55af9c6c45c0) at job.c:798
#13 0x55af9a87cfb5 in job_completed_txn_success (job=0x55af9c6c45c0) at 
job.c:852
#14 0x55af9a87d055 in job_completed (job=0x55af9c6c45c0) at job.c:865
#15 0x55af9a87d0a8 in job_exit (opaque=0x55af9c6c45c0) at job.c:885
#16 0x55af9a99b981 in aio_bh_call (bh=0x55af9c547440) at util/async.c:136
#17 0x55af9a99ba8b in aio_bh_poll (ctx=0x55af9c526890) at util/async.c:164
#18 0x55af9a9a17ff in aio_poll (ctx=0x55af9c526890, blocking=true) at 
util/aio-posix.c:650
#19 0x55af9a8f7011 in bdrv_flush (bs=0x55af9c53b900) at block/io.c:3019
#20 0x55af9a874351 in bdrv_close (bs=0x55af9c53b900) at block.c:4252
#21 0x55af9a874ca3 in bdrv_delete (bs=0x55af9c53b900) at block.c:4498
#22 0x55af9a877862 in bdrv_unref (bs=0x55af9c53b900) at block.c:5866
#23 0x55af9a870837 in bdrv_root_unref_child (child=0x55af9c6c4430) at 
block.c:2684
#24 0x55af9a8da9a2 in blk_remove_bs (blk=0x55af9c547bd0) at 
block/block-backend.c:803
#25 0x55af9a8d9e54 in blk_delete (blk=0x55af9c547bd0) at 
block/block-backend.c:422
#26 0x55af9a8da0f8 in blk_unref (blk=0x55af9c547bd0) at 
block/block-backend.c:477
#27 0x55af9a86a6f1 in teardown_secondary () at tests/test-replication.c:392
#28 0x55af9a86aac1 in test_secondary_stop () at tests/test-replication.c:490
#29 0x7eff2fd7df7e in g_test_run_suite_internal () from 
/lib64/libglib-2.0.so.0
#30 0x7eff2fd7dd24 in g_test_run_suite_internal () from 
/lib64/libglib-2.0.so.0
#31 0x7eff2fd7dd24 in g_test_run_suite_internal () from 
/lib64/libglib-2.0.so.0
#32 0x7eff2fd7e46a in g_test_run_suite () from /lib64/libglib-2.0.so.0
#33 0x7eff2fd7e485 in g_test_run () from /lib64/libglib-2.0.so.0
#34 0x55af9a86b19c in main (argc=1, argv=0x7ffc73e8d088) at 
tests/test-replication.c:645


(gdb) p ((BlockBackend *)0x55af9c547bd0)->in_flight
$5 = 0
(gdb) p 
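
For reference, the ordering the patch argues for can be sketched as below
(names are hypothetical; the real wrappers in block/io.c differ in detail).
The point is that the in_flight reference is taken before the coroutine is
created or scheduled, so a drained section cannot start in the window between
scheduling and execution.

typedef struct {
    BdrvChild *child;
    QEMUIOVector *qiov;
    int64_t offset;
    int ret;
} ExampleRwCo;

static void coroutine_fn example_co_entry(void *opaque)
{
    ExampleRwCo *rwco = opaque;

    rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
                                rwco->qiov->size, rwco->qiov, 0);
}

static int example_sync_write(BdrvChild *child, int64_t offset,
                              QEMUIOVector *qiov)
{
    BlockDriverState *bs = child->bs;
    ExampleRwCo rwco = {
        .child = child, .offset = offset, .qiov = qiov, .ret = -EINPROGRESS,
    };
    Coroutine *co;

    bdrv_inc_in_flight(bs);             /* 1: before the coroutine exists */
    co = qemu_coroutine_create(example_co_entry, &rwco);
    bdrv_coroutine_enter(bs, co);       /* 2: may only *schedule* the coroutine */
    BDRV_POLL_WHILE(bs, rwco.ret == -EINPROGRESS);
    bdrv_dec_in_flight(bs);             /* 3: after the request has completed */
    return rwco.ret;
}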

Re: [PATCH v4 9/9] iotests: rename and move 169 and 199 tests

2020-05-19 Thread Kevin Wolf
On 19.05.2020 at 13:32, Vladimir Sementsov-Ogievskiy wrote:
> 19.05.2020 12:07, Kevin Wolf wrote:
> > On 18.05.2020 at 18:12, Thomas Huth wrote:
> > > On 15/05/2020 23.15, Vladimir Sementsov-Ogievskiy wrote:
> > > > Rename bitmaps migration tests and move them to tests subdirectory to
> > > > demonstrate new human-friendly test naming.
> > > > 
> > > > Signed-off-by: Vladimir Sementsov-Ogievskiy 
> > > > ---
> > > >   tests/qemu-iotests/{199 => tests/migrate-bitmaps-postcopy-test}   | 0
> > > >   .../{199.out => tests/migrate-bitmaps-postcopy-test.out}  | 0
> > > >   tests/qemu-iotests/{169 => tests/migrate-bitmaps-test}| 0
> > > >   tests/qemu-iotests/{169.out => tests/migrate-bitmaps-test.out}| 0
> > > >   4 files changed, 0 insertions(+), 0 deletions(-)
> > > >   rename tests/qemu-iotests/{199 => 
> > > > tests/migrate-bitmaps-postcopy-test} (100%)
> > > >   rename tests/qemu-iotests/{199.out => 
> > > > tests/migrate-bitmaps-postcopy-test.out} (100%)
> > > >   rename tests/qemu-iotests/{169 => tests/migrate-bitmaps-test} (100%)
> > > >   rename tests/qemu-iotests/{169.out => tests/migrate-bitmaps-test.out} 
> > > > (100%)
> > > > 
> > > > diff --git a/tests/qemu-iotests/199 
> > > > b/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test
> > > > similarity index 100%
> > > > rename from tests/qemu-iotests/199
> > > > rename to tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test
> > > > diff --git a/tests/qemu-iotests/199.out 
> > > > b/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test.out
> > > > similarity index 100%
> > > > rename from tests/qemu-iotests/199.out
> > > > rename to tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test.out
> > > > diff --git a/tests/qemu-iotests/169 
> > > > b/tests/qemu-iotests/tests/migrate-bitmaps-test
> > > > similarity index 100%
> > > > rename from tests/qemu-iotests/169
> > > > rename to tests/qemu-iotests/tests/migrate-bitmaps-test
> > > > diff --git a/tests/qemu-iotests/169.out 
> > > > b/tests/qemu-iotests/tests/migrate-bitmaps-test.out
> > > > similarity index 100%
> > > > rename from tests/qemu-iotests/169.out
> > > > rename to tests/qemu-iotests/tests/migrate-bitmaps-test.out
> > > 
> > > I like the idea ... but the path name + file names get now quite long.
> > > While you're at it, what about renaming the "qemu-iotests" directory to
> > > just "iotests" or even just "io" now?
> > 
> > Renames are always kind of painful. Do we have a real reason for the
> > rename except that the paths feel a bit long subjectively?
> > 
> > Of course, if we're renaming all files anyway, changing the directory
> > name at the same time shouldn't give any additional pain, so it would be
> > completely reasonable then. We're not renaming the test harness files,
> > though, and even only two test cases in this patch.
> > 
> > Maybe this final patch should stay RFC until we have the infrastructure
> > in and then we can have a single series that moves all tests and also
> > renames the directory? Maybe a not strictly necessary rename of the
> > tooling would be bearable in the context of a mass rename of tests.
> 
> I'm absolutely not hurrying about this thing. And actual aim of the
> series is another. I even doubt that we will mass rename the tests:
> who knows what they all test?) I don't.

Good point.

And conversely, there are a few test cases that I do know (like 026 030
040 041 055) and probably wouldn't recognise for a while after a rename.
:-)

> Still we may rename some tests, and we'll create new named tests which
> is good enough.. OK, if I resend a new version, I'll add an RFC patch
> on renaming the directory, up to maintainers, take it now or not :)

I guess a final patch to rename the directory as an RFC makes sense.
Then we can continue the discussion there and decide whether or not to
apply it without holding up the rest of the series.

I think I would be inclined to leave the name unchanged as long as we
don't have a real reason, but if people overwhelmingly think otherwise,
we can still rename.

Kevin




Re: [PATCH] ARM: PL061: Introduce N_GPIOS

2020-05-19 Thread Philippe Mathieu-Daudé

On 5/19/20 10:51 AM, Geert Uytterhoeven wrote:

Add a definition for the number of GPIO lines controlled by a PL061
instance, and use it instead of the hardcoded magic value 8.

Suggested-by: Philippe Mathieu-Daudé 
Signed-off-by: Geert Uytterhoeven 


Thanks for following up.

Reviewed-by: Philippe Mathieu-Daudé 


---
  hw/gpio/pl061.c | 12 +++-
  1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/hw/gpio/pl061.c b/hw/gpio/pl061.c
index 2a828260bdb0b946..6d3c36bc16cf9e0d 100644
--- a/hw/gpio/pl061.c
+++ b/hw/gpio/pl061.c
@@ -36,6 +36,8 @@ static const uint8_t pl061_id_luminary[12] =
  #define TYPE_PL061 "pl061"
  #define PL061(obj) OBJECT_CHECK(PL061State, (obj), TYPE_PL061)
  
+#define N_GPIOS 8

+
  typedef struct PL061State {
  SysBusDevice parent_obj;
  
@@ -62,7 +64,7 @@ typedef struct PL061State {

  uint32_t cr;
  uint32_t amsel;
  qemu_irq irq;
-qemu_irq out[8];
+qemu_irq out[N_GPIOS];
  const unsigned char *id;
  uint32_t rsvd_start; /* reserved area: [rsvd_start, 0xfcc] */
  } PL061State;
@@ -112,7 +114,7 @@ static void pl061_update(PL061State *s)
  changed = s->old_out_data ^ out;
  if (changed) {
  s->old_out_data = out;
-for (i = 0; i < 8; i++) {
+for (i = 0; i < N_GPIOS; i++) {
  mask = 1 << i;
  if (changed & mask) {
  DPRINTF("Set output %d = %d\n", i, (out & mask) != 0);
@@ -125,7 +127,7 @@ static void pl061_update(PL061State *s)
  changed = (s->old_in_data ^ s->data) & ~s->dir;
  if (changed) {
  s->old_in_data = s->data;
-for (i = 0; i < 8; i++) {
+for (i = 0; i < N_GPIOS; i++) {
  mask = 1 << i;
  if (changed & mask) {
  DPRINTF("Changed input %d = %d\n", i, (s->data & mask) != 0);
@@ -364,8 +366,8 @@ static void pl061_init(Object *obj)
  memory_region_init_io(&s->iomem, obj, &pl061_ops, s, "pl061", 0x1000);
  sysbus_init_mmio(sbd, &s->iomem);
  sysbus_init_irq(sbd, &s->irq);
-qdev_init_gpio_in(dev, pl061_set_irq, 8);
-qdev_init_gpio_out(dev, s->out, 8);
+qdev_init_gpio_in(dev, pl061_set_irq, N_GPIOS);
+qdev_init_gpio_out(dev, s->out, N_GPIOS);
  }
  
  static void pl061_class_init(ObjectClass *klass, void *data)






Re: [PATCH] xen: fix build without pci passthrough

2020-05-19 Thread Peter Maydell
On Tue, 19 May 2020 at 12:28, Roger Pau Monné  wrote:
>
> On Mon, May 11, 2020 at 02:40:43PM +0100, Anthony PERARD wrote:
> > On Mon, May 04, 2020 at 12:14:43PM +0200, Roger Pau Monne wrote:
> > > diff --git a/hw/xen/xen_pt.h b/hw/xen/xen_pt.h
> > > index 179775db7b..660dd8a008 100644
> > > --- a/hw/xen/xen_pt.h
> > > +++ b/hw/xen/xen_pt.h
> > > @@ -1,6 +1,7 @@
> > >  #ifndef XEN_PT_H
> > >  #define XEN_PT_H
> > >
> > > +#include "qemu/osdep.h"
> >
> > Why do you need osdep?
>
> For CONFIG_XEN_PCI_PASSTHROUGH IIRC.

All .c files should always include osdep as the first include
in the file, and .h files should never include osdep (we note
this in CODING_STYLE.rst).

If you added this #include to fix a compile issue that would
suggest that there's a .c file somewhere that's missing the
mandatory osdep include. I did a quick eyeball of all the files
that include xen_pt.h, though, and none of them are missing the
osdep include. So I think you should be able to simply drop the
osdep include here. If that produces an error, let us know what
fails and we can work out what's gone wrong.

thanks
-- PMM
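
A concrete illustration of that rule (file and header names here are just an
example; the convention itself is the one in CODING_STYLE.rst):

/* hw/xen/example.c -- every .c file includes osdep.h first ... */
#include "qemu/osdep.h"
#include "hw/xen/xen_pt.h"   /* ... and headers themselves never include it */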



[PATCH v3 03/10] Add VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS

2020-05-19 Thread Raphael Norwitz
This change introduces a new feature to the vhost-user protocol allowing
a backend device to specify the maximum number of ram slots it supports.

At this point, the value returned by the backend will be capped at the
maximum number of ram slots which can be supported by vhost-user, which
is currently set to 8 because of underlying protocol limitations.

The returned value will be stored inside the VhostUserState struct so
that on device reconnect we can verify that the ram slot limitation
has not decreased since the last time the device connected.

Signed-off-by: Raphael Norwitz 
Signed-off-by: Peter Turschmid 
---
 docs/interop/vhost-user.rst| 16 ++
 hw/virtio/vhost-user.c | 49 --
 include/hw/virtio/vhost-user.h |  1 +
 3 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
index 3b1b660..b3cf5c3 100644
--- a/docs/interop/vhost-user.rst
+++ b/docs/interop/vhost-user.rst
@@ -815,6 +815,7 @@ Protocol features
   #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD   12
   #define VHOST_USER_PROTOCOL_F_RESET_DEVICE 13
   #define VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS 14
+  #define VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS  15
 
 Master message types
 
@@ -1263,6 +1264,21 @@ Master message types
 
   The state.num field is currently reserved and must be set to 0.
 
+``VHOST_USER_GET_MAX_MEM_SLOTS``
+  :id: 36
+  :equivalent ioctl: N/A
+  :slave payload: u64
+
+  When the ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol
+  feature has been successfully negotiated, this message is submitted
+  by master to the slave. The slave should return the message with a
+  u64 payload containing the maximum number of memory slots for
+  QEMU to expose to the guest. At this point, the value returned
+  by the backend will be capped at the maximum number of ram slots
+  which can be supported by vhost-user. Currently that limit is set
+  at VHOST_USER_MAX_RAM_SLOTS = 8 because of underlying protocol
+  limitations.
+
 Slave message types
 ---
 
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index dacf5bb..15406a7 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -59,6 +59,8 @@ enum VhostUserProtocolFeature {
 VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
 VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
 VHOST_USER_PROTOCOL_F_RESET_DEVICE = 13,
+/* Feature 14 reserved for VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS. */
+VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15,
 VHOST_USER_PROTOCOL_F_MAX
 };
 
@@ -100,6 +102,8 @@ typedef enum VhostUserRequest {
 VHOST_USER_SET_INFLIGHT_FD = 32,
 VHOST_USER_GPU_SET_SOCKET = 33,
 VHOST_USER_RESET_DEVICE = 34,
+/* Message number 35 reserved for VHOST_USER_VRING_KICK. */
+VHOST_USER_GET_MAX_MEM_SLOTS = 36,
 VHOST_USER_MAX
 } VhostUserRequest;
 
@@ -894,6 +898,23 @@ static int vhost_user_set_owner(struct vhost_dev *dev)
 return 0;
 }
 
+static int vhost_user_get_max_memslots(struct vhost_dev *dev,
+   uint64_t *max_memslots)
+{
+uint64_t backend_max_memslots;
+int err;
+
+err = vhost_user_get_u64(dev, VHOST_USER_GET_MAX_MEM_SLOTS,
+ &backend_max_memslots);
+if (err < 0) {
+return err;
+}
+
+*max_memslots = backend_max_memslots;
+
+return 0;
+}
+
 static int vhost_user_reset_device(struct vhost_dev *dev)
 {
 VhostUserMsg msg = {
@@ -1391,7 +1412,7 @@ static int 
vhost_user_postcopy_notifier(NotifierWithReturn *notifier,
 
 static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque)
 {
-uint64_t features, protocol_features;
+uint64_t features, protocol_features, ram_slots;
 struct vhost_user *u;
 int err;
 
@@ -1453,6 +1474,27 @@ static int vhost_user_backend_init(struct vhost_dev 
*dev, void *opaque)
  "slave-req protocol features.");
 return -1;
 }
+
+/* get max memory regions if backend supports configurable RAM slots */
+if (!virtio_has_feature(dev->protocol_features,
+VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS)) {
+u->user->memory_slots = VHOST_MEMORY_MAX_NREGIONS;
+} else {
+err = vhost_user_get_max_memslots(dev, &ram_slots);
+if (err < 0) {
+return err;
+}
+
+if (ram_slots < u->user->memory_slots) {
+error_report("The backend specified a max ram slots limit "
+ "of %lu, when the prior validated limit was %d. "
+ "This limit should never decrease.", ram_slots,
+ u->user->memory_slots);
+return -1;
+}
+
+u->user->memory_slots = MIN(ram_slots, VHOST_MEMORY_MAX_NREGIONS);
+}
 }
 
 if 
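
For the slave side of this negotiation, a minimal sketch of how a
libvhost-user backend could answer the new message (the handler is
illustrative; vmsg_set_reply_u64() and VHOST_USER_MAX_RAM_SLOTS follow the
conventions used later in this series):

static bool
example_get_max_mem_slots(VuDev *dev, VhostUserMsg *vmsg)
{
    /* Advertise how many memory regions this backend can map at once;
     * QEMU additionally caps the value at its own protocol limit. */
    vmsg_set_reply_u64(vmsg, VHOST_USER_MAX_RAM_SLOTS);
    return true;   /* a reply is sent back to the master */
}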

[PATCH v3 04/10] Transmit vhost-user memory regions individually

2020-05-19 Thread Raphael Norwitz
With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS
protocol feature has been negotiated, Qemu no longer sends the backend
all the memory regions in a single message. Rather, when the memory
tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and
VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map
and/or unmap, instead of sending all the regions in one fixed-size
VHOST_USER_SET_MEM_TABLE message.

The vhost_user struct maintains a shadow state of the VM’s memory
regions. When the memory tables are modified, the
vhost_user_set_mem_table() function compares the new device memory state
to the shadow state and only sends regions which need to be unmapped or
mapped in. The regions which must be unmapped are sent first, followed
by the new regions to be mapped in. After all the messages have been
sent, the shadow state is set to the current virtual device state.

Existing backends which do not support
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected.

Signed-off-by: Raphael Norwitz 
Signed-off-by: Swapnil Ingle 
Signed-off-by: Peter Turschmid 
Suggested-by: Mike Cui 
---
 docs/interop/vhost-user.rst |  33 ++-
 hw/virtio/vhost-user.c  | 507 +---
 2 files changed, 466 insertions(+), 74 deletions(-)

diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
index b3cf5c3..037eefa 100644
--- a/docs/interop/vhost-user.rst
+++ b/docs/interop/vhost-user.rst
@@ -1276,8 +1276,37 @@ Master message types
   QEMU to expose to the guest. At this point, the value returned
   by the backend will be capped at the maximum number of ram slots
   which can be supported by vhost-user. Currently that limit is set
-  at VHOST_USER_MAX_RAM_SLOTS = 8 because of underlying protocol
-  limitations.
+  at VHOST_USER_MAX_RAM_SLOTS = 8.
+
+``VHOST_USER_ADD_MEM_REG``
+  :id: 37
+  :equivalent ioctl: N/A
+  :slave payload: memory region
+
+  When the ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol
+  feature has been successfully negotiated, this message is submitted
+  by the master to the slave. The message payload contains a memory
+  region descriptor struct, describing a region of guest memory which
+  the slave device must map in. When the
+  ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol feature has
+  been successfully negotiated, along with the
+  ``VHOST_USER_REM_MEM_REG`` message, this message is used to set and
+  update the memory tables of the slave device.
+
+``VHOST_USER_REM_MEM_REG``
+  :id: 38
+  :equivalent ioctl: N/A
+  :slave payload: memory region
+
+  When the ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol
+  feature has been successfully negotiated, this message is submitted
+  by the master to the slave. The message payload contains a memory
+  region descriptor struct, describing a region of guest memory which
+  the slave device must unmap. When the
+  ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol feature has
+  been successfully negotiated, along with the
+  ``VHOST_USER_ADD_MEM_REG`` message, this message is used to set and
+  update the memory tables of the slave device.
 
 Slave message types
 ---
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 15406a7..4af8476 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -104,6 +104,8 @@ typedef enum VhostUserRequest {
 VHOST_USER_RESET_DEVICE = 34,
 /* Message number 35 reserved for VHOST_USER_VRING_KICK. */
 VHOST_USER_GET_MAX_MEM_SLOTS = 36,
+VHOST_USER_ADD_MEM_REG = 37,
+VHOST_USER_REM_MEM_REG = 38,
 VHOST_USER_MAX
 } VhostUserRequest;
 
@@ -128,6 +130,11 @@ typedef struct VhostUserMemory {
 VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
 } VhostUserMemory;
 
+typedef struct VhostUserMemRegMsg {
+uint32_t padding;
+VhostUserMemoryRegion region;
+} VhostUserMemRegMsg;
+
 typedef struct VhostUserLog {
 uint64_t mmap_size;
 uint64_t mmap_offset;
@@ -186,6 +193,7 @@ typedef union {
 struct vhost_vring_state state;
 struct vhost_vring_addr addr;
 VhostUserMemory memory;
+VhostUserMemRegMsg mem_reg;
 VhostUserLog log;
 struct vhost_iotlb_msg iotlb;
 VhostUserConfig config;
@@ -226,6 +234,16 @@ struct vhost_user {
 
 /* True once we've entered postcopy_listen */
 bool   postcopy_listen;
+
+/* Our current regions */
+int num_shadow_regions;
+struct vhost_memory_region shadow_regions[VHOST_MEMORY_MAX_NREGIONS];
+};
+
+struct scrub_regions {
+struct vhost_memory_region *region;
+int reg_idx;
+int fd_idx;
 };
 
 static bool ioeventfd_enabled(void)
@@ -488,8 +506,329 @@ static int vhost_user_fill_set_mem_table_msg(struct 
vhost_user *u,
 return 1;
 }
 
+static inline bool reg_equal(struct vhost_memory_region *shadow_reg,
+ struct vhost_memory_region *vdev_reg)
+{
+return 
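
The shadow-state comparison described in the commit message can be pictured
with this simplified sketch (illustrative only; it reuses reg_equal() and the
shadow_regions fields added above, while the real code also tracks fds and
postcopy offsets):

static void example_diff_regions(struct vhost_dev *dev, struct vhost_user *u)
{
    int i, j;

    /* Shadow regions missing from the new state must be unmapped. */
    for (i = 0; i < u->num_shadow_regions; i++) {
        bool found = false;

        for (j = 0; j < dev->mem->nregions; j++) {
            if (reg_equal(&u->shadow_regions[i], &dev->mem->regions[j])) {
                found = true;
                break;
            }
        }
        if (!found) {
            /* send VHOST_USER_REM_MEM_REG for u->shadow_regions[i] */
        }
    }

    /* Regions only present in the new state must be mapped in. */
    for (j = 0; j < dev->mem->nregions; j++) {
        bool found = false;

        for (i = 0; i < u->num_shadow_regions; i++) {
            if (reg_equal(&u->shadow_regions[i], &dev->mem->regions[j])) {
                found = true;
                break;
            }
        }
        if (!found) {
            /* send VHOST_USER_ADD_MEM_REG for dev->mem->regions[j] */
        }
    }
}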

[PATCH v3 02/10] Add vhost-user helper to get MemoryRegion data

2020-05-19 Thread Raphael Norwitz
When setting the memory tables, qemu uses a memory region's userspace
address to look up the region's MemoryRegion struct. Among other things,
the MemoryRegion contains the region's offset and associated file
descriptor, all of which need to be sent to the backend.

With VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS, this logic will be
needed in multiple places, so before feature support is added it
should be moved to a helper function.

This helper is also used to simplify the vhost_user_can_merge()
function.

Signed-off-by: Raphael Norwitz 
---
 hw/virtio/vhost-user.c | 25 +++--
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index ee6d1ed..dacf5bb 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -407,6 +407,18 @@ static int vhost_user_set_log_base(struct vhost_dev *dev, 
uint64_t base,
 return 0;
 }
 
+static MemoryRegion *vhost_user_get_mr_data(uint64_t addr, ram_addr_t *offset,
+int *fd)
+{
+MemoryRegion *mr;
+
+assert((uintptr_t)addr == addr);
+mr = memory_region_from_host((void *)(uintptr_t)addr, offset);
+*fd = memory_region_get_fd(mr);
+
+return mr;
+}
+
 static void vhost_user_fill_msg_region(VhostUserMemoryRegion *dst,
struct vhost_memory_region *src)
 {
@@ -432,10 +444,7 @@ static int vhost_user_fill_set_mem_table_msg(struct 
vhost_user *u,
 for (i = 0; i < dev->mem->nregions; ++i) {
 reg = dev->mem->regions + i;
 
-assert((uintptr_t)reg->userspace_addr == reg->userspace_addr);
-mr = memory_region_from_host((void *)(uintptr_t)reg->userspace_addr,
- &offset);
-fd = memory_region_get_fd(mr);
+mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
 if (fd > 0) {
 if (track_ramblocks) {
 assert(*fd_num < VHOST_MEMORY_MAX_NREGIONS);
@@ -1550,13 +1559,9 @@ static bool vhost_user_can_merge(struct vhost_dev *dev,
 {
 ram_addr_t offset;
 int mfd, rfd;
-MemoryRegion *mr;
-
-mr = memory_region_from_host((void *)(uintptr_t)start1, &offset);
-mfd = memory_region_get_fd(mr);
 
-mr = memory_region_from_host((void *)(uintptr_t)start2, &offset);
-rfd = memory_region_get_fd(mr);
+(void)vhost_user_get_mr_data(start1, &offset, &mfd);
+(void)vhost_user_get_mr_data(start2, &offset, &rfd);
 
 return mfd == rfd;
 }
-- 
1.8.3.1




[PATCH v3 06/10] Refactor out libvhost-user fault generation logic

2020-05-19 Thread Raphael Norwitz
In libvhost-user, the incoming postcopy migration path for setting the
backend's memory tables has become convoluted. In particular, the logic
which starts generating faults once the final ACK has been received from
qemu can be moved to a separate function. This simplifies the code
substantially.

This logic will also be needed by the postcopy path once the
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS feature is supported.

Signed-off-by: Raphael Norwitz 
---
 contrib/libvhost-user/libvhost-user.c | 147 ++
 1 file changed, 79 insertions(+), 68 deletions(-)

diff --git a/contrib/libvhost-user/libvhost-user.c 
b/contrib/libvhost-user/libvhost-user.c
index 3bca996..cccfa22 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -584,6 +584,84 @@ map_ring(VuDev *dev, VuVirtq *vq)
 }
 
 static bool
+generate_faults(VuDev *dev) {
+int i;
+for (i = 0; i < dev->nregions; i++) {
+VuDevRegion *dev_region = &dev->regions[i];
+int ret;
+#ifdef UFFDIO_REGISTER
+/*
+ * We should already have an open ufd. Mark each memory
+ * range as ufd.
+ * Discard any mapping we have here; note I can't use MADV_REMOVE
+ * or fallocate to make the hole since I don't want to lose
+ * data that's already arrived in the shared process.
+ * TODO: How to do hugepage
+ */
+ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
+  dev_region->size + dev_region->mmap_offset,
+  MADV_DONTNEED);
+if (ret) {
+fprintf(stderr,
+"%s: Failed to madvise(DONTNEED) region %d: %s\n",
+__func__, i, strerror(errno));
+}
+/*
+ * Turn off transparent hugepages so we don't lose wakeups
+ * in neighbouring pages.
+ * TODO: Turn this back on later.
+ */
+ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
+  dev_region->size + dev_region->mmap_offset,
+  MADV_NOHUGEPAGE);
+if (ret) {
+/*
+ * Note: This can happen legally on kernels that are configured
+ * without madvise'able hugepages
+ */
+fprintf(stderr,
+"%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n",
+__func__, i, strerror(errno));
+}
+struct uffdio_register reg_struct;
+reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
+reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
+reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
+
+if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
+vu_panic(dev, "%s: Failed to userfault region %d "
+  "@%p + size:%zx offset: %zx: (ufd=%d)%s\n",
+ __func__, i,
+ dev_region->mmap_addr,
+ dev_region->size, dev_region->mmap_offset,
+ dev->postcopy_ufd, strerror(errno));
+return false;
+}
+if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
+vu_panic(dev, "%s Region (%d) doesn't support COPY",
+ __func__, i);
+return false;
+}
+DPRINT("%s: region %d: Registered userfault for %"
+   PRIx64 " + %" PRIx64 "\n", __func__, i,
+   (uint64_t)reg_struct.range.start,
+   (uint64_t)reg_struct.range.len);
+/* Now it's registered we can let the client at it */
+if (mprotect((void *)(uintptr_t)dev_region->mmap_addr,
+ dev_region->size + dev_region->mmap_offset,
+ PROT_READ | PROT_WRITE)) {
+vu_panic(dev, "failed to mprotect region %d for postcopy (%s)",
+ i, strerror(errno));
+return false;
+}
+/* TODO: Stash 'zero' support flags somewhere */
+#endif
+}
+
+return true;
+}
+
+static bool
 vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
 {
 int i;
@@ -655,74 +733,7 @@ vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg 
*vmsg)
 }
 
 /* OK, now we can go and register the memory and generate faults */
-for (i = 0; i < dev->nregions; i++) {
-VuDevRegion *dev_region = &dev->regions[i];
-int ret;
-#ifdef UFFDIO_REGISTER
-/* We should already have an open ufd. Mark each memory
- * range as ufd.
- * Discard any mapping we have here; note I can't use MADV_REMOVE
- * or fallocate to make the hole since I don't want to lose
- * data that's already arrived in the shared process.
- * TODO: How to do hugepage
- */
-ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
-  dev_region->size + dev_region->mmap_offset,
-  MADV_DONTNEED);
-if (ret) {
- 

[PATCH v3 08/10] Support adding individual regions in libvhost-user

2020-05-19 Thread Raphael Norwitz
When the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS is enabled, qemu will
transmit memory regions to a backend individually using the new message
VHOST_USER_ADD_MEM_REG. With this change vhost-user backends built with
libvhost-user can now map in new memory regions when VHOST_USER_ADD_MEM_REG
messages are received.

Qemu only sends VHOST_USER_ADD_MEM_REG messages when the
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS feature is negotiated, and
since it is not yet supported in libvhost-user, this new functionality
is not yet used.

Signed-off-by: Raphael Norwitz 
---
 contrib/libvhost-user/libvhost-user.c | 103 ++
 contrib/libvhost-user/libvhost-user.h |   7 +++
 2 files changed, 110 insertions(+)

diff --git a/contrib/libvhost-user/libvhost-user.c 
b/contrib/libvhost-user/libvhost-user.c
index 9f039b7..2c2a8d9 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -138,6 +138,7 @@ vu_request_to_string(unsigned int req)
 REQ(VHOST_USER_GPU_SET_SOCKET),
 REQ(VHOST_USER_VRING_KICK),
 REQ(VHOST_USER_GET_MAX_MEM_SLOTS),
+REQ(VHOST_USER_ADD_MEM_REG),
 REQ(VHOST_USER_MAX),
 };
 #undef REQ
@@ -663,6 +664,106 @@ generate_faults(VuDev *dev) {
 }
 
 static bool
+vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
+int i;
+bool track_ramblocks = dev->postcopy_listening;
+VhostUserMemoryRegion *msg_region = &vmsg->payload.memreg.region;
+VuDevRegion *dev_region = &dev->regions[dev->nregions];
+void *mmap_addr;
+
+/*
+ * If we are in postcopy mode and we receive a u64 payload with a 0 value
+ * we know all the postcopy client bases have been received, and we
+ * should start generating faults.
+ */
+if (track_ramblocks &&
+vmsg->size == sizeof(vmsg->payload.u64) &&
+vmsg->payload.u64 == 0) {
+(void)generate_faults(dev);
+return false;
+}
+
+DPRINT("Adding region: %d\n", dev->nregions);
+DPRINT("guest_phys_addr: 0x%016"PRIx64"\n",
+   msg_region->guest_phys_addr);
+DPRINT("memory_size: 0x%016"PRIx64"\n",
+   msg_region->memory_size);
+DPRINT("userspace_addr   0x%016"PRIx64"\n",
+   msg_region->userspace_addr);
+DPRINT("mmap_offset  0x%016"PRIx64"\n",
+   msg_region->mmap_offset);
+
+dev_region->gpa = msg_region->guest_phys_addr;
+dev_region->size = msg_region->memory_size;
+dev_region->qva = msg_region->userspace_addr;
+dev_region->mmap_offset = msg_region->mmap_offset;
+
+/*
+ * We don't use offset argument of mmap() since the
+ * mapped address has to be page aligned, and we use huge
+ * pages.
+ */
+if (track_ramblocks) {
+/*
+ * In postcopy we're using PROT_NONE here to catch anyone
+ * accessing it before we userfault.
+ */
+mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
+ PROT_NONE, MAP_SHARED,
+ vmsg->fds[0], 0);
+} else {
+mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
+ PROT_READ | PROT_WRITE, MAP_SHARED, vmsg->fds[0],
+ 0);
+}
+
+if (mmap_addr == MAP_FAILED) {
+vu_panic(dev, "region mmap error: %s", strerror(errno));
+} else {
+dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
+DPRINT("mmap_addr:   0x%016"PRIx64"\n",
+   dev_region->mmap_addr);
+}
+
+close(vmsg->fds[0]);
+
+if (track_ramblocks) {
+/*
+ * Return the address to QEMU so that it can translate the ufd
+ * fault addresses back.
+ */
+msg_region->userspace_addr = (uintptr_t)(mmap_addr +
+ dev_region->mmap_offset);
+
+/* Send the message back to qemu with the addresses filled in. */
+vmsg->fd_num = 0;
+if (!vu_send_reply(dev, dev->sock, vmsg)) {
+vu_panic(dev, "failed to respond to add-mem-region for postcopy");
+return false;
+}
+
+DPRINT("Successfully added new region in postcopy\n");
+dev->nregions++;
+return false;
+
+} else {
+for (i = 0; i < dev->max_queues; i++) {
+if (dev->vq[i].vring.desc) {
+if (map_ring(dev, &dev->vq[i])) {
+vu_panic(dev, "remapping queue %d for new memory region",
+ i);
+}
+}
+}
+
+DPRINT("Successfully added new region\n");
+dev->nregions++;
+vmsg_set_reply_u64(vmsg, 0);
+return true;
+}
+}
+
+static bool
 vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
 {
 int i;
@@ -1668,6 +1769,8 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
 return vu_handle_vring_kick(dev, vmsg);
 case VHOST_USER_GET_MAX_MEM_SLOTS:
 return 

Re: [RFC PATCH 0/3] block: Synchronous bdrv_*() from coroutine in different AioContext

2020-05-19 Thread Vladimir Sementsov-Ogievskiy

14.05.2020 17:26, Kevin Wolf wrote:

Am 14.05.2020 um 15:21 hat Thomas Lamprecht geschrieben:

On 5/12/20 4:43 PM, Kevin Wolf wrote:

Stefan (Reiter), after looking a bit closer at this, I think there is no
bug in QEMU, but the bug is in your coroutine code that calls block
layer functions without moving into the right AioContext first. I've
written this series anyway as it potentially makes the life of callers
easier and would probably make your buggy code correct.



However, it doesn't feel right to commit something like patch 2 without
having a user for it. Is there a reason why you can't upstream your
async snapshot code?


I mean, I understand what you mean, but it would IMO make the interface so
much easier to use; if one wants to explicitly schedule it beforehand, they
still can. But that would open the way for two styles of doing things, and I'm
not sure whether that would be seen as bad. The assert from patch 3/3 alone
would already help a lot, though.


I think patches 1 and 3 are good to be committed either way if people
think they are useful. They make sense without the async snapshot code.

My concern with the interface in patch 2 is both that it could give
people a false sense of security and that it would be tempting to write
inefficient code.

Usually, you won't have just a single call into the block layer for a
given block node, but you'll perform multiple operations. Switching to
the target context once rather than switching back and forth in every
operation is obviously more efficient.

But chances are that even if one of these functions is bdrv_flush(),
which now works correctly from a different thread, you might need
another function that doesn't implement the same magic. So you always
need to be aware of which functions support cross-context calls and
which ones don't.

I feel we'd see a few bugs related to this.
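
As a concrete sketch of the "switch once, then do the work" pattern described
above (the function and its shape are illustrative, not part of this series;
only aio_co_schedule()/qemu_coroutine_yield() are existing primitives):

    #include "qemu/osdep.h"
    #include "qemu/coroutine.h"
    #include "block/aio.h"
    #include "block/block.h"

    /* Hop into the node's AioContext once, perform several block-layer
     * calls there, then hop back to the caller's context. */
    static void coroutine_fn do_block_ops_in_node_ctx(BlockDriverState *bs,
                                                      AioContext *home_ctx)
    {
        AioContext *target = bdrv_get_aio_context(bs);

        if (target != home_ctx) {
            aio_co_schedule(target, qemu_coroutine_self());
            qemu_coroutine_yield();        /* resumes in the target context */
        }

        bdrv_flush(bs);                    /* all calls run in one context */
        /* ... further block-layer calls on bs ... */

        if (target != home_ctx) {
            aio_co_schedule(home_ctx, qemu_coroutine_self());
            qemu_coroutine_yield();        /* back to the original context */
        }
    }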


Regarding upstreaming, there was some historical attempt to upstream it
from Dietmar, but in the time frame of ~ 8 to 10 years ago or so.
I'm not quite sure why it didn't go through back then; I'll see if I can
find some time to search the mailing list archive.

We'd naturally be open and glad to upstream it; what it effectively
allows us to do is avoid blocking the VM too much while snapshotting it
live.


Yes, there is no doubt that this is useful functionality. There has been
talk about this every now and then, but I don't think we ever got to a
point where it actually could be implemented.

Vladimir, I seem to remember you (or someone else from your team?) were
interested in async snapshots as well a while ago?


Den is working on this (add him to CC)




I pushed a tree[0] with mostly just that specific code squashed together (hope
I did not break anything), most of the actual code is in commit [1].
It'd be cleaned up a bit and checked for coding style issues, but works good
here.

Anyway, thanks for your help and pointers!

[0]: https://github.com/ThomasLamprecht/qemu/tree/savevm-async
[1]: 
https://github.com/ThomasLamprecht/qemu/commit/ffb9531f370ef0073e4b6f6021f4c47ccd702121


It doesn't even look that bad in terms of patch size. I had imagined it
a bit larger.

But it seems this is not really just an async 'savevm' (which would save
the VM state in a qcow2 file), but you store the state in a separate
raw file. What is the difference between this and regular migration into
a file?

I remember people talking about how snapshotting can store things in a
way that a normal migration stream can't do, like overwriting outdated
RAM state instead of just appending the new state, but you don't seem to
implement something like this.

Kevin




--
Best regards,
Vladimir



Re: [PATCH v4 9/9] iotests: rename and move 169 and 199 tests

2020-05-19 Thread Vladimir Sementsov-Ogievskiy

19.05.2020 12:07, Kevin Wolf wrote:

Am 18.05.2020 um 18:12 hat Thomas Huth geschrieben:

On 15/05/2020 23.15, Vladimir Sementsov-Ogievskiy wrote:

Rename bitmaps migration tests and move them to tests subdirectory to
demonstrate new human-friendly test naming.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  tests/qemu-iotests/{199 => tests/migrate-bitmaps-postcopy-test}   | 0
  .../{199.out => tests/migrate-bitmaps-postcopy-test.out}  | 0
  tests/qemu-iotests/{169 => tests/migrate-bitmaps-test}| 0
  tests/qemu-iotests/{169.out => tests/migrate-bitmaps-test.out}| 0
  4 files changed, 0 insertions(+), 0 deletions(-)
  rename tests/qemu-iotests/{199 => tests/migrate-bitmaps-postcopy-test} (100%)
  rename tests/qemu-iotests/{199.out => 
tests/migrate-bitmaps-postcopy-test.out} (100%)
  rename tests/qemu-iotests/{169 => tests/migrate-bitmaps-test} (100%)
  rename tests/qemu-iotests/{169.out => tests/migrate-bitmaps-test.out} (100%)

diff --git a/tests/qemu-iotests/199 
b/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test
similarity index 100%
rename from tests/qemu-iotests/199
rename to tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test
diff --git a/tests/qemu-iotests/199.out 
b/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test.out
similarity index 100%
rename from tests/qemu-iotests/199.out
rename to tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test.out
diff --git a/tests/qemu-iotests/169 
b/tests/qemu-iotests/tests/migrate-bitmaps-test
similarity index 100%
rename from tests/qemu-iotests/169
rename to tests/qemu-iotests/tests/migrate-bitmaps-test
diff --git a/tests/qemu-iotests/169.out 
b/tests/qemu-iotests/tests/migrate-bitmaps-test.out
similarity index 100%
rename from tests/qemu-iotests/169.out
rename to tests/qemu-iotests/tests/migrate-bitmaps-test.out


I like the idea ... but the path names + file names now get quite long.
While you're at it, what about renaming the "qemu-iotests" directory to
just "iotests" or even just "io" now?


Renames are always kind of painful. Do we have a real reason for the
rename except that the paths feel a bit long subjectively?

Of course, if we're renaming all files anyway, changing the directory
name at the same time shouldn't give any additional pain, so it would be
completely reasonable then. We're not renaming the test harness files,
though, and even only two test cases in this patch.

Maybe this final patch should stay RFC until we have the infrastructure
in and then we can have a single series that moves all tests and also
renames the directory? Maybe a not strictly necessary rename of the
tooling would be bearable in the context of a mass rename of tests.



I'm absolutely not in a hurry about this, and the actual aim of the series is
something else. I even doubt that we will mass-rename the tests: who knows what
they all test? I don't. Still, we may rename some tests, and we'll create new,
named tests, which is good enough. OK, if I resend a new version, I'll add an
RFC patch renaming the directory; it's up to the maintainers whether to take it
now or not :)

--
Best regards,
Vladimir



Re: [PATCH 04/24] aspeed: Don't create unwanted "ftgmac100", "aspeed-mmi" devices

2020-05-19 Thread Philippe Mathieu-Daudé

On 5/19/20 7:45 AM, Markus Armbruster wrote:

"Andrew Jeffery"  writes:


On Mon, 18 May 2020, at 21:49, Cédric Le Goater wrote:

On 5/18/20 7:03 AM, Markus Armbruster wrote:

These devices are optional, and controlled by @nb_nics.
aspeed_soc_ast2600_init() and aspeed_soc_init() create the maximum
supported number.  aspeed_soc_ast2600_realize() and
aspeed_soc_realize() realize only the wanted number.  Works, although
it can leave unrealized devices hanging around in the QOM composition
tree.  Affects machines ast2500-evb, ast2600-evb, palmetto-bmc,
romulus-bmc, swift-bmc, tacoma-bmc, and witherspoon-bmc.

Make the init functions create only the wanted ones.  Visible in "info
qom-tree"; here's the change for ast2600-evb:

  /machine (ast2600-evb-machine)
[...]
/soc (ast2600-a1)
  [...]
  /ftgmac100[0] (ftgmac100)
/ftgmac100[0] (qemu:memory-region)
 -/ftgmac100[1] (ftgmac100)
 -/ftgmac100[2] (ftgmac100)
 -/ftgmac100[3] (ftgmac100)
  /gpio (aspeed.gpio-ast2600)
  [...]
  /mii[0] (aspeed-mmi)
/aspeed-mmi[0] (qemu:memory-region)
 -/mii[1] (aspeed-mmi)
 -/mii[2] (aspeed-mmi)
 -/mii[3] (aspeed-mmi)
  /rtc (aspeed.rtc)

I'm not sure creating @nb_nics devices makes sense.  How many does the
physical chip provide?


The AST2400 and AST2500 SoCs have 2 MACs and the AST2600 has 4. Each machine
defines the one it uses, generally MAC0, but the tacoma board uses MAC3.

Shouldn't the model reflect the real address space independently from
the NIC backends defined on the command line ?


If the SoC has N ftgmac100 peripherals, you need to mmio-map all N
instances, else your guest will get MEMTX_DECODE_ERROR trying to access
them, regardless of which NIC backends are plugged on the command line.




That's my feeling too, though I'm not sure what to make of the unrealised 
devices
in the QOM tree. Does it matter? It hasn't bothered me.


Depending on what the initialization code does, unrealized devices can
be anything from a little wasted memory to an open bear trap.  I don't
really expect the latter extreme in the code, as I expect bear traps to
quickly catch the developer that set them.

I guess the unrealized devices cleaned up in this patch did no actual
harm.

Still, it's an unhealthy state, and that's why I clean it up.  "[PATCH
24/24] qdev: Assert onboard devices all get realized properly" should
ensure we stay clean.







Re: [PATCH 11/24] pnv/phb4: Bury unwanted "pnv-phb4-pec-stack" devices

2020-05-19 Thread Markus Armbruster
Cédric Le Goater  writes:

> On 5/18/20 7:03 AM, Markus Armbruster wrote:
>> The number of stacks is controlled by property "num-stacks".
>> pnv_pec_instance_init() creates the maximum supported number, because
>> the property has not been set then.  pnv_pec_realize() realizes only
>> the wanted number.  Works, although it can leave unrealized devices
>> hanging around in the QOM composition tree.  Affects machine powernv9.
>
> I have used this pattern in many models. Is there a better one ?

The pattern is just fine, we just need to unparent any devices that turn
out to be unwanted.

Of course, when we already know what's wanted at instance_init time,
there's no reason for creating more.
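
A minimal sketch of that cleanup (the PEC type, field names and the
PHB4_PEC_MAX_STACKS bound are assumptions based on this thread, not verified
against the tree):

    #include "qemu/osdep.h"
    #include "hw/pci-host/pnv_phb4.h"   /* assumed location of the PEC types */

    /* In realize, unparent the stack children that instance_init created
     * beyond the configured "num-stacks" value. */
    static void pec_prune_unused_stacks(PnvPhb4PecState *pec)
    {
        for (int i = pec->num_stacks; i < PHB4_PEC_MAX_STACKS; i++) {
            object_unparent(OBJECT(&pec->stacks[i]));
        }
    }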




Re: [PATCH 02/24] display/xlnx_dp: Fix to realize "i2c-ddc" and "aux-to-i2c-bridge"

2020-05-19 Thread Markus Armbruster
Peter Maydell  writes:

> On Tue, 19 May 2020 at 06:09, Markus Armbruster  wrote:
>> I figure the "device becomes real only on realize" thing is actually
>> more myth than thing.
>
> It's not a myth, it's an API guarantee thing. If you don't realize
> the device you create before you use it then you're in the world of
> unspecified behaviour, and anything could happen: maybe it works,
> maybe it doesn't, maybe it works today and breaks tomorrow.

It's a myth in the sense "we want it to be that way, but it often
ain't" :)

Of course you're right in that it is also a case of "use the interface the
specified way, or else".




Re: [PULL v2 00/52] Block layer patches

2020-05-19 Thread Peter Maydell
On Mon, 18 May 2020 at 18:07, Kevin Wolf  wrote:
>
> The following changes since commit debe78ce14bf8f8940c2bdf3ef387505e9e035a9:
>
>   Merge remote-tracking branch 'remotes/rth/tags/pull-fpu-20200515' into 
> staging (2020-05-15 19:51:16 +0100)
>
> are available in the Git repository at:
>
>   git://repo.or.cz/qemu/kevin.git tags/for-upstream
>
> for you to fetch changes up to 4cdd0a774dc35b2ffe6ddb634e0c431f17dfe07e:
>
>   hw: Use QEMU_IS_ALIGNED() on parallel flash block size (2020-05-18 19:05:25 
> +0200)
>
> 
> Block layer patches:
>
> - Introduce real BdrvChildRole
> - blk/bdrv_make_empty() functions instead of calling callbacks directly
> - mirror: Make sure that source and target size match
> - block-copy: Fix uninitialized variable
> - block/replication: Avoid cancelling the job twice
> - ahci: Log lost IRQs
> - iotests: Run pylint and mypy in a testcase
> - iotests: log messages from notrun()


Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/5.1
for any user-visible changes.

-- PMM



Re: [PATCH] linux-user/mmap.c: fix integer underflow in target_mremap

2020-05-19 Thread Philippe Mathieu-Daudé

Hi Jonathan.

On 5/19/20 10:11 AM, Stefano Garzarella wrote:

Hi Jonathan,
thanks for the patch!

CCing Riku and Laurent.

On Mon, May 18, 2020 at 12:13:41PM -0600, Jonathan Marler wrote:

Been a few more days.  Not sure how often I should be pinging.  If this is
too much to ping every few days let me know.


Pinging every week is fine. The problem here, as noticed by Stefano, is 
you forgot to Cc the maintainers, so they surely missed your patch.


Last time Riku sent an email to qemu-devel was more than 2 years ago, 
letting Laurent second him, then went MIA:

https://www.mail-archive.com/qemu-devel@nongnu.org/msg507843.html

I'd say count 1 week starting today.

Regards,

Phil.



It's not too much, but next time it's better to CC the maintainers.
You can use 'scripts/get_maintainer.pl' to get the list of maintainers
and reviewers.

Please take a look at https://wiki.qemu.org/Contribute/SubmitAPatch



On Fri, May 15, 2020 at 7:36 AM Jonathan Marler 
wrote:


Been a couple weeks, checking to see if anyone has looked at this.

On Sat, May 2, 2020 at 5:43 PM Jonathan Marler 
wrote:


FYI, I applied this patch to the qemu build that zig uses to run
non-native tests (
https://github.com/ziglang/qemu-static/blob/master/patch/mremap-underflow.diff
)

After applying it, my new code that calls mremap now passes,
whereas before the fix I was getting a segfault.

On Sat, May 2, 2020 at 10:12 AM Jonathan Marler 
wrote:


Fixes: https://bugs.launchpad.net/bugs/1876373


should be "Buglink: https://bugs.launchpad.net/bugs/1876373;



This code path in mmap occurs when the size of a mapping is decreased with
mremap.  When a section of pages is shrunk, qemu calls mmap_reserve on the
pages that were released.  However, it has the diff operation reversed,
subtracting the larger old_size from the smaller new_size.  Instead, it
should be subtracting the smaller new_size from the larger old_size.  You
can also see in the previous line of the change that this mmap_reserve call
only occurs when old_size > new_size.
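
A small worked example of the underflow (sizes hypothetical, with abi_ulong
modelled as a 32-bit unsigned type for a 32-bit target):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t old_size = 0x8000;           /* 32 KiB mapping, shrunk ...   */
        uint32_t new_size = 0x4000;           /* ... to 16 KiB by mremap      */

        uint32_t wrong = new_size - old_size; /* wraps around to 0xffffc000   */
        uint32_t right = old_size - new_size; /* 0x4000: bytes actually freed */

        printf("wrong=0x%" PRIx32 " right=0x%" PRIx32 "\n", wrong, right);
        return 0;
    }

With the operands swapped, mmap_reserve() is asked to reserve an enormous
range instead of the 16 KiB that was actually released.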


Please break the lines of the commit message (max 76 charactes per line):
https://wiki.qemu.org/Contribute/SubmitAPatch#Write_a_meaningful_commit_message

Thanks,
Stefano



Signed-off-by: Jonathan Marler 
---
  linux-user/mmap.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index e378033797..caab62909e 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -708,7 +708,7 @@ abi_long target_mremap(abi_ulong old_addr, abi_ulong
old_size,
  if (prot == 0) {
  host_addr = mremap(g2h(old_addr), old_size, new_size,
flags);
  if (host_addr != MAP_FAILED && reserved_va && old_size >
new_size) {
-mmap_reserve(old_addr + old_size, new_size - old_size);
+mmap_reserve(old_addr + old_size, old_size - new_size);
  }
  } else {
  errno = ENOMEM;
--
2.23.1










Re: [PATCH] target/i386: Fix OUTL debug output

2020-05-19 Thread Philippe Mathieu-Daudé

On 5/19/20 5:39 AM, Richard Henderson wrote:

On 5/17/20 4:01 AM, Philippe Mathieu-Daudé wrote:

Fix OUTL instructions incorrectly displayed as OUTW.

Signed-off-by: Philippe Mathieu-Daudé 
---
  target/i386/misc_helper.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)


Reviewed-by: Richard Henderson 

as far as it goes.  We should of course never be printing to stderr, but
logging or tracing.


Yes, this is what the cpu_io() functions do in ioport.c, but they use 
MEMTXATTRS_UNSPECIFIED.


If we could replace one with another, I'm not sure which one is the 
correct one.


Maybe keep cpu_io() from ioport.c but add a MemTxAttrs argument?

Regards,

Phil.
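
On the "logging or tracing" point above, a minimal sketch of the tracing
route. The event name is an assumption, not an existing event; it would need
a matching line added to target/i386/trace-events:

    /* Assumed new line in target/i386/trace-events:
     *     helper_outl(uint32_t port, uint32_t data) "port=0x%04x data=0x%08x"
     * The generated trace_helper_outl() then replaces the stderr print. */
    #include "qemu/osdep.h"
    #include "trace.h"

    static void debug_outl(uint32_t port, uint32_t data)
    {
        trace_helper_outl(port, data);
    }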



[PATCH v3 01/10] Add helper to populate vhost-user message regions

2020-05-19 Thread Raphael Norwitz
When setting vhost-user memory tables, memory region descriptors must be
copied from the vhost_dev struct to the vhost-user message. To avoid
duplicating code in setting the memory tables, we should use a helper to
populate this field. This change adds this helper.

Signed-off-by: Raphael Norwitz 
---
 hw/virtio/vhost-user.c | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index ec21e8f..ee6d1ed 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -407,6 +407,15 @@ static int vhost_user_set_log_base(struct vhost_dev *dev, 
uint64_t base,
 return 0;
 }
 
+static void vhost_user_fill_msg_region(VhostUserMemoryRegion *dst,
+   struct vhost_memory_region *src)
+{
+assert(src != NULL && dst != NULL);
+dst->userspace_addr = src->userspace_addr;
+dst->memory_size = src->memory_size;
+dst->guest_phys_addr = src->guest_phys_addr;
+}
+
 static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u,
  struct vhost_dev *dev,
  VhostUserMsg *msg,
@@ -441,12 +450,8 @@ static int vhost_user_fill_set_mem_table_msg(struct 
vhost_user *u,
 error_report("Failed preparing vhost-user memory table msg");
 return -1;
 }
-msg->payload.memory.regions[*fd_num].userspace_addr =
-reg->userspace_addr;
-msg->payload.memory.regions[*fd_num].memory_size =
-reg->memory_size;
-msg->payload.memory.regions[*fd_num].guest_phys_addr =
-reg->guest_phys_addr;
+vhost_user_fill_msg_region(&msg->payload.memory.regions[*fd_num],
+   reg);
 msg->payload.memory.regions[*fd_num].mmap_offset = offset;
 fds[(*fd_num)++] = fd;
 } else if (track_ramblocks) {
-- 
1.8.3.1




[PATCH v3 00/10] vhost-user: Lift Max Ram Slots Limitation

2020-05-19 Thread Raphael Norwitz
In QEMU today, a VM with a vhost-user device can hot add memory a
maximum of 8 times. See these threads, among others:

[1] https://lists.gnu.org/archive/html/qemu-devel/2019-07/msg01046.html
https://lists.gnu.org/archive/html/qemu-devel/2019-07/msg01236.html

[2] https://lists.gnu.org/archive/html/qemu-devel/2017-11/msg04656.html

This series introduces a new protocol feature
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS which, when enabled, lifts the
restriction on the maximum number of RAM slots imposed by vhost-user.

Without vhost-user, a Qemu VM can support 256 ram slots (for ACPI targets),
or potentially more (the KVM max is 512). With each region, a file descriptor
must be sent over the socket. If that many regions are sent in a single
message there could be upwards of 256 file descriptors being opened in the
backend process at once. Opening that many fds could easily push the process
past the open fd limit, especially considering one backend process could have
multiple vhost threads, exposing different devices to different Qemu
instances. Therefore, to safely lift the limit, transmitting regions should
be split up over multiple messages.

In addition, the VHOST_USER_SET_MEM_TABLE message was not reused because,
as the number of regions grows, the message becomes very large. In practice,
such large messages caused problems (truncated messages) and in the past it
seems the community has opted for smaller fixed-size messages where possible.
VRINGs, for example, are sent to the backend individually instead of in one
massive message.

The implementation details are explained in more detail in the commit
messages, but at a high level the new protocol feature works as follows:
- If the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS feature is enabled,
  QEMU will send multiple VHOST_USER_ADD_MEM_REG and
  VHOST_USER_REM_MEM_REG messages to map and unmap individual memory
 regions instead of one large VHOST_USER_SET_MEM_TABLE message containing
  all memory regions.
- The vhost-user struct maintains a ’shadow state’ of memory regions
  already sent to the guest. Each time vhost_user_set_mem_table is called,
  the shadow state is compared with the new device state. A
  VHOST_USER_REM_MEM_REG will be sent for each region in the shadow state
  not in the device state. Then, a VHOST_USER_ADD_MEM_REG will be sent
  for each region in the device state but not the shadow state. After
  these messages have been sent, the shadow state will be updated to
  reflect the new device state.
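
A standalone illustration of that reconciliation step (plain structs and
printf stand in for the real vhost_memory_region bookkeeping and message
sending; none of the names below are the series' code):

    #include <inttypes.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct Region {
        uint64_t gpa, size, uva;
    } Region;

    static bool region_equal(const Region *a, const Region *b)
    {
        return a->gpa == b->gpa && a->size == b->size && a->uva == b->uva;
    }

    static bool contains(const Region *set, int n, const Region *r)
    {
        for (int i = 0; i < n; i++) {
            if (region_equal(&set[i], r)) {
                return true;
            }
        }
        return false;
    }

    int main(void)
    {
        Region shadow[] = { { 0x0, 0x40000000, 0x7f0000000000 } };
        Region device[] = { { 0x0, 0x40000000, 0x7f0000000000 },
                            { 0x100000000, 0x10000000, 0x7f4000000000 } };
        int nshadow = 1, ndevice = 2;

        /* Shadow regions missing from the device state -> REM_MEM_REG. */
        for (int i = 0; i < nshadow; i++) {
            if (!contains(device, ndevice, &shadow[i])) {
                printf("send VHOST_USER_REM_MEM_REG gpa=0x%" PRIx64 "\n",
                       shadow[i].gpa);
            }
        }
        /* Device regions missing from the shadow state -> ADD_MEM_REG. */
        for (int i = 0; i < ndevice; i++) {
            if (!contains(shadow, nshadow, &device[i])) {
                printf("send VHOST_USER_ADD_MEM_REG gpa=0x%" PRIx64 "\n",
                       device[i].gpa);
            }
        }
        /* After the messages are sent, the shadow list is overwritten
         * with the device list. */
        return 0;
    }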

The series consists of 10 changes:
1. Add helper to populate vhost-user message regions:
This change adds a helper to populate a VhostUserMemoryRegion from a
struct vhost_memory_region, which needs to be done in multiple places in
this series.

2. Add vhost-user helper to get MemoryRegion data
This change adds a helper to get a pointer to a MemoryRegion struct, along
with its offset address and associated file descriptor. This helper is used
to simplify other vhost-user code paths and will be needed elsewhere in this
series.

3. Add VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS
This change adds the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS
protocol feature. At this point, if negotiated, the feature only allows the
backend to limit the number of max ram slots to a number less than
VHOST_MEMORY_MAX_NREGIONS = 8.

4. Transmit vhost-user memory regions individually
With this change, if the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS
protocol feature is enabled, Qemu will send regions to the backend using
individual VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG
messages.
The max number of ram slots supported is still limited to 8.

5. Lift max memory slots imposed by vhost-user
With this change, if the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS
protocol feature is enabled, the backend can support a configurable number
of ram slots up to the maximum allowed by the target platform.

6. Refactor out libvhost-user fault generation logic
This cleanup moves some logic from vu_set_mem_table_exec_postcopy() to a
separate helper, which will be needed elsewhere.

7. Support ram slot configuration in libvhost-user
This change adds support for processing VHOST_USER_GET_MAX_MEM_SLOTS
messages in libvhost-user.
The VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol is not yet
enabled in libvhost-user, so at this point this change is non-functional.

8. Support adding individual regions in libvhost-user
This change adds libvhost-user support for mapping in new memory regions
when receiving VHOST_USER_ADD_MEM_REG messages.
The VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol is not yet
enabled in libvhost-user, so at this point this change is non-functional.

9. Support individual region unmap in libvhost-user
This change adds libvhost-user support for unmapping removed memory regions
when receiving VHOST_USER_REM_MEM_REG messages.

[PATCH v3 05/10] Lift max memory slots limit imposed by vhost-user

2020-05-19 Thread Raphael Norwitz
Historically, sending all memory regions to vhost-user backends in a
single message imposed a limitation on the number of times memory
could be hot-added to a VM with a vhost-user device. Now that backends
which support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS send memory
regions individually, we no longer need to impose this limitation on
devices which support this feature.

With this change, VMs with a vhost-user device which supports the
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS can support a configurable
number of memory slots, up to the maximum allowed by the target
platform.

Existing backends which do not support
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected.
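
A tiny sketch of the capping behaviour on the QEMU side (the helper name is
hypothetical; the docs hunk below describes the intended capping, and the
code hunks implementing it are not fully quoted here):

    #include "qemu/osdep.h"

    /* The backend's reply to VHOST_USER_GET_MAX_MEM_SLOTS is clamped to the
     * per-target VHOST_USER_MAX_RAM_SLOTS selected by the #if chain below. */
    static uint64_t clamp_ram_slots(uint64_t backend_max)
    {
        return MIN(backend_max, VHOST_USER_MAX_RAM_SLOTS);
    }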

Signed-off-by: Raphael Norwitz 
Signed-off-by: Peter Turschmid 
Suggested-by: Mike Cui 
---
 docs/interop/vhost-user.rst |  7 +++---
 hw/virtio/vhost-user.c  | 56 ++---
 2 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
index 037eefa..688b7c6 100644
--- a/docs/interop/vhost-user.rst
+++ b/docs/interop/vhost-user.rst
@@ -1273,10 +1273,9 @@ Master message types
   feature has been successfully negotiated, this message is submitted
   by master to the slave. The slave should return the message with a
   u64 payload containing the maximum number of memory slots for
-  QEMU to expose to the guest. At this point, the value returned
-  by the backend will be capped at the maximum number of ram slots
-  which can be supported by vhost-user. Currently that limit is set
-  at VHOST_USER_MAX_RAM_SLOTS = 8.
+  QEMU to expose to the guest. The value returned by the backend
+  will be capped at the maximum number of ram slots which can be
+  supported by the target platform.
 
 ``VHOST_USER_ADD_MEM_REG``
   :id: 37
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 4af8476..270a96d 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -35,11 +35,29 @@
 #include 
 #endif
 
-#define VHOST_MEMORY_MAX_NREGIONS8
+#define VHOST_MEMORY_BASELINE_NREGIONS8
 #define VHOST_USER_F_PROTOCOL_FEATURES 30
 #define VHOST_USER_SLAVE_MAX_FDS 8
 
 /*
+ * Set maximum number of RAM slots supported to
+ * the maximum number supported by the target
+ * hardware platform.
+ */
+#if defined(TARGET_X86) || defined(TARGET_X86_64) || \
+defined(TARGET_ARM) || defined(TARGET_ARM_64)
+#include "hw/acpi/acpi.h"
+#define VHOST_USER_MAX_RAM_SLOTS ACPI_MAX_RAM_SLOTS
+
+#elif defined(TARGET_PPC) || defined(TARGET_PPC_64)
+#include "hw/ppc/spapr.h"
+#define VHOST_USER_MAX_RAM_SLOTS SPAPR_MAX_RAM_SLOTS
+
+#else
+#define VHOST_USER_MAX_RAM_SLOTS 512
+#endif
+
+/*
  * Maximum size of virtio device config space
  */
 #define VHOST_USER_MAX_CONFIG_SIZE 256
@@ -127,7 +145,7 @@ typedef struct VhostUserMemoryRegion {
 typedef struct VhostUserMemory {
 uint32_t nregions;
 uint32_t padding;
-VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+VhostUserMemoryRegion regions[VHOST_MEMORY_BASELINE_NREGIONS];
 } VhostUserMemory;
 
 typedef struct VhostUserMemRegMsg {
@@ -222,7 +240,7 @@ struct vhost_user {
 int slave_fd;
 NotifierWithReturn postcopy_notifier;
 struct PostCopyFD  postcopy_fd;
-uint64_t   postcopy_client_bases[VHOST_MEMORY_MAX_NREGIONS];
+uint64_t   postcopy_client_bases[VHOST_USER_MAX_RAM_SLOTS];
 /* Length of the region_rb and region_rb_offset arrays */
 size_t region_rb_len;
 /* RAMBlock associated with a given region */
@@ -237,7 +255,7 @@ struct vhost_user {
 
 /* Our current regions */
 int num_shadow_regions;
-struct vhost_memory_region shadow_regions[VHOST_MEMORY_MAX_NREGIONS];
+struct vhost_memory_region shadow_regions[VHOST_USER_MAX_RAM_SLOTS];
 };
 
 struct scrub_regions {
@@ -392,7 +410,7 @@ int vhost_user_gpu_set_socket(struct vhost_dev *dev, int fd)
 static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base,
struct vhost_log *log)
 {
-int fds[VHOST_MEMORY_MAX_NREGIONS];
+int fds[VHOST_USER_MAX_RAM_SLOTS];
 size_t fd_num = 0;
 bool shmfd = virtio_has_feature(dev->protocol_features,
 VHOST_USER_PROTOCOL_F_LOG_SHMFD);
@@ -469,7 +487,7 @@ static int vhost_user_fill_set_mem_table_msg(struct 
vhost_user *u,
mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
 if (fd > 0) {
 if (track_ramblocks) {
-assert(*fd_num < VHOST_MEMORY_MAX_NREGIONS);
+assert(*fd_num < VHOST_MEMORY_BASELINE_NREGIONS);
 trace_vhost_user_set_mem_table_withfd(*fd_num, mr->name,
   reg->memory_size,
   reg->guest_phys_addr,
@@ -477,7 +495,7 @@ static int vhost_user_fill_set_mem_table_msg(struct 
vhost_user *u,
   

[PATCH v3 07/10] Support ram slot configuration in libvhost-user

2020-05-19 Thread Raphael Norwitz
The VHOST_USER_GET_MAX_MEM_SLOTS message allows a vhost-user backend to
specify a maximum number of ram slots it is willing to support. This
change adds support for libvhost-user to process this message. For now
the backend will reply with 8 as the maximum number of regions
supported.

libvhost-user does not yet support the vhost-user protocol feature
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS, so qemu should never
send the VHOST_USER_GET_MAX_MEM_SLOTS message. Therefore this new
functionality is not currently used.

Signed-off-by: Raphael Norwitz 
---
 contrib/libvhost-user/libvhost-user.c | 19 +++
 contrib/libvhost-user/libvhost-user.h |  1 +
 2 files changed, 20 insertions(+)

diff --git a/contrib/libvhost-user/libvhost-user.c 
b/contrib/libvhost-user/libvhost-user.c
index cccfa22..9f039b7 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -137,6 +137,7 @@ vu_request_to_string(unsigned int req)
 REQ(VHOST_USER_SET_INFLIGHT_FD),
 REQ(VHOST_USER_GPU_SET_SOCKET),
 REQ(VHOST_USER_VRING_KICK),
+REQ(VHOST_USER_GET_MAX_MEM_SLOTS),
 REQ(VHOST_USER_MAX),
 };
 #undef REQ
@@ -1565,6 +1566,22 @@ vu_handle_vring_kick(VuDev *dev, VhostUserMsg *vmsg)
 return false;
 }
 
+static bool vu_handle_get_max_memslots(VuDev *dev, VhostUserMsg *vmsg)
+{
+vmsg->flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION;
+vmsg->size  = sizeof(vmsg->payload.u64);
+vmsg->payload.u64 = VHOST_MEMORY_MAX_NREGIONS;
+vmsg->fd_num = 0;
+
+if (!vu_message_write(dev, dev->sock, vmsg)) {
+vu_panic(dev, "Failed to send max ram slots: %s\n", strerror(errno));
+}
+
+DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_MEMORY_MAX_NREGIONS);
+
+return false;
+}
+
 static bool
 vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
 {
@@ -1649,6 +1666,8 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
 return vu_set_inflight_fd(dev, vmsg);
 case VHOST_USER_VRING_KICK:
 return vu_handle_vring_kick(dev, vmsg);
+case VHOST_USER_GET_MAX_MEM_SLOTS:
+return vu_handle_get_max_memslots(dev, vmsg);
 default:
 vmsg_close_fds(vmsg);
 vu_panic(dev, "Unhandled request: %d", vmsg->request);
diff --git a/contrib/libvhost-user/libvhost-user.h 
b/contrib/libvhost-user/libvhost-user.h
index f30394f..88ef40d 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -97,6 +97,7 @@ typedef enum VhostUserRequest {
 VHOST_USER_SET_INFLIGHT_FD = 32,
 VHOST_USER_GPU_SET_SOCKET = 33,
 VHOST_USER_VRING_KICK = 35,
+VHOST_USER_GET_MAX_MEM_SLOTS = 36,
 VHOST_USER_MAX
 } VhostUserRequest;
 
-- 
1.8.3.1




Re: [RFC PATCH 8/8] riscv: Add RV64F instructions description

2020-05-19 Thread LIU Zhiwei




On 2020/5/12 2:11, Richard Henderson wrote:

On 4/30/20 12:21 AM, LIU Zhiwei wrote:

+FCVT_L_S RISCV 1100000 00010 rs1:5 rm:3 rd:5 1010011 \
+!constraints { $rd != 2 && $rd != 3 && $rd != 4 && $rm != 6 && $rm != 5 }
+
+FCVT_LU_S RISCV 1100000 00011 rs1:5 rm:3 rd:5 1010011 \
+!constraints { $rd != 2 && $rd != 3 && $rd != 4 && $rm != 6 && $rm != 5 }
+
+FCVT_S_L RISCV 1101000 00010 rs1:5 rm:3 rd:5 1010011 \
+!constraints { $rs1 != 2 && $rm != 6 && $rm != 5 }
+
+FCVT_S_LU RISCV 1101000 00011 rs1:5 rm:3 rd:5 1010011 \
+!constraints { $rs1 != 2 && $rm != 6 && $rm != 5 }

Interesting question here: Do we really want to avoid the reserved rounding
modes, or do we want to verify that we raise an invalid operand exception?
I think we should always generate legal  instructions except explicitly 
illegal instructions

with proper risu ops code.

Zhiwei

I guess I'm fine with it either way.


r~





[PATCH v3 10/10] Lift max ram slots limit in libvhost-user

2020-05-19 Thread Raphael Norwitz
Historically, VMs with vhost-user devices could hot-add memory a maximum
of 8 times. Now that the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS
protocol feature has been added, VMs with vhost-user backends which
support this new feature can support a configurable number of ram slots
up to the maximum supported by the target platform.

This change adds VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS support for
backends built with libvhost-user, and increases the number of supported
ram slots from 8 to 32.

Memory hot-add, hot-remove and postcopy migration were tested with
the vhost-user-bridge sample.

Signed-off-by: Raphael Norwitz 
---
 contrib/libvhost-user/libvhost-user.c | 17 +
 contrib/libvhost-user/libvhost-user.h | 15 +++
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/contrib/libvhost-user/libvhost-user.c 
b/contrib/libvhost-user/libvhost-user.c
index 635cfb1..eeb6899 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -269,7 +269,7 @@ have_userfault(void)
 static bool
 vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
 {
-char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
+char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = 
{};
 struct iovec iov = {
 .iov_base = (char *)vmsg,
 .iov_len = VHOST_USER_HDR_SIZE,
@@ -340,7 +340,7 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg 
*vmsg)
 {
 int rc;
 uint8_t *p = (uint8_t *)vmsg;
-char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
+char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = 
{};
 struct iovec iov = {
 .iov_base = (char *)vmsg,
 .iov_len = VHOST_USER_HDR_SIZE,
@@ -353,7 +353,7 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg 
*vmsg)
 struct cmsghdr *cmsg;
 
 memset(control, 0, sizeof(control));
-assert(vmsg->fd_num <= VHOST_MEMORY_MAX_NREGIONS);
+assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS);
 if (vmsg->fd_num > 0) {
 size_t fdsize = vmsg->fd_num * sizeof(int);
 msg.msg_controllen = CMSG_SPACE(fdsize);
@@ -780,7 +780,7 @@ static bool
 vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
 int i, j;
 bool found = false;
-VuDevRegion shadow_regions[VHOST_MEMORY_MAX_NREGIONS] = {};
+VuDevRegion shadow_regions[VHOST_USER_MAX_RAM_SLOTS] = {};
VhostUserMemoryRegion *msg_region = &vmsg->payload.memreg.region;
 
 DPRINT("Removing region:\n");
@@ -813,7 +813,7 @@ vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
 
 if (found) {
 memcpy(dev->regions, shadow_regions,
-   sizeof(VuDevRegion) * VHOST_MEMORY_MAX_NREGIONS);
+   sizeof(VuDevRegion) * VHOST_USER_MAX_RAM_SLOTS);
 DPRINT("Successfully removed a region\n");
 dev->nregions--;
 vmsg_set_reply_u64(vmsg, 0);
@@ -1394,7 +1394,8 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg 
*vmsg)
 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ |
 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER |
 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD |
-1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK;
+1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
+1ULL << VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS;
 
 if (have_userfault()) {
 features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT;
@@ -1732,14 +1733,14 @@ static bool vu_handle_get_max_memslots(VuDev *dev, 
VhostUserMsg *vmsg)
 {
 vmsg->flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION;
 vmsg->size  = sizeof(vmsg->payload.u64);
-vmsg->payload.u64 = VHOST_MEMORY_MAX_NREGIONS;
+vmsg->payload.u64 = VHOST_USER_MAX_RAM_SLOTS;
 vmsg->fd_num = 0;
 
 if (!vu_message_write(dev, dev->sock, vmsg)) {
 vu_panic(dev, "Failed to send max ram slots: %s\n", strerror(errno));
 }
 
-DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_MEMORY_MAX_NREGIONS);
+DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_USER_MAX_RAM_SLOTS);
 
 return false;
 }
diff --git a/contrib/libvhost-user/libvhost-user.h 
b/contrib/libvhost-user/libvhost-user.h
index f843971..844c37c 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -28,7 +28,13 @@
 
 #define VIRTQUEUE_MAX_SIZE 1024
 
-#define VHOST_MEMORY_MAX_NREGIONS 8
+#define VHOST_MEMORY_BASELINE_NREGIONS 8
+
+/*
+ * Set a reasonable maximum number of ram slots, which will be supported by
+ * any architecture.
+ */
+#define VHOST_USER_MAX_RAM_SLOTS 32
 
 typedef enum VhostSetConfigType {
 VHOST_SET_CONFIG_TYPE_MASTER = 0,
@@ -55,6 +61,7 @@ enum VhostUserProtocolFeature {
 VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
 VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
 VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14,
+VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS 

[PATCH v3 09/10] Support individual region unmap in libvhost-user

2020-05-19 Thread Raphael Norwitz
When the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature is
enabled, on memory hot-unplug qemu will transmit memory regions to
remove individually using the new VHOST_USER_REM_MEM_REG message. With
this change, vhost-user backends built with libvhost-user can now unmap
individual memory regions when receiving the VHOST_USER_REM_MEM_REG
message.

Qemu only sends VHOST_USER_REM_MEM_REG messages when the
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS feature is negotiated, and
since support for that feature has not yet been added in libvhost-user,
this new functionality is not yet used.

Signed-off-by: Raphael Norwitz 
---
 contrib/libvhost-user/libvhost-user.c | 63 +++
 contrib/libvhost-user/libvhost-user.h |  1 +
 2 files changed, 64 insertions(+)

diff --git a/contrib/libvhost-user/libvhost-user.c 
b/contrib/libvhost-user/libvhost-user.c
index 2c2a8d9..635cfb1 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -139,6 +139,7 @@ vu_request_to_string(unsigned int req)
 REQ(VHOST_USER_VRING_KICK),
 REQ(VHOST_USER_GET_MAX_MEM_SLOTS),
 REQ(VHOST_USER_ADD_MEM_REG),
+REQ(VHOST_USER_REM_MEM_REG),
 REQ(VHOST_USER_MAX),
 };
 #undef REQ
@@ -763,6 +764,66 @@ vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
 }
 }
 
+static inline bool reg_equal(VuDevRegion *vudev_reg,
+ VhostUserMemoryRegion *msg_reg)
+{
+if (vudev_reg->gpa == msg_reg->guest_phys_addr &&
+vudev_reg->qva == msg_reg->userspace_addr &&
+vudev_reg->size == msg_reg->memory_size) {
+return true;
+}
+
+return false;
+}
+
+static bool
+vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
+int i, j;
+bool found = false;
+VuDevRegion shadow_regions[VHOST_MEMORY_MAX_NREGIONS] = {};
+VhostUserMemoryRegion *msg_region = &vmsg->payload.memreg.region;
+
+DPRINT("Removing region:\n");
+DPRINT("guest_phys_addr: 0x%016"PRIx64"\n",
+   msg_region->guest_phys_addr);
+DPRINT("memory_size: 0x%016"PRIx64"\n",
+   msg_region->memory_size);
+DPRINT("userspace_addr   0x%016"PRIx64"\n",
+   msg_region->userspace_addr);
+DPRINT("mmap_offset  0x%016"PRIx64"\n",
+   msg_region->mmap_offset);
+
+for (i = 0, j = 0; i < dev->nregions; i++) {
+if (!reg_equal(&dev->regions[i], msg_region)) {
+shadow_regions[j].gpa = dev->regions[i].gpa;
+shadow_regions[j].size = dev->regions[i].size;
+shadow_regions[j].qva = dev->regions[i].qva;
+shadow_regions[j].mmap_offset = dev->regions[i].mmap_offset;
+j++;
+} else {
+found = true;
+VuDevRegion *r = &dev->regions[i];
+void *m = (void *) (uintptr_t) r->mmap_addr;
+
+if (m) {
+munmap(m, r->size + r->mmap_offset);
+}
+}
+}
+
+if (found) {
+memcpy(dev->regions, shadow_regions,
+   sizeof(VuDevRegion) * VHOST_MEMORY_MAX_NREGIONS);
+DPRINT("Successfully removed a region\n");
+dev->nregions--;
+vmsg_set_reply_u64(vmsg, 0);
+} else {
+vu_panic(dev, "Specified region not found\n");
+}
+
+return true;
+}
+
 static bool
 vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
 {
@@ -1771,6 +1832,8 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
 return vu_handle_get_max_memslots(dev, vmsg);
 case VHOST_USER_ADD_MEM_REG:
 return vu_add_mem_reg(dev, vmsg);
+case VHOST_USER_REM_MEM_REG:
+return vu_rem_mem_reg(dev, vmsg);
 default:
 vmsg_close_fds(vmsg);
 vu_panic(dev, "Unhandled request: %d", vmsg->request);
diff --git a/contrib/libvhost-user/libvhost-user.h 
b/contrib/libvhost-user/libvhost-user.h
index 60ef7fd..f843971 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -99,6 +99,7 @@ typedef enum VhostUserRequest {
 VHOST_USER_VRING_KICK = 35,
 VHOST_USER_GET_MAX_MEM_SLOTS = 36,
 VHOST_USER_ADD_MEM_REG = 37,
+VHOST_USER_REM_MEM_REG = 38,
 VHOST_USER_MAX
 } VhostUserRequest;
 
-- 
1.8.3.1



